# Materials Discovery with AI Techniques

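This notebook demonstrates how AI algorithms can be used to predict and discover new materials with specific properties. Working with a synthetic dataset that stands in for measured material properties, we'll use clustering (KMeans on PCA-reduced features) and classification (a random forest) to categorize materials based on their features.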

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.datasets import make_classification

In [2]:
# Generate synthetic dataset representing material properties
def generate_materials_data(n_samples=10000, random_state=42):
    """Build a synthetic three-class dataset standing in for material properties.

    The samples come from ``make_classification`` configured with 10 features
    (5 informative, 2 redundant), 3 classes and one cluster per class.

    Parameters
    ----------
    n_samples : int, default=10000
        Number of rows to generate.
    random_state : int, RandomState instance or None, default=42
        Seed forwarded to ``make_classification``. The default is the seed
        value this function previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One column per generated feature, named ``feature_0``, ``feature_1``,
        ..., plus a ``class`` column holding the generated label.
    """
    # Seed the generator through its own random_state argument rather than
    # np.random.seed(), so the process-wide NumPy RNG is not reseeded as a
    # side effect of building the dataset.
    X, y = make_classification(
        n_samples=n_samples,
        n_features=10,
        n_informative=5,
        n_redundant=2,
        n_clusters_per_class=1,
        n_classes=3,
        random_state=random_state,
    )
    columns = [f'feature_{i}' for i in range(X.shape[1])]
    df = pd.DataFrame(X, columns=columns)
    df['class'] = y
    return df

# Build the dataset, then separate the class labels from the feature matrix
df_materials = generate_materials_data()
y_materials = df_materials['class']
X_materials = df_materials.drop(columns='class')

# Hold out 30% of the samples for testing; the fixed seed keeps the split repeatable
(
    X_materials_train,
    X_materials_test,
    y_materials_train,
    y_materials_test,
) = train_test_split(
    X_materials,
    y_materials,
    test_size=0.3,
    random_state=42,
)

# Fit the scaler on the training split only, then apply it to both splits
scaler_materials = StandardScaler().fit(X_materials_train)
X_materials_train_scaled = scaler_materials.transform(X_materials_train)
X_materials_test_scaled = scaler_materials.transform(X_materials_test)

In [3]:
# Dimensionality reduction using PCA: project the standardized features onto
# their first two principal components.
# random_state is pinned (as for the other estimators in this notebook) because
# the default svd_solver='auto' may select a randomized solver, in which case an
# unseeded PCA is not guaranteed to reproduce the same projection when this
# cell is re-run.
pca_materials = PCA(n_components=2, random_state=42)
# Fit on the training split only; the test split is projected with the same components.
X_materials_train_pca = pca_materials.fit_transform(X_materials_train_scaled)
X_materials_test_pca = pca_materials.transform(X_materials_test_scaled)

# Visualization of the training data in the PCA plane, colored by true class label
plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_materials_train_pca[:, 0], y=X_materials_train_pca[:, 1], hue=y_materials_train, palette='viridis', s=50)
plt.title('PCA of Material Properties Data')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend(title='Material Class')
plt.show()

In [4]:
# Clustering using KMeans to discover new material classes.
# The clusters are fitted on the 2-D PCA projection of the training split.
# n_init is set explicitly: the library default changed between scikit-learn
# releases (10 -> 'auto'), so leaving it implicit makes the clustering depend on
# the installed version and triggers a FutureWarning on some releases.
kmeans_materials = KMeans(n_clusters=3, n_init=10, random_state=42)
y_materials_train_clusters = kmeans_materials.fit_predict(X_materials_train_pca)
# Assign each test sample to the nearest centroid learned from the training split
y_materials_test_clusters = kmeans_materials.predict(X_materials_test_pca)

# Visualization of the clusters found on the training split
plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_materials_train_pca[:, 0], y=X_materials_train_pca[:, 1], hue=y_materials_train_clusters, palette='viridis', s=50)
plt.title('KMeans Clustering of Material Data')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend(title='Cluster')
plt.show()

In [5]:
# Train a 100-tree random forest on the standardized training features
clf_materials = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_materials_train_scaled, y_materials_train)

# Score the held-out split: per-class metrics first, then the raw confusion matrix
y_materials_pred = clf_materials.predict(X_materials_test_scaled)
print(
    classification_report(
        y_materials_test,
        y_materials_pred,
        target_names=['Material Class 1', 'Material Class 2', 'Material Class 3'],
    )
)
print(confusion_matrix(y_materials_test, y_materials_pred))

In [6]:
# Rank the forest's per-feature importance scores from highest to lowest
feature_importances_materials = pd.Series(
    clf_materials.feature_importances_,
    index=X_materials.columns,
).sort_values(ascending=False)

# Horizontal bar chart: one bar per feature, most important at the top
plt.figure(figsize=(10, 6))
sns.barplot(
    x=feature_importances_materials,
    y=feature_importances_materials.index,
)
plt.title('Feature Importances in Material Class Prediction')
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.show()