In [1]:
import pandas as pd
# Load the Excel file
file_path = 'Dataset_Crystal_Structure.xlsx'  # Replace with the actual path if it's not in the same directory
data = pd.read_excel(file_path)
data = data.rename(columns={'l(A-O)(Å)': 'I(A-O)'})
data = data.rename(columns={'l(B-O)(Å)': 'I(B-O)'})




In [2]:
# Select relevant columns
selected_columns = [
    'Compound', 'A', 'B', 'Lowest distortion', 
    'r(AXII)(Å)', 'r(AVI)(Å)', 'r(BVI)(Å)', 
    'EN(A)', 'EN(B)', 'I(A-O)', 'I(B-O)', 
    'ΔENR', 'tG', 'τ', 'μ'
]
data = data[selected_columns]

# Check for missing values
# print("Missing values per column:\n", data.isnull().sum())

# Drop rows with missing target labels if there are any
data = data.dropna(subset=['Lowest distortion'])

# Separate features and target
X = data.drop(columns=['Compound', 'Lowest distortion', 'τ'])  # 'Compound' is just an identifier, not a feature
y = data['Lowest distortion']  # This is the target variable

# # Encode categorical variables ('A' and 'B' elements) using one-hot encoding
X = pd.get_dummies(X, columns=['A', 'B'], drop_first=True)

# Display the processed data
# print("Features after preprocessing:\n", X.head())
# print("Target variable:\n", y.head())


In [3]:
# Checking for any null values of the target variable Lowest Distortion
columns_with_missing_values = data.columns[data.isnull().any()]
# print(columns_with_missing_values)

In [4]:
# Creating new features based on given columns which will be useful like ionic radius ratio and electronegativity difference
import numpy as np
from sklearn.preprocessing import MinMaxScaler
# Ionic Radius Ratio
X['i_r_ratio'] = X['r(AXII)(Å)'] / X['r(BVI)(Å)']

# Electronegativity Difference
X['elec_diff'] = abs(X['EN(A)'] - X['EN(B)'])

# Scale Numerical Features
scaler = MinMaxScaler()
X[['i_r_ratio', 'elec_diff']] = scaler.fit_transform(X[['i_r_ratio', 'elec_diff']])
# print(X.head(1))


In [5]:
# Training the model
# Importing Libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Create a Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Evaluate the model's performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.7692307692307693

Classification Report:
               precision    recall  f1-score   support

           -       1.00      0.08      0.15        12
       cubic       0.81      0.91      0.86       650
orthorhombic       0.68      0.69      0.68       302
rhombohedral       0.62      0.14      0.23        70
  tetragonal       0.59      0.31      0.41        32

    accuracy                           0.77      1066
   macro avg       0.74      0.43      0.47      1066
weighted avg       0.76      0.77      0.75      1066


Confusion Matrix:
 [[  1   5   4   0   2]
 [  0 592  52   2   4]
 [  0  90 207   4   1]
 [  0  24  36  10   0]
 [  0  17   5   0  10]]
