# Particle shape dimensionality prediction

In [None]:
import pandas as pd
import numpy as np

from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import cross_val_score, cross_val_predict

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt



## Data preprocessing

In [None]:
# Load the raw dataset. `file_path` is a placeholder and must point to the CSV.
file_path = r"dataset-pathway"
data = pd.read_csv(file_path)
print(data.head())

# Report the number of missing values per column. The bare expression was
# silently discarded before because it was not the last statement of the cell.
print(data.isnull().sum())

# Drop rows whose target label is missing: they cannot be used for
# supervised training or evaluation.
data = data.dropna(subset=['Dimensionality'])

# Check the new number of rows and columns
print(data.shape)

## Encode categorical features using OHE

# Original categorical columns; they are replaced by their one-hot encoding.
encoded_columns = ['Method category', 'Solvent type']

# drop='first' removes one level per feature; the encoder returns a dense array.
ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
encoded_features = ohe.fit_transform(data[encoded_columns])
encoded_df = pd.DataFrame(
    encoded_features,
    columns=ohe.get_feature_names_out(encoded_columns),
)

# Reset the index of `data` so both frames share a 0..n-1 index and are
# aligned row by row, then swap the raw categorical columns for the
# encoded ones.
data = pd.concat(
    [data.reset_index(drop=True).drop(columns=encoded_columns), encoded_df],
    axis=1,
)
print(data)

## Impute missing data

target_column = 'Dimensionality'
exclude_column = [target_column]
columns_to_impute = [col for col in data.columns if col not in exclude_column]
# Object-typed columns are only identified here; they are neither imputed
# nor carried over into `imputed_data` below.
categorical_features = data[columns_to_impute].select_dtypes(include=['object']).columns
numerical_features = data[columns_to_impute].select_dtypes(include=['number']).columns
ohe_columns = encoded_df.columns  # Columns created by OneHotEncoder

# NOTE(review): the imputer and the scaler are fitted on the full dataset,
# i.e. before any train/test split, so held-out rows influence the
# preprocessing. Consider fitting them on the training split only.
imputer = IterativeImputer(max_iter=10, random_state=0, verbose=2)
# The one-hot columns are numeric, so `numerical_features` already contains
# them; a single joint imputation replaces the former two passes.
data[numerical_features] = imputer.fit_transform(data[numerical_features])

# Combine the imputed features with the target. Each feature is listed exactly
# once (one-hot columns first, then the remaining numeric ones); concatenating
# `ohe_columns` and `numerical_features` directly duplicated the one-hot columns.
ohe_names = set(ohe_columns)
feature_columns = list(ohe_columns) + [
    col for col in numerical_features if col not in ohe_names
]
imputed_data = pd.concat([data[feature_columns], data[[target_column]]], axis=1)
print(imputed_data.isnull().sum())

# Feature scaling
# Only the feature columns are scaled. Selecting every numeric column would
# also rescale the target if it is stored as a number, turning the class
# labels into continuous values.
numerical_features = imputed_data[feature_columns]
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(numerical_features)

data_scaled = imputed_data.copy()
data_scaled[feature_columns] = scaled_features
imputed_data = data_scaled
final_data = imputed_data

# Display the result
print(final_data)

# Train model - 30 times

In [None]:
# train data set
# Fits a random forest on 30 different train/test splits and records the
# weighted classification metrics measured on the TRAINING portion of each split.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

# Features and target. They are defined here because this cell runs before
# the testing cell; relying on that later definition raised a NameError on a
# fresh top-to-bottom run.
X = final_data.drop(columns=['Dimensionality'])
y = final_data['Dimensionality']

# Store results
results = []

# Loop over 30 seeds
for seed in range(30):
    # Split data (the seed drives both the split and the forest)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    # Train model
    model = RandomForestClassifier(n_estimators=100, random_state=seed)
    model.fit(X_train, y_train)

    # Predict on the data the model was fitted on
    y_train_pred = model.predict(X_train)

    # Metrics (weighted average over classes; undefined ratios count as 0)
    acc = accuracy_score(y_train, y_train_pred)
    prec = precision_score(y_train, y_train_pred, average='weighted', zero_division=0)
    rec = recall_score(y_train, y_train_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_train, y_train_pred, average='weighted', zero_division=0)

    # Store results
    results.append({
        'seed': seed,
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1-score': f1
    })

    print(f"Seed {seed} - Acc: {acc:.3f}, Prec: {prec:.3f}, Rec: {rec:.3f}, F1: {f1:.3f}")

# Convert to DataFrame
df_results = pd.DataFrame(results)

# Summary
print("\nSummary of classification metrics over 30 runs:")
print(df_results.describe())

# Save to CSV
df_results.to_csv("RFClassifier_30seeds_metrics_train.csv", index=False)

In [None]:
# Testing data set
# Fits a random forest on 30 different train/test splits and records the
# weighted classification metrics measured on the held-out TEST portion.
X = final_data.drop(columns=['Dimensionality'])
y = final_data['Dimensionality']
# The former standalone split with random_state=42 was removed: the loop
# below re-splits on its first iteration, so that split was never used.

# Store results
results = []

# Loop over 30 seeds
for seed in range(30):
    # Split data (the seed drives both the split and the forest)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    # Train model
    model = RandomForestClassifier(n_estimators=100, random_state=seed)
    model.fit(X_train, y_train)

    # Predict on the held-out rows
    y_pred = model.predict(X_test)

    # Metrics (weighted average over classes; undefined ratios count as 0)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # Store results
    results.append({
        'seed': seed,
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1-score': f1
    })

    print(f"Seed {seed} - Acc: {acc:.3f}, Prec: {prec:.3f}, Rec: {rec:.3f}, F1: {f1:.3f}")

# After the loop, X_train/X_test/y_train/y_test/y_pred/model hold the values
# of the last iteration (seed 29).

# Convert to DataFrame
df_results = pd.DataFrame(results)

# Summary
print("\nSummary of classification metrics over 30 runs:")
print(df_results.describe())

# Save to CSV
df_results.to_csv("RFClassifier_30seeds_metrics_test.csv", index=False)

In [None]:
# Confusion matrix
# Uses whatever `y_test` / `y_pred` currently hold; when the notebook is run
# top to bottom these come from the last iteration of the testing loop.

# Sorted union of the observed and predicted classes. This is the same label
# set and order confusion_matrix uses by default, so the matrix is unchanged;
# it is made explicit so the axes can show class names instead of positions.
class_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Plot confusion matrix
plt.figure(figsize=(6,5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Reds",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Confusion Matrix")
plt.ylabel("Actual")
plt.xlabel("Predicted")
plt.show()

# 10-fold cross-validation by class

In [None]:
# Classification report by class for training set (10-fold cross-validation)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import classification_report

# `rf` was referenced without ever being defined, which raised a NameError.
# NOTE(review): hyperparameters mirror the forests trained in the seed loops;
# random_state=42 is an assumption - confirm the intended seed.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Cross-validation scores
scores = cross_val_score(rf, X_train, y_train, cv=10, scoring='accuracy')
print('Cross-Validation scores:', scores)
print('Mean Accuracy:', scores.mean())

# Cross-validated predictions: every row is predicted by a model that did
# not see it during fitting.
y_pred = cross_val_predict(rf, X_train, y_train, cv=10)

# Classification report as dictionary
report = classification_report(y_train, y_pred, output_dict=True)

# Print per-class metrics, skipping the aggregate rows of the report
for label, metrics in report.items():
    if label not in ["accuracy", "macro avg", "weighted avg"]:
        print(f"\nClass: {label}")
        print(f"  Precision: {metrics['precision']:.2f}")
        print(f"  Recall:    {metrics['recall']:.2f}")
        print(f"  F1-score:  {metrics['f1-score']:.2f}")
        print(f"  Support:   {metrics['support']}")

In [None]:
# Classification report by class for testing set (10-fold cross-validation)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import classification_report

# `rf` was referenced without ever being defined, which raised a NameError.
# NOTE(review): hyperparameters mirror the forests trained in the seed loops;
# random_state=42 is an assumption - confirm the intended seed.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# NOTE(review): this cross-validates *within* the held-out split, i.e. new
# forests are fitted on folds of X_test. It does not evaluate a model trained
# on X_train; use model.predict(X_test) if that is what is intended.

# Cross-validation scores
scores = cross_val_score(rf, X_test, y_test, cv=10, scoring='accuracy')
print('Cross-Validation scores:', scores)
print('Mean Accuracy:', scores.mean())

# Cross-validated predictions: every row is predicted by a model that did
# not see it during fitting.
y_pred = cross_val_predict(rf, X_test, y_test, cv=10)

# Classification report as dictionary
report = classification_report(y_test, y_pred, output_dict=True)

# Print per-class metrics, skipping the aggregate rows of the report
for label, metrics in report.items():
    if label not in ["accuracy", "macro avg", "weighted avg"]:
        print(f"\nClass: {label}")
        print(f"  Precision: {metrics['precision']:.2f}")
        print(f"  Recall:    {metrics['recall']:.2f}")
        print(f"  F1-score:  {metrics['f1-score']:.2f}")
        print(f"  Support:   {metrics['support']}")