<a href="https://colab.research.google.com/github/ShabnaIlmi/Data-Science-Group-Project/blob/recipe-risk-analyzer/DSGP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
!pip install imblearn
!pip install xgboost
!pip install joblib




In [5]:
# Load the dataset
file_path = "chemical_recipe_dataset.csv"  # Replace with the actual file path if necessary
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print(data.head())


  Recipe ID                                     Chemical Names  \
0     R0001  Acetone + Hydrogen Peroxide + Sulfuric Acid + ...   
1     R0002              Charcoal + Potassium Nitrate + Sulfur   
2     R0003                         Hydrogen Sulfide + Ammonia   
3     R0004                          Sulfur + Ammonium Nitrate   
4     R0005    Hydrogen Sulfide + Ammonia + Methane + Chlorine   

                  Quantities   Category  \
0  485g + 398g + 275g + 197g  Explosive   
1          465g + 134g + 72g  Explosive   
2                272g + 358g  Corrosive   
3                297g + 304g   Unstable   
4   74g + 376g + 285g + 199g  Corrosive   

                                 Potential Reaction Risk Level  
0   Explosion risk when exposed to heat or friction        Low  
1   Explosion risk when exposed to heat or friction     Medium  
2                    Causes severe burns on contact     Medium  
3  May decompose violently under certain conditions       High  
4                  

In [6]:
# Encode the 'Risk Level' (target variable)
label_encoder = LabelEncoder()
data['Risk Level Encoded'] = label_encoder.fit_transform(data['Risk Level'])

# One-hot encode the 'Chemical Names' and 'Category'
chemical_dummies = data['Chemical Names'].str.get_dummies(sep=' + ')
category_dummies = data['Category'].str.get_dummies(sep=', ')

# Extract numerical values from 'Quantities' (sum the grams for simplicity)
data['Total Quantity (g)'] = data['Quantities'].str.extractall(r'(\d+)').astype(int).groupby(level=0).sum()

# Combine all features
X = pd.concat([chemical_dummies, category_dummies, data['Total Quantity (g)']], axis=1)
y = data['Risk Level Encoded']


In [7]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Split data first
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE only to training data
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print(f"Original class distribution: {dict(zip(*np.unique(y_train, return_counts=True)))}")
print(f"Balanced class distribution: {dict(zip(*np.unique(y_train_balanced, return_counts=True)))}")


Original class distribution: {0: 272, 1: 270, 2: 258}
Balanced class distribution: {0: 272, 1: 272, 2: 272}


In [8]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [9]:
from sklearn.model_selection import RandomizedSearchCV

# Define the model
rf = RandomForestClassifier(random_state=42)

# Define the parameter grid
param_distributions = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf, param_distributions=param_distributions,
    n_iter=50, cv=3, scoring='accuracy', random_state=42, n_jobs=-1
)
random_search.fit(X_train_balanced, y_train_balanced)

# Best parameters and score
print("Best Parameters:", random_search.best_params_)
print("Best Cross-Validation Accuracy:", random_search.best_score_)


Best Parameters: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 10}
Best Cross-Validation Accuracy: 0.36642156862745096


In [10]:
# Train a Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Get feature importances and select top features
# Use random_search.best_estimator_ to access the fitted model
importances = random_search.best_estimator_.feature_importances_
important_indices = np.argsort(importances)[-10:]  # Top 10 features
X_selected = X.iloc[:, important_indices]

print("Top Features:", X.columns[important_indices])

print("Model training complete.")

Top Features: Index([' + Chlorine + Hydrogen Sulfide + ',
       ' + Chlorine + Hydrogen Sulfide + Ammonia + ',
       ' + Chlorine + Hydrogen Sulfide + Ammonia + Methane + ',
       ' + Chlorine + Hydrogen Sulfide + Methane + ', 'Flammable', 'Toxic',
       'Corrosive', 'Explosive', 'Unstable', 'Total Quantity (g)'],
      dtype='object')
Model training complete.


In [11]:
# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nAccuracy Score:", accuracy_score(y_test, y_pred))


Confusion Matrix:
[[16 26 20]
 [23 18 30]
 [32 21 14]]

Classification Report:
              precision    recall  f1-score   support

           0       0.23      0.26      0.24        62
           1       0.28      0.25      0.26        71
           2       0.22      0.21      0.21        67

    accuracy                           0.24       200
   macro avg       0.24      0.24      0.24       200
weighted avg       0.24      0.24      0.24       200


Accuracy Score: 0.24


In [12]:
import joblib

# Save the trained model to a file
joblib.dump(model, "risk_prediction_model.pkl")
print("Model saved as 'risk_prediction_model.pkl'")


Model saved as 'risk_prediction_model.pkl'


In [13]:
# Example: Predict risk for a new chemical combination
new_data = pd.DataFrame({
    "Chemical Names": ["Ammonium Nitrate + Hydrogen Peroxide"],
    "Category": ["Explosive, Toxic Liquid"],
    "Quantities": ["200g + 100g"]
})

# Preprocess the new data
new_data['Total Quantity (g)'] = new_data['Quantities'].str.extractall(r'(\d+)').astype(int).groupby(level=0).sum()

# Create dummy features matching training data
chemical_features = pd.DataFrame(columns=chemical_dummies.columns)
category_features = pd.DataFrame(columns=category_dummies.columns)

# Fill in the matching dummy values
for col in chemical_features.columns:
    chemical_features.at[0, col] = 1 if col in new_data['Chemical Names'][0].split(" + ") else 0

for col in category_features.columns:
    category_features.at[0, col] = 1 if col in new_data['Category'][0].split(", ") else 0

# Combine all features
new_features = pd.concat([chemical_features, category_features], axis=1)
new_features['Total Quantity (g)'] = new_data['Total Quantity (g)']

# Fill missing columns with zeros to match training data
new_features = new_features.reindex(columns=X.columns, fill_value=0)

# Standardize
new_features_scaled = scaler.transform(new_features)

# Predict
predicted_risk = model.predict(new_features_scaled)
predicted_risk_label = label_encoder.inverse_transform(predicted_risk)

print("Predicted Risk Level:", predicted_risk_label[0])


Predicted Risk Level: Low


In [14]:
import joblib

# Save the trained scaler and label encoder
joblib.dump(scaler, "scaler.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")


['label_encoder.pkl']

In [15]:
import pandas as pd
import random
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the previously trained model and preprocessing objects
import joblib
model = joblib.load("risk_prediction_model.pkl")  # Load your trained model
scaler = joblib.load("scaler.pkl")  # Assuming you saved the scaler
label_encoder = joblib.load("label_encoder.pkl")  # Assuming you saved the label encoder

# Define function to process input chemicals and make prediction
def predict_chemical_risk(chemical_names, quantities, categories):
    # Creating the dataframe from the input
    input_data = pd.DataFrame({
        "Chemical Names": [chemical_names],
        "Quantities": [quantities],
        "Category": [categories]
    })

    # Extracting and summing the quantities
    input_data['Total Quantity (g)'] = input_data['Quantities'].str.extractall(r'(\d+)').astype(int).groupby(level=0).sum()

    # Creating feature columns for one-hot encoding
    chemical_features = pd.DataFrame(columns=chemical_dummies.columns)
    category_features = pd.DataFrame(columns=category_dummies.columns)

    # Fill the chemical features based on input data
    for col in chemical_features.columns:
        chemical_features.at[0, col] = 1 if col in chemical_names.split(" + ") else 0

    # Fill the category features based on input data
    for col in category_features.columns:
        category_features.at[0, col] = 1 if col in categories.split(", ") else 0

    # Combine the features
    new_features = pd.concat([chemical_features, category_features], axis=1)
    new_features['Total Quantity (g)'] = input_data['Total Quantity (g)']

    # Fill any missing columns from the training data
    new_features = new_features.reindex(columns=X.columns, fill_value=0)

    # Standardize the new data using the same scaler
    new_features_scaled = scaler.transform(new_features)

    # Predict risk
    predicted_risk = model.predict(new_features_scaled)
    predicted_risk_label = label_encoder.inverse_transform(predicted_risk)

    return predicted_risk_label[0]

# Example usage
chemical_input = input("Enter chemical names (separate by ' + '): ")
quantity_input = input("Enter quantities (separate by ' + '): ")
category_input = input("Enter categories (separate by ', '): ")

predicted_risk = predict_chemical_risk(chemical_input, quantity_input, category_input)

print("Predicted Risk Level:", predicted_risk)


Enter chemical names (separate by ' + '): salt + water
Enter quantities (separate by ' + '): 30g+800g
Enter categories (separate by ', '): liquid
Predicted Risk Level: Medium


In [16]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_balanced)  # Scale only for XGBoost
X_test_scaled = scaler.transform(X_test)




In [None]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(
    n_estimators=200,
    max_depth=10,
    learning_rate=0.1,
    subsample=0.8,
    random_state=42
)
xgb_model.fit(X_train_scaled, y_train_balanced)  # Use scaled data for XGBoost
y_pred_xgb = xgb_model.predict(X_test_scaled)


from sklearn.metrics import classification_report, accuracy_score
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_distributions,
    n_iter=50, cv=5, scoring='accuracy', n_jobs=-1, random_state=42
)
random_search.fit(X_train_balanced, y_train_balanced)

best_rf = random_search.best_estimator_  # Use best RF model
print("Best Random Forest Parameters:", random_search.best_params_)


In [None]:
xgb_model.set_params(verbosity=0)


In [None]:
import sklearn
print(sklearn.__version__)  # Ensure it's 1.0 or newer
!pip install -U scikit-learn


In [None]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_model = XGBClassifier(n_estimators=100, random_state=42)

rf_model.fit(X_train_balanced, y_train_balanced)
xgb_model.fit(X_train_balanced, y_train_balanced)

voting_clf = VotingClassifier(
    estimators=[('rf', rf_model), ('xgb', xgb_model)],
    voting='soft'
)

voting_clf.fit(X_train_balanced, y_train_balanced)


In [None]:
from sklearn.ensemble import VotingClassifier

# Ensure XGBoost is compatible
xgb_model.set_params(verbosity=0)

# Create Voting Classifier
voting_clf = VotingClassifier(
    estimators=[
        ('rf', best_rf),  # Best Random Forest model
        ('xgb', xgb_model)  # XGBoost
    ],
    voting='soft'
)

# Fit the classifier
voting_clf.fit(X_train_balanced, y_train_balanced)

# Make predictions
y_pred_voting = voting_clf.predict(X_test)


In [None]:
from sklearn.metrics import classification_report, accuracy_score

print("Random Forest Accuracy:", accuracy_score(y_test, best_rf.predict(X_test)))
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Voting Classifier Accuracy:", accuracy_score(y_test, y_pred_voting))

print("\nRandom Forest Report:\n", classification_report(y_test, best_rf.predict(X_test)))
print("\nXGBoost Report:\n", classification_report(y_test, y_pred_xgb))
print("\nVoting Classifier Report:\n", classification_report(y_test, y_pred_voting))
