In [4]:
pip install scikit-learn imbalanced-learn



In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Example data creation (replace this with your actual data loading)
np.random.seed(42)  # For reproducibility
data = {
    'vegetable_oils_and_resins': np.random.uniform(0, 50, 1000),
    'methane_and_ethane': np.random.uniform(0, 20, 1000),
    'white_phosphorus': np.random.uniform(0, 2, 1000),
    'sulfur': np.random.uniform(0, 10, 1000),
    'animal_fats': np.random.uniform(0, 40, 1000),

    'human_chemicals': np.random.uniform(0, 5, 1000),
    'nitrogenous_compounds': np.random.uniform(0, 30, 1000),
    'alert': np.random.choice([0, 1], size=1000, p=[0.1, 0.9])  # Imbalanced target
}

df = pd.DataFrame(data)
X = df.drop('alert', axis=1)
y = df['alert']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the model with class weights
rf = RandomForestClassifier(random_state=42)

# Define the pipeline with scaling, SMOTE, and classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standardize features
    ('smote', SMOTE(random_state=42)),  # Balance classes
    ('classifier', rf)  # Random Forest classifier
])

# Grid search for hyperparameter tuning
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, 20, None],
    'classifier__class_weight': [None, 'balanced']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Predict on the test set
y_pred = grid_search.predict(X_test)

# Print classification report and accuracy
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=1))
print("Accuracy:", accuracy_score(y_test, y_pred))

# Check if risk level is high and output a message
high_risk_proportion = np.mean(y_pred)  # Proportion of predictions classified as high risk (class 1)
threshold = 0.5  # Define a threshold to consider the risk level high

if high_risk_proportion > threshold:
    print(f"Alert: High risk level detected! Proportion of high-risk predictions: {high_risk_proportion:.2f}")
else:
    print(f"Risk level is normal. Proportion of high-risk predictions: {high_risk_proportion:.2f}")

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Classification Report:
               precision    recall  f1-score   support

           0       0.14      0.15      0.14        33
           1       0.89      0.88      0.89       267

    accuracy                           0.80       300
   macro avg       0.52      0.52      0.52       300
weighted avg       0.81      0.80      0.81       300

Accuracy: 0.8033333333333333
Alert: High risk level detected! Proportion of high-risk predictions: 0.88
