<a href="https://colab.research.google.com/github/SiddharthGoel/test/blob/master/ModelCreation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
import joblib
import os

# Show which scikit-learn release this notebook is running against.
print(sklearn.__version__)


# Step 1: Read the cleaned mushroom table directly from the GitHub raw URL.
data = pd.read_csv(
    "https://raw.githubusercontent.com/SiddharthGoel/MushroomClassification/main/mushroom_cleaned.csv"
)
df = pd.DataFrame(data)

# Step 2: Every column except 'class' is a feature; 'class' is the label.
cols = list(df.columns)
cols.remove('class')
X = df.loc[:, cols]
y = df.loc[:, "class"]

# Step 3: Hold out 20% of the rows for testing. The split is seeded and
# stratified on the label so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

1.5.0


In [2]:
# Model under tuning: standardise the features, then fit a seeded random forest.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(random_state=42)),
])

# Hyper-parameter candidates. The 'rf__' prefix routes each entry to the
# 'rf' step of the pipeline above.
param_grid = {
    'rf__n_estimators': [50, 75, 100, 150],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
}

# Exhaustive search over the grid, scored by accuracy with 5-fold
# cross-validation on the training split, using all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Winning configuration, its mean cross-validation accuracy, and the fitted winner.
best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_model = grid_search.best_estimator_

print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)

# Score the held-out rows with the best pipeline found by the search.
y_pred = best_model.predict(X_test)

# Test features side by side with the true and predicted labels
# (y_test is aligned on the row index, y_pred is positional).
X_test_with_preds = X_test.assign(actual=y_test, predicted=y_pred)

# Rows where the prediction disagrees with the true label.
misclassified = X_test_with_preds.loc[
    X_test_with_preds['actual'].ne(X_test_with_preds['predicted'])
]

print("Number of misclassifications:", misclassified.shape[0])
print("Misclassified samples:\n", misclassified.head())

# Descriptive statistics over the misclassified rows only, to see where the model fails.
misclassified_summary = misclassified.describe(include='all')
print("Summary of misclassified samples:\n", misclassified_summary)

# Per-class precision/recall/F1 and overall accuracy on the test split.
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

Best Parameters: {'rf__max_depth': 30, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 100}
Best Cross-Validation Score: 0.9897519916804771
Number of misclassifications: 117
Misclassified samples:
        cap-diameter  cap-shape  gill-attachment  gill-color  stem-height  \
23099           605          2                6          10     0.071196   
28466           407          5                1           7     0.187438   
26042          1329          5                1          10     0.064090   
909             720          6                0          10     1.885615   
23050           629          6                6          10     0.202503   

       stem-width  stem-color    season  actual  predicted  
23099        1613          11  0.943195       0          1  
28466         709           6  0.888450       0          1  
26042        2451          11  1.804273       1          0  
909          1844          11  0.888450       0          1  
23050        

In [4]:
# Grid-search keys have the form '<step>__<param>'. Split them into the
# random-forest settings (prefix 'rf__' removed so they can be passed to the
# constructor) and settings that belong to any other pipeline step.
# Only the leading 'rf__' is stripped: splitting on every '__' would truncate
# nested parameter names and would hand other steps' settings to the forest.
rf_prefix = 'rf__'
cleaned_params = {
    key[len(rf_prefix):]: value
    for key, value in best_params.items()
    if key.startswith(rf_prefix)
}
other_step_params = {
    key: value
    for key, value in best_params.items()
    if not key.startswith(rf_prefix)
}
print(cleaned_params)

# Train on the full dataset with the best parameters.
# The forest step is named 'brf' here (not 'rf'); later cells look it up by that name.
best_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('brf', RandomForestClassifier(**cleaned_params, random_state=42))
])

# Tuned settings for steps other than the forest keep their original
# '<step>__<param>' key and are applied unchanged (no-op for the current grid).
if other_step_params:
    best_pipeline.set_params(**other_step_params)

# Fit the pipeline on the full dataset.
# NOTE: X and y include the rows of the earlier test split, so this fitted
# pipeline must not be re-scored on X_test as if those rows were unseen.
best_pipeline.fit(X, y)

# Save the fitted pipeline (scaler + forest) to the working directory.
model_filename = 'mushroom_classifier_pipeline.pkl'
joblib.dump(best_pipeline, model_filename)
print(f"Pipeline saved to {model_filename}")

{'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Pipeline saved to mushroom_classifier_pipeline.pkl


In [5]:
# Pull the fitted random forest out of the final pipeline and read its
# per-feature importance scores.
importances = best_pipeline['brf'].feature_importances_

# One row per feature column of X, ordered from most to least important.
feature_importances = pd.DataFrame(
    {'feature': X.columns, 'importance': importances}
).sort_values('importance', ascending=False)

print("Feature Importances:\n", feature_importances)

Feature Importances:
            feature  importance
5       stem-width    0.221366
2  gill-attachment    0.150775
3       gill-color    0.128340
0     cap-diameter    0.125613
6       stem-color    0.123516
4      stem-height    0.118625
1        cap-shape    0.093440
7           season    0.038325
