In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from tqdm import tqdm

# Function to print dataframe info
def print_df_info(df: pd.DataFrame, name: str) -> None:
    """Print a quick textual overview of a DataFrame.

    Three sections are written to stdout, each introduced by a header
    containing ``name``: the ``info()`` report, the first rows (``head()``),
    and the summary statistics (``describe()``).

    Args:
        df: The DataFrame to summarize.
        name: Label used in the section headers.
    """
    print(f"\n{name} Info:")
    # info() writes its report to stdout itself and returns None, so it must
    # not be wrapped in print() -- that would add a stray "None" line.
    df.info()
    print(f"\n{name} Head:")
    print(df.head())
    print(f"\n{name} Describe:")
    print(df.describe())

# Step 1: Read the CSV files and turn them into numeric train/validation sets
print("Loading data...")
try:
    # Read the training and test tables from disk
    train_data = pd.read_csv('C:/Important Files/Python Projects/Binary Prediction of Poisonous Mushrooms/train.csv')
    test_data = pd.read_csv('C:/Important Files/Python Projects/Binary Prediction of Poisonous Mushrooms/test.csv')

    # Show a summary of both tables before touching them
    print_df_info(train_data, "Train Data")
    print_df_info(test_data, "Test Data")

    # Features are every column except the row id and the label;
    # the label column becomes the target
    X = train_data.drop(columns=['id', 'class'])
    y = train_data['class']

    # Object-dtype feature columns are the ones that need integer codes
    categorical_columns = X.select_dtypes(include='object').columns

    print("Preprocessing data...")
    print("Encoding categorical variables...")
    # A single encoder instance is re-fitted for each column in turn
    le = LabelEncoder()
    for col in tqdm(categorical_columns):
        # Fit on train and test values together so that a value present in
        # only one of the two tables still receives a code
        combined = pd.concat([X[col], test_data[col]])
        le.fit(combined)

        # Replace the original values with their codes in both tables
        X[col] = le.transform(X[col])
        test_data[col] = le.transform(test_data[col])

    # Binary target: 1 where the label is 'p' (poisonous), 0 otherwise
    y = y.eq('p').astype(int)

    # Keep 20% of the rows aside for validation; the seed fixes the split
    X_train, X_val, y_train, y_val = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
    )

    print("Data preprocessing completed.")

except Exception as e:
    # Report which stage failed, then let the original error propagate
    print(f"An error occurred during data preprocessing: {str(e)}")
    raise

# Step 2: Build the estimator, fit it, and report its scores
try:
    print("Creating and training the model...")

    # Two stages: fill any remaining missing values with the constant -1,
    # then classify with a 100-tree random forest (seeded)
    pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=-1)),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
    ])

    # Train on the training split
    pipeline.fit(X_train, y_train)

    # 5-fold cross-validation over the same training split
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Mean CV score: {cv_scores.mean():.4f}")

    # Score the fitted pipeline on the held-out validation split
    val_score = pipeline.score(X_val, y_val)
    print(f"Validation score: {val_score:.4f}")

except Exception as e:
    # Report which stage failed, then let the original error propagate
    print(f"An error occurred during model training: {str(e)}")
    raise

# Step 3: Make predictions and create the submission file
try:
    print("Making predictions and creating submission file...")
    # 'id' was dropped from the training features, so drop it from the
    # test features as well before predicting
    test_predictions = pipeline.predict(test_data.drop('id', axis=1))

    # Map the numeric predictions back to the original labels with a single
    # vectorized call (1 -> 'p', anything else -> 'e') instead of a
    # Python-level loop over every test row
    submission = pd.DataFrame({
        'id': test_data['id'],
        'class': np.where(test_predictions == 1, 'p', 'e'),
    })

    # Write the two-column submission without the DataFrame index
    submission.to_csv('C:/Important Files/Python Projects/Binary Prediction of Poisonous Mushrooms/submission.csv', index=False)

    print("Submission file created successfully!")

except Exception as e:
    # Report which stage failed, then let the original error propagate
    print(f"An error occurred during prediction or submission file creation: {str(e)}")
    raise

Loading data...

Train Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3116945 entries, 0 to 3116944
Data columns (total 22 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   class                 object 
 2   cap-diameter          float64
 3   cap-shape             object 
 4   cap-surface           object 
 5   cap-color             object 
 6   does-bruise-or-bleed  object 
 7   gill-attachment       object 
 8   gill-spacing          object 
 9   gill-color            object 
 10  stem-height           float64
 11  stem-width            float64
 12  stem-root             object 
 13  stem-surface          object 
 14  stem-color            object 
 15  veil-type             object 
 16  veil-color            object 
 17  has-ring              object 
 18  ring-type             object 
 19  spore-print-color     object 
 20  habitat               object 
 21  season                object 
dtypes: float

100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [00:10<00:00,  1.67it/s]


Data preprocessing completed.
Creating and training the model...
Cross-validation scores: [0.99220993 0.99196729 0.99192518 0.99225403 0.99210565]
Mean CV score: 0.9921
Validation score: 0.9920
Making predictions and creating submission file...
Submission file created successfully!
