In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Step 1: Load Dataset
# Reads the mushroom data from the working directory. A column named "class"
# must be present; it is used as the target in Step 4.
df = pd.read_csv("mushroom.csv")

# Step 2: Data Cleaning (Removing Byte-Strings if needed)
# Removes every occurrence of "b'" and of "'" from string values, so a cell
# stored as the byte-string repr "b'x'" becomes the bare token "x".
df = df.replace({r"b'": "", r"'": ""}, regex=True)

# Step 3: Encode Categorical Features (Label Encoding)
# Each column gets its own LabelEncoder so every fitted mapping is retained.
# Re-fitting one shared encoder column after column would leave it holding
# only the last column's classes, so it could not decode the target.
encoders = {column: LabelEncoder() for column in df.columns}
df_encoded = pd.DataFrame(
    {column: encoders[column].fit_transform(df[column]) for column in df.columns},
    index=df.index,
)
# `label_encoder` is kept as a name and refers to the target's encoder, so
# label_encoder.inverse_transform(...) maps predictions back to class labels.
label_encoder = encoders["class"]
# NOTE(review): integer codes impose an arbitrary order on nominal features,
# which linear and kernel models will treat as meaningful. Consider one-hot
# encoding X; confirm with downstream cells before changing the layout.

# Step 4: Split Data into Features and Target Variable
X = df_encoded.drop(columns=["class"])  # Features: every column except the target
y = df_encoded["class"]  # Target variable (Edible or Poisonous)

# Step 5: Split Dataset into Training (80%) and Testing (20%)
# stratify=y keeps the class proportions the same in both partitions, and the
# fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 6: Define Models
# Maps a display name to an unfitted estimator. The tree-based models draw
# random numbers while fitting, so they are seeded with the same value as the
# train/test split (42) to make the reported scores reproducible run to run.
models = {
    # max_iter is raised above the library default to give the solver more
    # iterations to converge.
    "Logistic Regression": LogisticRegression(max_iter=2000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(),
}

# Step 7: Perform Cross-Validation and Evaluate Accuracy
# cv_results maps each model's display name to its mean accuracy over the
# folds. Only the training partition is used here, so the held-out test set
# stays untouched for a final evaluation.
cv_results = {}
k_folds = 5  # Number of cross-validation folds

for name, model in models.items():
    # cross_val_score fits clones of the estimator, so the objects stored in
    # `models` remain unfitted after this loop.
    scores = cross_val_score(model, X_train, y_train, cv=k_folds, scoring="accuracy")
    cv_results[name] = scores.mean()

# Step 8: Display Cross-Validation Results
print("Cross-Validation Performance:")
# Distinct loop names are used so `model` is not rebound from an estimator
# to a plain string by the reporting loop.
for model_name, mean_accuracy in cv_results.items():
    print(f"{model_name}: {mean_accuracy:.4f}")


Cross-Validation Performance:
Logistic Regression: 0.9595
Decision Tree: 1.0000
Random Forest: 1.0000
SVM: 0.9925
