Step 1: Load and Explore the Mushroom Dataset

In [None]:
import pandas as pd

In [None]:
# Load dataset
file_path = r"D:\DATA SCIENCE\ASSIGNMENTS\17 SVM\SVM\mushroom.csv"
mushroom_df = pd.read_csv(file_path)

In [None]:
# Basic exploration
print("Dataset shape:", mushroom_df.shape)
print("\nColumn names:\n", list(mushroom_df.columns))
print("\nData types:\n", mushroom_df.dtypes)
print("\nMissing values per column:\n", mushroom_df.isnull().sum())

In [None]:
# Preview dataset
print("\nFirst 5 rows:\n", mushroom_df.head())

In [None]:
# Descriptive summary (includes both numeric and categorical)
print("\nSummary statistics:\n", mushroom_df.describe(include='all'))

Step 2: Visualizing Feature Distributions

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# --- Categorical Features ---
cat_features = ['cap_shape', 'cap_surface', 'cap_color', 'odor', 'habitat', 'class']
plt.figure(figsize=(15, 10))
for i, feature in enumerate(cat_features, 1):
    plt.subplot(2, 3, i)
    sns.countplot(x=feature, data=mushroom_df, palette='viridis')
    plt.title(f'Distribution of {feature}')
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# --- Numerical Features ---
num_features = ['stalk_height', 'cap_diameter']

In [None]:
# Histograms
plt.figure(figsize=(12, 5))
for i, feature in enumerate(num_features, 1):
    plt.subplot(1, 2, i)
    sns.histplot(mushroom_df[feature], kde=True, color='teal')
    plt.title(f'{feature} Distribution')
plt.tight_layout()
plt.show()

In [None]:
# Boxplots
plt.figure(figsize=(12, 5))
for i, feature in enumerate(num_features, 1):
    plt.subplot(1, 2, i)
    sns.boxplot(x=mushroom_df[feature], color='orange')
    plt.title(f'Boxplot of {feature}')
plt.tight_layout()
plt.show()

In [None]:
"""
feature_correlations.py
Full pipeline to investigate feature correlations (robust loader + debug prints).
Drop this file into your project and run with your venv python.
"""

In [None]:
import os
import sys
import warnings
from itertools import combinations
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency, pointbiserialr

In [None]:
warnings.filterwarnings("ignore")
sns.set(style="whitegrid", font_scale=1.0)

In [None]:
# ---------- User config ----------
CSV_PATH = r"D:\DATA SCIENCE\ASSIGNMENTS\17 SVM\SVM\mushroom.csv"
TARGET_COL = None
OUTPUT_DIR = r"D:\DATA SCIENCE\ASSIGNMENTS\17 SVM\SVM\correlation_outputs"
DROP_THRESHOLD_NUNIQUE = 1
FILLNA_STRATEGY = "mode"
# ---------------------------------

In [None]:
# quick sanity checks
print("Current working directory:", os.getcwd())
print("CSV_PATH (raw):", CSV_PATH)
print("CSV_PATH (absolute):", os.path.abspath(CSV_PATH))
print("CSV_PATH exists?", os.path.exists(CSV_PATH))
print("Readable by current user?", os.access(os.path.abspath(CSV_PATH), os.R_OK))

In [None]:
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# ---------- Robust load ----------
def try_read_csv(path):
    """Read a CSV by trying common separator/encoding combinations.

    ``pd.read_csv`` does not fail when the separator is wrong; it simply
    returns a frame with a single column. A successful parse is therefore
    only accepted immediately when it yields more than one column. If every
    successful attempt yields one column, the first such result is returned
    (the file may genuinely have a single column).

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.

    Returns
    -------
    tuple
        ``(df, used_params)`` where ``used_params`` is the keyword dict that
        was passed to ``pd.read_csv``.

    Raises
    ------
    Exception
        The last exception raised by the explicit attempts, when those and
        the delimiter-sniffing fallback all fail.
    """
    attempts = [
        {"sep": ",", "encoding": "utf-8"},
        {"sep": ",", "encoding": "latin1"},
        {"sep": ";", "encoding": "utf-8"},
        {"sep": "\t", "encoding": "utf-8"},
        {"sep": ";", "encoding": "latin1"},
        {"sep": "\t", "encoding": "latin1"},
    ]
    last_exc = None
    single_column_result = None
    for params in attempts:
        try:
            df = pd.read_csv(path, **params)
        except Exception as e:
            last_exc = e
            continue
        if df.shape[1] > 1:
            return df, params
        # One column usually means "wrong separator": keep looking, but
        # remember the first such parse in case nothing better turns up.
        if single_column_result is None:
            single_column_result = (df, params)

    if single_column_result is not None:
        return single_column_result

    # Final fallback: sep=None with the python engine makes pandas sniff the
    # delimiter (slower but forgiving).
    fallback_params = {"sep": None, "engine": "python"}
    try:
        df = pd.read_csv(path, **fallback_params)
    except Exception as fallback_exc:
        if last_exc is None:
            raise
        raise last_exc from fallback_exc
    return df, fallback_params

In [None]:
# If user provided a DataFrame in the environment (rare here), use that
df = globals().get("mushroom_df")
if df is not None:
    # Work on a copy: the cleaning steps below assign into df's columns, which
    # would otherwise also modify the caller's original mushroom_df.
    df = df.copy()

In [None]:
if df is None:
    if not os.path.exists(CSV_PATH):
        sys.exit(f"File not found: {os.path.abspath(CSV_PATH)}\nCheck path, spelling, and that the drive is accessible.")
    try:
        df, used = try_read_csv(CSV_PATH)
        print("Loaded CSV successfully with params:", used)
    except PermissionError as pe:
        sys.exit(f"Permission error reading file: {pe}\nCheck file permissions.")
    except Exception as e:
        # show full info to help debugging
        import traceback
        tb = traceback.format_exc()
        sys.exit(f"Failed to read CSV. Last exception:\n{e}\n\nTraceback:\n{tb}")

In [None]:
# Basic confirmation
print("Dataframe shape:", getattr(df, "shape", None))
print("First 5 rows:")
print(df.head().to_string(index=False))
print("\nDataFrame info:")
print(df.info())

In [None]:
# ---------- Basic cleaning ----------
try:
    nunique = df.nunique(dropna=True)
    const_cols = list(nunique[nunique <= DROP_THRESHOLD_NUNIQUE].index)
    if const_cols:
        print(f"Dropping constant / low-variance columns: {const_cols}")
        df = df.drop(columns=const_cols)
except Exception as e:
    print("Warning during dropping constant columns:", e)

In [None]:
# Fill NAs simply (user can adjust).
# The filled column is assigned back to df instead of calling
# fillna(..., inplace=True) on df[c]: the in-place form operates on a
# temporary column object and is not guaranteed to update df.
if FILLNA_STRATEGY == "mode":
    for c in df.columns:
        if not df[c].isna().any():
            continue
        modes = df[c].mode()
        if not modes.empty:
            df[c] = df[c].fillna(modes.iloc[0])
        else:
            # An all-NaN column has no mode; keep the original forward-fill
            # fallback (which leaves such a column unchanged).
            df[c] = df[c].ffill()
elif FILLNA_STRATEGY == "median":
    for c in df.select_dtypes(include=[np.number]).columns:
        if df[c].isna().any():
            df[c] = df[c].fillna(df[c].median())

In [None]:
# Automatic dtype coercion for mostly-numeric object columns:
# a column is converted when more than 60% of its values parse as numbers.
# Values that do not parse become NaN in the converted column.
for col in df.columns:
    if df[col].dtype == "object":
        coerced = pd.to_numeric(df[col], errors="coerce")
        n_values = len(coerced)
        # Guard against an empty frame, where the ratio would divide by zero.
        if n_values and coerced.notna().sum() / n_values > 0.6:
            df[col] = coerced

In [None]:
# ---------- Split numeric & categorical ----------
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

In [None]:
print(f"\nNumeric columns ({len(num_cols)}): {num_cols}")
print(f"Categorical columns ({len(cat_cols)}): {cat_cols}")

In [None]:
# ---------- Helpers ----------
def savefig_and_show(fig, fname):
    """Save *fig* under OUTPUT_DIR as *fname*, report the path, and close it.

    Note that, despite the name, the figure is closed after saving rather
    than displayed.
    """
    out_path = os.path.join(OUTPUT_DIR, fname)
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    print(f"Saved figure to {out_path}")
    plt.close(fig)

In [None]:
def cramers_v(series_x, series_y):
    """Return Cramér's V association between two categorical series.

    The value is derived from the chi-square statistic of the cross-tabulated
    counts, with (n - 1)-based correction terms applied to phi-squared and to
    the table dimensions. Returns NaN for an empty table and 0.0 when the
    corrected denominator is zero (e.g. one of the series is constant).
    """
    table = pd.crosstab(series_x, series_y)
    if table.size == 0:
        return np.nan

    chi2 = chi2_contingency(table)[0]
    n = table.sum().sum()
    if n == 0:
        return np.nan

    n_rows, n_cols = table.shape
    phi2 = chi2 / n
    # Corrected phi-squared is clipped at zero so the square root stays real.
    phi2_adj = max(0, phi2 - ((n_cols - 1)*(n_rows - 1)) / (n - 1))
    rows_adj = n_rows - ((n_rows - 1)**2) / (n - 1)
    cols_adj = n_cols - ((n_cols - 1)**2) / (n - 1)
    denom = min(cols_adj - 1, rows_adj - 1)
    return 0.0 if denom == 0 else np.sqrt(phi2_adj / denom)

In [None]:
def correlation_ratio(categories, measurements):
    """Return the correlation ratio (eta) of a numeric variable by category.

    Rows where either input is missing are ignored. The result is the square
    root of (between-group sum of squares / total sum of squares); NaN when
    no rows remain and 0.0 when the measurements have no variance.
    """
    cats = pd.Series(categories)
    vals = pd.Series(measurements)
    keep = cats.notna() & vals.notna()
    cats, vals = cats[keep], vals[keep]
    if len(vals) == 0:
        return np.nan

    grand_mean = vals.mean()
    between = sum(
        group.size * (group.mean() - grand_mean)**2
        for _, group in vals.groupby(cats)
    )
    total = ((vals - grand_mean)**2).sum()
    if total == 0:
        return 0.0
    return np.sqrt(between / total)

In [None]:
# ---------- Numeric correlations ----------
# Writes Pearson/Spearman matrices, a Pearson heatmap, and the 20 numeric
# pairs with the largest absolute Pearson correlation to OUTPUT_DIR.
if not num_cols:
    print("No numeric columns found; skipping numeric correlation.")
else:
    pearson = df[num_cols].corr(method="pearson")
    spearman = df[num_cols].corr(method="spearman")
    for matrix, csv_name in (
        (pearson, "pearson_correlation_matrix.csv"),
        (spearman, "spearman_correlation_matrix.csv"),
    ):
        matrix.to_csv(os.path.join(OUTPUT_DIR, csv_name))
    print("Saved numeric correlation matrices (pearson, spearman).")

    # Figure grows with the number of numeric columns, with a minimum size.
    scaled_side = len(num_cols)*0.5
    fig, ax = plt.subplots(figsize=(max(6, scaled_side), max(4, scaled_side)))
    sns.heatmap(
        pearson,
        annot=True,
        fmt=".2f",
        cmap="coolwarm",
        square=False,
        cbar_kws={'shrink': .6},
        linewidths=.5,
    )
    ax.set_title("Pearson Correlation (Numeric features)")
    savefig_and_show(fig, "pearson_heatmap.png")

    pairs = [(a, b, pearson.loc[a, b]) for a, b in combinations(num_cols, 2)]
    top_abs = sorted(pairs, key=lambda item: -abs(item[2]))[:20]
    top_df = pd.DataFrame(top_abs, columns=["feature_a", "feature_b", "pearson_corr"])
    top_df.to_csv(
        os.path.join(OUTPUT_DIR, "top_numeric_pairs_by_abs_pearson.csv"),
        index=False,
    )
    print("Saved top numeric correlated pairs.")

In [None]:
# ---------- Categorical vs Categorical (Cramér's V) ----------
# Builds a symmetric matrix of pairwise Cramér's V values, saves it with a
# heatmap, and records the 30 strongest pairs in top_cat.
if len(cat_cols) >= 2:
    cramers_matrix = pd.DataFrame(index=cat_cols, columns=cat_cols, dtype=float)
    for a, b in combinations(cat_cols, 2):
        v = cramers_v(df[a], df[b])
        cramers_matrix.loc[a, b] = v
        cramers_matrix.loc[b, a] = v
    # Set the diagonal through .loc rather than writing into
    # cramers_matrix.values: that array is not guaranteed to be a writable
    # view of the frame.
    for c in cat_cols:
        cramers_matrix.loc[c, c] = 1.0
    cramers_matrix = cramers_matrix.fillna(0.0).astype(float)
    cramers_matrix.to_csv(os.path.join(OUTPUT_DIR, "cramers_v_matrix.csv"))
    print("Saved Cramér's V matrix for categorical features.")
    fig, ax = plt.subplots(figsize=(max(6, len(cat_cols)*0.35), max(6, len(cat_cols)*0.35)))
    # Draw on the axes created above instead of relying on the current axes.
    sns.heatmap(cramers_matrix, annot=True, fmt=".2f", cmap="vlag", linewidths=.3, ax=ax)

    ax.set_title("Cramér's V (Categorical vs Categorical)")
    savefig_and_show(fig, "cramers_v_heatmap.png")
    cat_pairs = [(a, b, cramers_matrix.loc[a, b]) for a, b in combinations(cat_cols, 2)]
    top_cat = sorted(cat_pairs, key=lambda x: -x[2])[:30]
    pd.DataFrame(top_cat, columns=["cat_a", "cat_b", "cramers_v"]).to_csv(
        os.path.join(OUTPUT_DIR, "top_categorical_pairs_by_cramers.csv"), index=False)
    print("Saved top categorical pairs by Cramér's V.")
else:
    print("Not enough categorical columns for Cramér's V (need >=2).")

In [None]:
# ---------- Categorical -> Numeric (Correlation ratio) ----------
# One eta value per (categorical, numeric) pair; the matrix, its heatmap and
# the per-category top-10 numeric columns are written to OUTPUT_DIR.
if not (cat_cols and num_cols):
    print("Skipping categorical->numeric eta matrix (need both categorical and numeric columns).")
else:
    eta_matrix = pd.DataFrame(index=cat_cols, columns=num_cols, dtype=float)
    for c in cat_cols:
        for n in num_cols:
            eta_matrix.loc[c, n] = correlation_ratio(df[c], df[n])
    eta_matrix = eta_matrix.fillna(0.0).astype(float)
    eta_matrix.to_csv(os.path.join(OUTPUT_DIR, "eta_correlation_ratio_matrix.csv"))
    print("Saved correlation ratio (eta) matrix for categorical->numeric.")

    fig_width = max(6, len(num_cols)*0.5)
    fig_height = max(4, len(cat_cols)*0.25)
    fig, ax = plt.subplots(figsize=(fig_width, fig_height))
    sns.heatmap(eta_matrix, annot=True, fmt=".2f", cmap="YlGnBu", linewidths=.3)
    ax.set_title("Correlation Ratio (categorical -> numeric) η")
    savefig_and_show(fig, "eta_heatmap.png")

    # For each categorical column keep its (up to) ten largest eta values.
    top_eta_rows = [
        (c, n, val)
        for c in cat_cols
        for n, val in eta_matrix.loc[c].sort_values(ascending=False)[:10].items()
    ]
    pd.DataFrame(top_eta_rows, columns=["categorical", "numeric", "eta"]).to_csv(
        os.path.join(OUTPUT_DIR, "top_categorical_to_numeric_eta.csv"), index=False)
    print("Saved categorical -> numeric top explanations (eta).")

In [None]:
# ---------- Extra: numeric vs binary categorical using point-biserial (if present) ----------
# nunique() ignores missing values, so a "binary" column may still contain
# NaN; pd.Categorical encodes those as code -1. Rows with a missing category
# or a missing numeric value are excluded before computing the correlation.
binary_cat = [c for c in cat_cols if df[c].nunique() == 2]
if binary_cat and num_cols:
    pb_list = []
    for c in binary_cat:
        codes = pd.Series(pd.Categorical(df[c]).codes, index=df.index)
        for n in num_cols:
            valid = (codes >= 0) & df[n].notna()
            try:
                r, p = pointbiserialr(codes[valid], df[n][valid])
            except Exception as exc:
                # Best effort: record the pair as NaN but say why.
                print(f"Point-biserial failed for ({c}, {n}): {exc}")
                r, p = np.nan, np.nan
            pb_list.append((c, n, r, p))
    pb_df = pd.DataFrame(pb_list, columns=["binary_cat", "numeric", "pointbiserial_r", "p_value"])
    pb_df.to_csv(os.path.join(OUTPUT_DIR, "pointbiserial_binary_cat_numeric.csv"), index=False)
    print("Saved point-biserial correlations for binary categorical features.")
else:
    print("No binary categorical columns or no numeric columns found; skipping point-biserial step.")

In [None]:
# ---------- Summary output: top correlations consolidated ----------
# Each section is included under the same condition that created its source
# data in the cells above (top_df, top_cat, top_eta_rows).
summary_rows = []
if num_cols:
    for _, r in top_df.iterrows():
        summary_rows.append({
            "type": "numeric-numeric",
            "a": r['feature_a'],
            "b": r['feature_b'],
            "score": r['pearson_corr']
        })
if len(cat_cols) >= 2:
    for row in top_cat:
        summary_rows.append({
            "type": "cat-cat",
            "a": row[0],
            "b": row[1],
            "score": row[2]
        })
if cat_cols and num_cols:
    for c, n, val in top_eta_rows:
        summary_rows.append({
            "type": "cat->num",
            "a": c,
            "b": n,
            "score": val
        })
# Explicit columns keep the "score" column present even when no rows were
# collected, so sort_values does not raise KeyError on an empty summary.
summary_df = pd.DataFrame(
    summary_rows, columns=["type", "a", "b", "score"]
).sort_values(by="score", key=lambda col: col.abs(), ascending=False)
summary_df.to_csv(os.path.join(OUTPUT_DIR, "consolidated_top_correlations.csv"), index=False)
print(f"Saved consolidated top correlations to {os.path.join(OUTPUT_DIR, 'consolidated_top_correlations.csv')}")
print("\nDONE — All outputs are in the folder:", OUTPUT_DIR)

In [None]:
# step2_preprocessing.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the raw mushroom dataset (encoding is applied further below)
file_path = r"D:\DATA SCIENCE\ASSIGNMENTS\17 SVM\SVM\mushroom.csv"
mushroom_df = pd.read_csv(file_path)

In [None]:
# Display shape and first few rows
print("Initial dataset shape:", mushroom_df.shape)
print(mushroom_df.head())

--- Step 1: Encode Categorical Variables ---

In [None]:
# Separate features and target
X = mushroom_df.drop('class', axis=1)
y = mushroom_df['class']

In [None]:
# Encode target label (edible/poisonous)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [None]:
# Perform one-hot encoding for categorical predictors
X_encoded = pd.get_dummies(X, drop_first=True)

In [None]:
print("After encoding:")
print("Feature matrix shape:", X_encoded.shape)
print("Target vector shape:", y_encoded.shape)

--- Step 2: Split Dataset ---

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

In [None]:
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

In [None]:
# --- Save preprocessed data ---
X_train.to_csv(r"D:\DATA SCIENCE\ASSIGNMENTS\17 SVM\SVM\X_train.csv", index=False)
X_test.to_csv(r"D:\DATA SCIENCE\ASSIGNMENTS\17 SVM\SVM\X_test.csv", index=False)
pd.DataFrame(y_train, columns=['class']).to_csv(r"D:\DATA SCIENCE\ASSIGNMENTS\17 SVM\SVM\y_train.csv", index=False)
pd.DataFrame(y_test, columns=['class']).to_csv(r"D:\DATA SCIENCE\ASSIGNMENTS\17 SVM\SVM\y_test.csv", index=False)

In [None]:
print("Preprocessing complete. Encoded and split data saved successfully.")

In [None]:
# step3_visualization.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load dataset
file_path = r"D:\DATA SCIENCE\ASSIGNMENTS\17 SVM\SVM\mushroom.csv"
mushroom_df = pd.read_csv(file_path)

In [None]:
sns.set(style="whitegrid", palette="Set2")

--- Step 1: Feature Distributions and Relationships ---

In [None]:
# Correct column names for UCI Mushroom dataset
selected_features = ['odor', 'spore_print_color', 'gill_color', 'cap_color', 'habitat']

In [None]:
# Sanity check for columns
print("Columns in dataset:", mushroom_df.columns.tolist())
for feature in selected_features:
    if feature not in mushroom_df.columns:
        print(f"⚠️ Warning: Column '{feature}' not found in dataset!")

In [None]:
# Plot class-wise counts only for the selected features that actually exist
# in the dataset; a missing column would otherwise make countplot raise.
available_features = [f for f in selected_features if f in mushroom_df.columns]
if available_features:
    # Three plots per row; keep at least the original 2-row layout.
    n_rows = max(2, -(-len(available_features) // 3))
    plt.figure(figsize=(14, 10))
    for i, feature in enumerate(available_features, 1):
        plt.subplot(n_rows, 3, i)
        sns.countplot(x=feature, hue='class', data=mushroom_df)
        plt.title(f"Distribution of {feature} by Class")
        plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(r"D:\DATA SCIENCE\ASSIGNMENTS\17 SVM\SVM\feature_distributions.png", dpi=150)
    plt.show()
else:
    print("None of the selected features are present; skipping feature distribution plots.")

In [None]:
# --- Step 2: Class Distribution Visualization ---
plt.figure(figsize=(6, 5))
sns.countplot(x='class', data=mushroom_df, palette='Set1')
plt.title("Class Distribution (Edible vs. Poisonous)")
plt.xlabel("Class")
plt.ylabel("Count")
plt.savefig(r"D:\DATA SCIENCE\ASSIGNMENTS\17 SVM\SVM\class_distribution.png", dpi=150)
plt.show()

In [None]:
"""
step4_svm.py
Task 4: SVM Implementation
- Loads data (either original CSV or pre-split X_train/X_test files if available)
- Encodes categorical variables (one-hot)
- Standardizes features
- Trains SVM with small hyperparameter search
- Evaluates and saves metrics, confusion matrix, and model
"""

In [None]:
import os
import joblib
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)

In [None]:
# ---------- User config ----------
CSV_PATH = r"D:\DATA SCIENCE\ASSIGNMENTS\17 SVM\SVM\mushroom.csv"
OUTPUT_DIR = r"D:\DATA SCIENCE\ASSIGNMENTS\17 SVM\SVM\correlation_outputs"
RANDOM_STATE = 42
TEST_SIZE = 0.2
USE_PRE_SPLIT = False  # If True, we'll load X_train/X_test CSVs if available
# ---------------------------------

In [None]:
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
def load_data():
    """Return ``(X_train, X_test, y_train, y_test)`` for SVM training.

    When USE_PRE_SPLIT is true and all four pre-split CSVs exist next to
    CSV_PATH, they are loaded as-is. Otherwise CSV_PATH is read, the 'class'
    column is encoded to 0/1, the remaining columns are one-hot encoded, and
    a stratified split is made using TEST_SIZE and RANDOM_STATE.

    Raises
    ------
    SystemExit
        If the 'class' column is missing, or if it is not binary.
    """
    # If pre-split files exist and user chose to use them, load those
    if USE_PRE_SPLIT:
        p = Path(CSV_PATH).parent
        xtrain = p / "X_train.csv"
        xtest = p / "X_test.csv"
        ytrain = p / "y_train.csv"
        ytest = p / "y_test.csv"
        if xtrain.exists() and xtest.exists() and ytrain.exists() and ytest.exists():
            X_train = pd.read_csv(xtrain)
            X_test = pd.read_csv(xtest)
            y_train = pd.read_csv(ytrain).iloc[:, 0].values
            y_test = pd.read_csv(ytest).iloc[:, 0].values
            print("Loaded pre-split X_train/X_test/y_train/y_test from folder.")
            return X_train, X_test, y_train, y_test
        else:
            print("Pre-split files requested but not found; falling back to single CSV load.")

    # Load full CSV, encode, and split
    df = pd.read_csv(CSV_PATH)
    if 'class' not in df.columns:
        raise SystemExit("Target column 'class' not found in CSV.")

    # astype(str) also turns missing targets into the string 'nan', which
    # then counts as its own class in the check below.
    y = df['class'].astype(str)
    X = df.drop(columns=['class'])

    # Encode target to 0/1.
    classes = sorted(y.unique())
    if set(classes) <= {'e', 'p'}:
        y_encoded = (y == 'p').astype(int).values  # poisonous=1, edible=0
    else:
        # Fallback: map the two class names to 0/1 by sorted order. Anything
        # other than exactly two classes cannot be encoded this way, so fail
        # with a clear message instead of producing NaN labels.
        if len(classes) != 2:
            raise SystemExit(
                f"Expected a binary target in 'class', found {len(classes)} distinct values: {classes}"
            )
        mapping = {classes[0]: 0, classes[1]: 1}
        y_encoded = y.map(mapping).values
        print("Target mapping used:", mapping)

    # One-hot encode features; drop_first=False keeps one indicator column
    # per category level.
    X_encoded = pd.get_dummies(X, drop_first=False)
    print("Feature matrix after one-hot encoding shape:", X_encoded.shape)

    # train-test split with stratify
    X_train, X_test, y_train, y_test = train_test_split(
        X_encoded, y_encoded, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_encoded
    )
    return X_train, X_test, y_train, y_test

In [None]:
def train_and_evaluate(X_train, X_test, y_train, y_test):
    """Tune, fit and evaluate a scaler + SVC pipeline; save model and metrics.

    A 3-fold grid search (scored by F1) selects C and kernel. The best
    pipeline is evaluated on the test split, then the model, a metrics
    summary, the classification report and the confusion matrix are written
    to OUTPUT_DIR.

    Parameters
    ----------
    X_train, X_test : feature matrices for fitting / evaluation.
    y_train, y_test : binary targets encoded as 0/1.

    Returns
    -------
    tuple
        ``(results, best_params)`` where ``results`` is a dict with keys
        accuracy, precision, recall, f1_score, classification_report and
        confusion_matrix (nested list), and ``best_params`` is the grid
        search's best parameter dict.
    """
    # Build pipeline: scaler + SVM
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("svc", SVC(probability=False))
    ])

    # Small grid for C and kernel (keeps run-time reasonable)
    param_grid = {
        "svc__C": [0.1, 1, 5],
        "svc__kernel": ["rbf", "linear"],
        "svc__gamma": ["scale"]  # keep default gamma
    }

    grid = GridSearchCV(pipe, param_grid, cv=3, scoring='f1', n_jobs=-1, verbose=1)
    print("Starting GridSearchCV for SVM (this may take a bit)...")
    grid.fit(X_train, y_train)

    best = grid.best_estimator_
    print("Best params:", grid.best_params_)
    # Predict
    y_pred = best.predict(X_test)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    cls_report = classification_report(y_test, y_pred, digits=4, zero_division=0)
    # Pin the label order so the matrix is always 2x2, matching the fixed
    # row/column names used when it is saved below.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

    results = {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1_score": f1,
        "classification_report": cls_report,
        "confusion_matrix": cm.tolist()  # convert to list for easy saving
    }

    # Save model
    model_path = os.path.join(OUTPUT_DIR, "svm_best_model.joblib")
    joblib.dump(best, model_path)
    print(f"Saved trained model to: {model_path}")

    # Save results
    results_df = pd.DataFrame({
        "metric": ["accuracy", "precision", "recall", "f1_score"],
        "value": [acc, prec, rec, f1]
    })
    results_df.to_csv(os.path.join(OUTPUT_DIR, "svm_metrics_summary.csv"), index=False)
    # Explicit encoding so the report file does not depend on the platform default.
    with open(os.path.join(OUTPUT_DIR, "svm_classification_report.txt"), "w", encoding="utf-8") as f:
        f.write(cls_report)
    pd.DataFrame(cm, index=["actual_0","actual_1"], columns=["pred_0","pred_1"]).to_csv(
        os.path.join(OUTPUT_DIR, "svm_confusion_matrix.csv")
    )
    print("Saved metrics and confusion matrix to output folder.")

    # Print summary
    print("\n--- SVM Evaluation Summary ---")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}")
    print("\nClassification report:\n", cls_report)
    print("Confusion matrix:\n", cm)

    return results, grid.best_params_

In [None]:
if __name__ == "__main__":
    X_train, X_test, y_train, y_test = load_data()
    results, best_params = train_and_evaluate(X_train, X_test, y_train, y_test)
    print("\nALL DONE — outputs in:", OUTPUT_DIR)

In [None]:
# step5_visualize_results.py
"""
Visualize SVM classification results.
Saves plots to OUTPUT_DIR.
"""

In [None]:
import os
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_curve, auc, RocCurveDisplay
)
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
# ---------- User config ----------
OUTPUT_DIR = r"D:\DATA SCIENCE\ASSIGNMENTS\17 SVM\SVM\correlation_outputs"
MODEL_PATH = os.path.join(OUTPUT_DIR, "svm_best_model.joblib")
CSV_PATH = r"D:\DATA SCIENCE\ASSIGNMENTS\17 SVM\SVM\mushroom.csv"  # fallback if pre-split not used
USE_PRE_SPLIT = False  # if you saved X_test/y_test CSVs set True and script will try to load them
PLOT_DPI = 150
RANDOM_STATE = 42
# ---------------------------------

In [None]:
os.makedirs(OUTPUT_DIR, exist_ok=True)
sns.set(style="whitegrid")

In [None]:
# ---------- Load model and data ----------
if not os.path.exists(MODEL_PATH):
    raise SystemExit(f"Model not found at {MODEL_PATH}. Run Task 4 to save svm_best_model.joblib first.")

In [None]:
model = joblib.load(MODEL_PATH)
print("Loaded model:", model)

In [None]:
# Load test data (prefer pre-split files if available)
if USE_PRE_SPLIT:
    base = os.path.dirname(CSV_PATH)
    x_test_path = os.path.join(base, "X_test.csv")
    y_test_path = os.path.join(base, "y_test.csv")
    if os.path.exists(x_test_path) and os.path.exists(y_test_path):
        X_test = pd.read_csv(x_test_path)
        y_test = pd.read_csv(y_test_path).iloc[:, 0].values
    else:
        raise SystemExit("Pre-split test files requested but not found.")
else:
    # load full CSV and split here to reproduce same split as Task 4
    df = pd.read_csv(CSV_PATH)
    if 'class' not in df.columns:
        raise SystemExit("Target column 'class' not found in CSV.")
    y = df['class'].astype(str)
    X = df.drop(columns=['class'])
    # encode y to 0/1 same logic as training script
    classes = sorted(y.unique())
    if set(classes) <= {'e', 'p'}:
        y_encoded = (y == 'p').astype(int).values
    else:
        # The sorted-order fallback only makes sense for exactly two classes.
        if len(classes) != 2:
            raise SystemExit(
                f"Expected a binary target in 'class', found {len(classes)} distinct values: {classes}"
            )
        mapping = {classes[0]: 0, classes[1]: 1}
        y_encoded = y.map(mapping).values
    X_encoded = pd.get_dummies(X, drop_first=False)
    # Ensure columns line up with what the model was fitted on, when the
    # model records its input feature names: missing indicator columns are
    # added as zeros and extra ones are dropped.
    # If the model does not expose feature names, prefer saved pre-split CSVs.
    expected_columns = getattr(model, "feature_names_in_", None)
    if expected_columns is not None:
        X_encoded = X_encoded.reindex(columns=list(expected_columns), fill_value=0)
    from sklearn.model_selection import train_test_split
    # test_size must match the value used when the model was trained (0.2 in Task 4).
    X_train, X_test, y_train, y_test = train_test_split(
        X_encoded, y_encoded, test_size=0.2, random_state=RANDOM_STATE, stratify=y_encoded
    )

In [None]:
# Convert to numpy arrays.
# The one-hot indicator columns may be boolean while other features are
# numeric; without an explicit dtype such a mixed frame converts to an
# object array, so request float directly.
X_test_arr = np.asarray(X_test, dtype=float)
y_test_arr = np.asarray(y_test)

In [None]:
# ---------- Predictions ----------
y_pred = model.predict(X_test_arr)

In [None]:
# Confusion matrix.
# labels=[0, 1] keeps the matrix 2x2 (the heatmaps below use two fixed tick
# labels per axis) even if one class is absent from y_test_arr and y_pred.
cm = confusion_matrix(y_test_arr, y_pred, labels=[0, 1])
# Row-normalize; a row with no true samples stays all zeros instead of
# producing NaN from a division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = np.divide(
    cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0
)

In [None]:
# Save confusion matrix (counts)
plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=["pred_0","pred_1"], yticklabels=["true_0","true_1"])
plt.ylabel("True")
plt.xlabel("Predicted")
plt.title("Confusion Matrix (counts)")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "svm_confusion_matrix_counts.png"), dpi=PLOT_DPI)
plt.close()

In [None]:
# Save normalized confusion matrix (percent)
plt.figure(figsize=(5,4))
sns.heatmap(cm_norm, annot=True, fmt='.2f', cmap='Blues', cbar=False,
            xticklabels=["pred_0","pred_1"], yticklabels=["true_0","true_1"])
plt.ylabel("True")
plt.xlabel("Predicted")
plt.title("Confusion Matrix (normalized)")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "svm_confusion_matrix_normalized.png"), dpi=PLOT_DPI)
plt.close()

In [None]:
# Classification report heatmap (turn report into a dataframe)
report = classification_report(y_test_arr, y_pred, output_dict=True, zero_division=0)
report_df = pd.DataFrame(report).transpose()
# Save textual report
with open(os.path.join(OUTPUT_DIR, "svm_classification_report.txt"), "w") as f:
    f.write(classification_report(y_test_arr, y_pred, zero_division=0))
report_df.to_csv(os.path.join(OUTPUT_DIR, "svm_classification_report_table.csv"))

In [None]:
# Plot classification report (precision, recall, f1) for classes.
# Only class rows that are present in the report are selected.
class_rows = [label for label in ('0', '1') if label in report_df.index]
metrics_df = report_df.loc[class_rows, ['precision','recall','f1-score']].astype(float)
# Create the axes first and hand them to DataFrame.plot; otherwise pandas
# opens its own figure and the one created here would stay empty and open.
fig, ax = plt.subplots(figsize=(6,4))
metrics_df.plot(kind='bar', ax=ax, rot=0)
ax.set_title("Precision / Recall / F1-score by Class")
fig.tight_layout()
fig.savefig(os.path.join(OUTPUT_DIR, "svm_class_metrics_bar.png"), dpi=PLOT_DPI)
plt.close(fig)

In [None]:
# ---------- ROC curve & AUC ----------
y_score = None
if hasattr(model, "decision_function"):
    try:
        y_score = model.decision_function(X_test_arr)
    except Exception:
        y_score = None

In [None]:
if y_score is None and hasattr(model, "predict_proba"):
    try:
        y_score = model.predict_proba(X_test_arr)[:, 1]
    except Exception:
        y_score = None

In [None]:
if y_score is not None:
    fpr, tpr, _ = roc_curve(y_test_arr, y_score)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(6,5))
    plt.plot(fpr, tpr, lw=2, label=f'AUC = {roc_auc:.4f}')
    plt.plot([0,1], [0,1], linestyle='--', color='gray', linewidth=1)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, "svm_roc_auc.png"), dpi=PLOT_DPI)
    plt.close()
    print("Saved ROC curve. AUC:", roc_auc)
else:
    print("Model does not expose decision_function or predict_proba. ROC curve skipped.")

In [None]:
# ---------- 2D Embeddings (PCA and t-SNE) colored by true/predicted ----------
# PCA (fast)
pca = PCA(n_components=2, random_state=RANDOM_STATE)
try:
    X_2d_pca = pca.fit_transform(X_test_arr)
    df_plot = pd.DataFrame({
        'pc1': X_2d_pca[:,0],
        'pc2': X_2d_pca[:,1],
        'true': y_test_arr,
        'pred': y_pred
    })
    # True labels
    plt.figure(figsize=(6,5))
    sns.scatterplot(data=df_plot, x='pc1', y='pc2', hue='true', style='true', s=40, palette='deep')
    plt.title('PCA 2D - True labels')
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, "svm_pca_true_labels.png"), dpi=PLOT_DPI)
    plt.close()

    # Predicted labels
    plt.figure(figsize=(6,5))
    sns.scatterplot(data=df_plot, x='pc1', y='pc2', hue='pred', style='pred', s=40, palette='deep')
    plt.title('PCA 2D - Predicted labels')
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, "svm_pca_pred_labels.png"), dpi=PLOT_DPI)
    plt.close()
except Exception as e:
    print("PCA embedding failed:", e)

In [None]:
# t-SNE (slower; sample if too big)
tsne_n = min(2000, X_test_arr.shape[0])  # cap samples for speed
if X_test_arr.shape[0] > tsne_n:
    sample_idx = np.random.RandomState(RANDOM_STATE).choice(X_test_arr.shape[0], size=tsne_n, replace=False)
    X_sample = X_test_arr[sample_idx]
    y_sample = y_test_arr[sample_idx]
    y_pred_sample = y_pred[sample_idx]
else:
    X_sample = X_test_arr
    y_sample = y_test_arr
    y_pred_sample = y_pred

In [None]:
try:
    tsne = TSNE(n_components=2, perplexity=30, random_state=RANDOM_STATE, init='pca')
    X_2d_tsne = tsne.fit_transform(X_sample)
    df_tsne = pd.DataFrame({
        'tsne1': X_2d_tsne[:,0],
        'tsne2': X_2d_tsne[:,1],
        'true': y_sample,
        'pred': y_pred_sample
    })
    plt.figure(figsize=(6,5))
    sns.scatterplot(data=df_tsne, x='tsne1', y='tsne2', hue='true', s=30, palette='tab10')
    plt.title('t-SNE (true labels)')
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, "svm_tsne_true_labels.png"), dpi=PLOT_DPI)
    plt.close()

    plt.figure(figsize=(6,5))
    sns.scatterplot(data=df_tsne, x='tsne1', y='tsne2', hue='pred', s=30, palette='tab10')
    plt.title('t-SNE (predicted labels)')
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, "svm_tsne_pred_labels.png"), dpi=PLOT_DPI)
    plt.close()
except Exception as e:
    print("t-SNE embedding failed or too slow:", e)

In [None]:
print("All visualizations saved to:", OUTPUT_DIR)

In [None]:
# svm_tuning.py
"""
SVM hyperparameter tuning and evaluation script.

Assumptions:
- You have X (features) and y (labels). If not, uncomment the example using sklearn's iris dataset.
- Recommended: run inside a virtualenv with scikit-learn installed (>=0.24).
"""

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (
    train_test_split, StratifiedKFold,
    GridSearchCV, RandomizedSearchCV
)
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import joblib
import time

---------- USER DATA LOADING ----------
Option A: if you already have X and y (NumPy arrays or pandas objects), import them instead of running Option B, e.g. `from your_data_module import X, y`.

In [None]:
# Option B: quick example dataset (iris).
# NOTE: these lines are active, so X and y are (re)assigned to the iris data
# here, replacing any X / y defined by earlier cells. Comment this cell out
# when tuning on your own data.
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [None]:
# ---------- TRAIN/TEST SPLIT ----------
# Hold out 20% of the rows as a test set. stratify=y keeps the class
# proportions the same in both parts, and the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# ---------- COMMON PIPELINE ----------
# Standardise the features, then fit an SVC. Because the scaler lives inside
# the pipeline it is re-fitted on whatever data the pipeline is fitted on.
# The step names ("scaler", "svc") are the prefixes used by the "svc__*" keys
# in the hyperparameter search spaces.
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("svc", SVC(probability=False))
])

In [None]:
# ---------- CROSS-VALIDATION SETUP ----------
# 5 stratified folds, shuffled with a fixed seed so that every search that
# receives this splitter scores its candidates on the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# ---------- PARAM GRID FOR GRIDSEARCH ----------
# We separate grids by kernel to keep combinatorial explosion manageable.
# Each dict is expanded independently, so a kernel is only combined with the
# parameters listed next to it. Candidate counts: 10 + 48 + 54 + 36 = 148.
param_grid = [
    {
        # Linear kernel: 5 C values x 2 class weightings = 10 candidates.
        "svc__kernel": ["linear"],
        "svc__C": [0.01, 0.1, 1, 10, 100],
        "svc__class_weight": [None, "balanced"],
    },
    {
        # RBF kernel: 4 C x 6 gamma x 2 class weightings = 48 candidates.
        "svc__kernel": ["rbf"],
        "svc__C": [0.1, 1, 10, 100],
        "svc__gamma": ["scale", "auto", 0.001, 0.01, 0.1, 1],
        "svc__class_weight": [None, "balanced"],
    },
    {
        # Polynomial kernel: 3 C x 3 degree x 2 gamma x 3 coef0 = 54 candidates.
        "svc__kernel": ["poly"],
        "svc__C": [0.1, 1, 10],
        "svc__degree": [2, 3, 4],
        "svc__gamma": ["scale", "auto"],
        "svc__coef0": [0.0, 0.1, 0.5],
        "svc__class_weight": [None],
    },
    {
        # Sigmoid kernel: 3 C x 4 gamma x 3 coef0 = 36 candidates.
        "svc__kernel": ["sigmoid"],
        "svc__C": [0.1, 1, 10],
        "svc__gamma": ["scale", "auto", 0.01, 0.1],
        "svc__coef0": [0.0, 0.1, 0.5],
        "svc__class_weight": [None],
    }
]

In [None]:
# ---------- RANDOMIZED GRID (broader search) ----------
# With many features or a large dataset, a randomized search is faster than
# an exhaustive grid. Every entry is a plain list of candidate values.
param_dist = {
    "svc__kernel": ["rbf", "linear", "poly", "sigmoid"],
    "svc__C": [10**k for k in np.linspace(-3, 3, 11)],      # 11 log-spaced values, 1e-3 .. 1e3
    "svc__gamma": ["scale", "auto"]+ [10**k for k in np.linspace(-4, 0, 5)], # two heuristics plus 1e-4 .. 1
    "svc__degree": [2, 3, 4],    # only relevant for poly
    "svc__coef0": [0.0, 0.1, 0.5],
    "svc__class_weight": [None, "balanced"],
}

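Note: param_dist above includes keys that do not apply to every kernel (e.g., degree is only used by the poly kernel). RandomizedSearchCV still samples a value for every key; it is SVC that ignores the parameters its kernel does not use.
That's okay, although some sampled candidates will therefore be effectively identical models.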

In [None]:
# ---------- FUNCTIONS FOR TUNING & EVAL ----------
def run_grid_search(pipe, param_grid, X_train, y_train, cv, scoring="f1_macro", n_jobs=-1):
    """Run an exhaustive cross-validated search over ``param_grid`` and report the winner.

    Args:
        pipe: Estimator (here a scikit-learn pipeline) to tune.
        param_grid: Dict, or list of dicts, mapping parameter names to the
            values to try.
        X_train: Training features.
        y_train: Training labels.
        cv: Cross-validation splitter (or fold count) handed to GridSearchCV.
        scoring: Metric used to rank the candidates.
        n_jobs: Number of parallel workers; -1 uses all available cores.

    Returns:
        The fitted ``GridSearchCV`` object. Because ``refit=True``, its
        ``best_estimator_`` has been re-trained on the full training data.
    """
    print("Starting GridSearchCV...")
    # perf_counter() is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    started = time.perf_counter()
    grid = GridSearchCV(
        pipe, param_grid, cv=cv, scoring=scoring,
        verbose=2, n_jobs=n_jobs, refit=True
    )
    grid.fit(X_train, y_train)
    elapsed = time.perf_counter() - started
    print(f"Grid search done in {elapsed:.1f} s")
    print("Best score (CV):", grid.best_score_)
    print("Best params:", grid.best_params_)
    return grid

In [None]:
def run_random_search(pipe, param_dist, X_train, y_train, cv, n_iter=40, scoring="f1_macro", n_jobs=-1):
    """Run a cross-validated randomized search over ``param_dist`` and report the winner.

    Args:
        pipe: Estimator (here a scikit-learn pipeline) to tune.
        param_dist: Dict mapping parameter names to lists (or distributions)
            to sample candidate values from.
        X_train: Training features.
        y_train: Training labels.
        cv: Cross-validation splitter (or fold count) handed to
            RandomizedSearchCV.
        n_iter: Number of parameter settings to sample.
        scoring: Metric used to rank the candidates.
        n_jobs: Number of parallel workers; -1 uses all available cores.

    Returns:
        The fitted ``RandomizedSearchCV`` object. Because ``refit=True``, its
        ``best_estimator_`` has been re-trained on the full training data.
        Sampling is seeded (``random_state=42``) so repeated runs draw the
        same candidates.
    """
    print("Starting RandomizedSearchCV...")
    # perf_counter() is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    started = time.perf_counter()
    rand = RandomizedSearchCV(
        pipe, param_dist, n_iter=n_iter, cv=cv, scoring=scoring,
        verbose=2, n_jobs=n_jobs, random_state=42, refit=True
    )
    rand.fit(X_train, y_train)
    elapsed = time.perf_counter() - started
    print(f"Randomized search done in {elapsed:.1f} s")
    print("Best score (CV):", rand.best_score_)
    print("Best params:", rand.best_params_)
    return rand

In [None]:
def evaluate_model(model, X_test, y_test, target_names=None, save_cm_fig=True, cm_filename="confusion_matrix.png"):
    """Print a classification report for ``model`` and draw its confusion matrix.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Held-out features.
        y_test: Held-out true labels.
        target_names: Optional display names for the classes, used both in
            the text report and as the confusion-matrix tick labels.
        save_cm_fig: When true, write the confusion-matrix figure to
            ``cm_filename`` before showing it.
        cm_filename: Destination path for the saved figure.

    Returns:
        None. The results are printed and plotted.
    """
    print("\n--- Test set evaluation ---")
    predictions = model.predict(X_test)
    report_text = classification_report(y_test, predictions, target_names=target_names)
    print(report_text)

    matrix_display = ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_test, predictions),
        display_labels=target_names,
    )
    # A fresh figure becomes the current one, so the pyplot calls below act on it.
    _, axis = plt.subplots(figsize=(6, 6))
    matrix_display.plot(ax=axis, cmap=plt.cm.Blues, colorbar=False)
    plt.title("Confusion Matrix (test set)")

    if save_cm_fig:
        plt.savefig(cm_filename, bbox_inches="tight")
        print(f"Saved confusion matrix to {cm_filename}")
    plt.show()

In [None]:
# ---------- RUN TUNING (pick one or both) ----------
# Runs both searches, keeps the grid-search winner when there is one and the
# randomized-search winner otherwise, evaluates it on the held-out test set
# and saves it to disk.
if __name__ == "__main__":
    # Remember why a search failed so the final error can point at the cause.
    search_errors = []

    # 1) Quick Randomized Search (broad)
    try:
        rand_search = run_random_search(
            pipe,
            param_dist,
            X_train, y_train,
            cv=cv,
            n_iter=30,      # reduce/increase based on compute budget
            scoring="f1_macro",
            n_jobs=-1
        )
    except Exception as e:
        print("Randomized search failed (likely due to param_dist construction).")
        print(e)
        search_errors.append(e)
        rand_search = None

    # 2) More exhaustive Grid Search (fine-tuning)
    # Guarded like the randomized search: without this, a grid-search failure
    # aborted the run and the fallback branches below could never execute.
    try:
        grid_search = run_grid_search(
            pipe,
            param_grid,
            X_train, y_train,
            cv=cv,
            scoring="f1_macro",
            n_jobs=-1
        )
    except Exception as e:
        print("Grid search failed; falling back to the randomized search result if available.")
        print(e)
        search_errors.append(e)
        grid_search = None

    # Pick best model (prefer grid_search best if available)
    best_model = None
    if grid_search is not None:
        best_model = grid_search.best_estimator_
        print("\nSelected model from GridSearch.")
    elif rand_search is not None:
        best_model = rand_search.best_estimator_
        print("\nSelected model from RandomizedSearch.")
    else:
        # Both searches failed, so search_errors is non-empty; chain the most
        # recent failure so the traceback still shows what went wrong.
        raise RuntimeError("No tuned model available. Both searches failed.") from search_errors[-1]

    # Evaluate on test set
    class_names = [str(c) for c in np.unique(y)]
    evaluate_model(best_model, X_test, y_test, target_names=class_names)

    # Save best model
    joblib.dump(best_model, "best_svm_model.joblib")
    print("Saved best model to best_svm_model.joblib")