# Backpack Price Category Analysis

This notebook demonstrates how to perform exploratory data analysis, handle missing values, bin prices into categories, and prepare features for supervised classification. It then trains and evaluates classifiers that predict the price category.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PriceClassification import price_classification
from ExploratoryDA import run_exploratory_da
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import numpy as np
from IPython.display import display

## 1. Data Loading
Load the datasets and take a first look at their structure, including the distribution of the target variable and of the categorical features.

In [None]:
# Read the three raw CSV files into separate frames
train = pd.read_csv('Data/train.csv')
train_extra = pd.read_csv('Data/training_extra.csv')
test = pd.read_csv('Data/test.csv')

# Report the (rows, columns) size of every frame
for shape_report in (
    f"Train shape: {train.shape}",
    f"Training extra shape: {train_extra.shape}",
    f"Test shape: {test.shape}",
):
    print(shape_report)

# Preview the first rows of the main training frame
print("\nTrain sample:")
display(train.head(n=5))
# print("\nTraining Extra sample:")
# display(train_extra.head())

# Show value counts for all categorical features (before cleaning)
def show_categorical_counts(df, name):
    """Print the value counts of every object-dtype column of ``df``.

    Args:
        df: DataFrame to inspect.
        name: Label used in the printed header line.

    Returns:
        None. Everything is written to stdout: one header line, then for each
        object-dtype column its ``value_counts()`` and its number of unique
        values.
    """
    # Only columns stored with the generic ``object`` dtype are reported.
    object_columns = list(filter(lambda label: df[label].dtype == 'object', df.columns))

    print(f"\n{name} categorical value counts (before cleaning):")
    for col in object_columns:
        column_values = df[col]
        print(f"\n{col} value counts:")
        print(column_values.value_counts())
        print(f"Unique categories: {column_values.nunique()}")

# Summarise the raw (not yet cleaned) categorical columns of the main training frame
show_categorical_counts(df=train, name="Train")
# show_categorical_counts(train_extra, "Training Extra")

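## 2. Cleaning and Imputation
Remove rows with more than one missing value (for train and training_extra; the training_extra calls are currently commented out), then fill the remaining missing values: categorical columns with the column mode, numeric columns with the column median. Show how many rows are dropped and how many values are filled.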

In [None]:
# Cleaning and Imputation
counter = 0  # number of clean_and_impute calls so far; only used to label the log lines


def _uses_mode_imputation(dtype):
    """Return True for dtypes that hold labels (object, string, category) rather than numbers."""
    return (
        pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
        or isinstance(dtype, pd.CategoricalDtype)
    )


def clean_and_impute(df, drop_rows_with_many_missing=True, name="dataset"):
    """Return a cleaned copy of ``df`` with its missing values imputed.

    Label-like columns (object, string or category dtype) are filled with the
    column mode; every other column is filled with the column median. A column
    that has no usable fill value (for example, it is entirely missing) is
    left untouched and is not counted as filled.

    Args:
        df: Input DataFrame. It is never modified.
        drop_rows_with_many_missing: When True, rows with more than one
            missing value are removed before imputation.
        name: Label used in the progress messages.

    Returns:
        A new DataFrame holding the cleaned data. Surviving rows keep their
        original index labels.

    Side effects:
        Increments the module-level ``counter`` and prints progress messages.
    """
    global counter
    counter += 1
    print(f"[Call #{counter}] Processing {name}...")

    if drop_rows_with_many_missing:
        missing_per_row = df.isnull().sum(axis=1)
        n_dropped = int((missing_per_row > 1).sum())
        print(f"{name}: Dropping {n_dropped} rows with >1 missing value.")
        # Copy after filtering so later column assignments act on an
        # independent frame (no SettingWithCopyWarning, no double copy).
        df_clean = df.loc[missing_per_row <= 1].copy()
    else:
        df_clean = df.copy()

    n_filled = 0
    for col in df_clean.columns:
        n_missing = int(df_clean[col].isnull().sum())
        if n_missing == 0:
            continue

        if _uses_mode_imputation(df_clean[col].dtype):
            mode_candidates = df_clean[col].mode(dropna=True)
            if mode_candidates.empty:
                continue  # column is entirely missing: nothing to fill with
            fill_value = mode_candidates.iloc[0]
        else:
            fill_value = df_clean[col].median(skipna=True)
            if pd.isna(fill_value):
                continue  # median undefined (all values missing)

        df_clean[col] = df_clean[col].fillna(fill_value)
        # Count only the cells that were really filled.
        n_filled += n_missing - int(df_clean[col].isnull().sum())

    print(f"{name}: Filled {n_filled} missing values.")
    print(f"{name}: Final shape after cleaning: {df_clean.shape}")
    return df_clean

# Clean the main training frame; rows with more than one missing value are dropped first
train_clean = clean_and_impute(df=train, drop_rows_with_many_missing=True, name="Train")
# train_extra_clean = clean_and_impute(train_extra, drop_rows_with_many_missing=True, name="Training Extra")

print("\nTrain (cleaned) sample:")
display(train_clean.head(n=5))
# print("\nTraining Extra (cleaned) sample:")
# display(train_extra_clean.head())

# Columns treated as categorical: object dtype or pandas 'category' dtype
categorical_columns = [
    col
    for col in train_clean.columns
    if train_clean[col].dtype.name == 'category' or train_clean[col].dtype == 'object'
]

# Re-print the value counts so they can be compared with the pre-cleaning output
print("\nTrain (cleaned) categorical value counts:")
for col in categorical_columns:
    cleaned_values = train_clean[col]
    print(f"\n{col} value counts:")
    print(cleaned_values.value_counts())
    print(f"Unique categories: {cleaned_values.nunique()}")

# categorical_columns_extra = [col for col in train_extra_clean.columns if train_extra_clean[col].dtype == 'object' or train_extra_clean[col].dtype.name == 'category']
# print("\nTraining Extra (cleaned) categorical value counts:")
# for col in categorical_columns_extra:
#     print(f"\n{col} value counts:")
#     print(train_extra_clean[col].value_counts())
#     print(f"Unique categories: {train_extra_clean[col].nunique()}")

In [None]:
# Write the cleaned frame to disk: price_classification is handed a file path, not a frame
cleaned_csv_path = 'Data/train_cleaned.csv'
train_clean.to_csv(cleaned_csv_path, index=False)
# train_extra_clean.to_csv('Data/training_extra_cleaned.csv', index=False)

# Bin the price of the main training set into 5 categories.
# NOTE(review): price_classification is a project helper; it presumably returns
# one frame per input entry, each with a 'Price_Category' column -- confirm.
binned_frames = price_classification([{'file_path': cleaned_csv_path}], bins=5)
train_binned = binned_frames[0]
# train_extra_binned = price_classification([{'file_path': 'Data/training_extra_cleaned.csv'}], bins=5)[0]

# Persist the binned frame for later use
train_binned.to_csv('Data/train_cleaned_with_categories.csv', index=False)
# train_extra_binned.to_csv('Data/training_extra_cleaned_with_categories.csv', index=False)

# Class sizes of the new target
print(train_binned['Price_Category'].value_counts())
# print(train_extra_binned['Price_Category'].value_counts())

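## 4. Price Binning and Distribution Plots
The price was binned into five categories in the previous cell (for the main training set; the training_extra lines are commented out). Here we plot the distribution of the resulting price categories, which helps visualize class balance for classification.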

In [None]:
# Plot the class balance of the binned price categories
import matplotlib.pyplot as plt
import seaborn as sns

# Explicit figure/axes pair instead of the implicit pyplot state machine
fig, ax = plt.subplots(figsize=(7, 4))
category_order = sorted(train_binned['Price_Category'].unique())
sns.countplot(data=train_binned, x='Price_Category', order=category_order, ax=ax)
ax.set_title('Train: Price Category Distribution')
ax.set_xlabel('Price Category')
ax.set_ylabel('Count')
plt.show()

# plt.figure(figsize=(7,4))
# sns.countplot(x='Price_Category', data=train_extra_binned, order=sorted(train_extra_binned['Price_Category'].unique()))
# plt.title('Training Extra: Price Category Distribution')
# plt.xlabel('Price Category')
# plt.ylabel('Count')
# plt.show()

In [None]:
# Select which training set to use:
# Uncomment only one of the following two lines:
X_train = train_binned.copy()  # Use main train set
# X_train = train_extra_binned.copy()  # Use training_extra set

# Column roles
target_col = 'Price_Category'
categorical_cols = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']
numerical_cols = ['Compartments', 'Weight Capacity (kg)']

# Feature matrix (categorical columns first, then numerical) and target vector
X = X_train.loc[:, categorical_cols + numerical_cols]
y = X_train[target_col]

# One-hot encode the categorical columns and standardise the numerical ones;
# any column not listed is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop=None, sparse_output=False), categorical_cols),
        ('num', StandardScaler(), numerical_cols),
    ],
    remainder='drop',
)
X_processed = preprocessor.fit_transform(X)

# Rebuild readable column names in the same order the transformer emits them
cat_features = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
num_features = numerical_cols
feature_names = [*cat_features, *num_features]
X_processed_df = pd.DataFrame(X_processed, columns=feature_names)

print('Processed feature shape:', X_processed_df.shape)

## 5. Model Training, Evaluation, and Comparison

We now train and evaluate supervised learning models using only the cleaned and binned train.csv. Several models are defined in the code, but only the SVM is currently enabled; the others are commented out. We compare results using an 80/20 split, an 80/10/10 split, and 5-fold cross-validation. Metrics include accuracy, weighted precision, recall and F1-score, the confusion matrix, and fit/prediction timing. Results are summarized in tables and plots.

In [None]:
import time
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
import matplotlib.pyplot as plt

# Column roles used to build the model inputs
target_col = 'Price_Category'
categorical_cols = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']
numerical_cols = ['Compartments', 'Weight Capacity (kg)']

# Features (categorical first, then numerical) and target taken from the binned training frame
X = train_binned[[*categorical_cols, *numerical_cols]]
y = train_binned[target_col]

# One-hot encode the categorical columns, standardise the numerical ones, drop the rest
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop=None, sparse_output=False), categorical_cols),
        ('num', StandardScaler(), numerical_cols),
    ],
    remainder='drop',
)

X_processed = preprocessor.fit_transform(X)

# Column labels in the order the transformer emits them
cat_features = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
num_features = numerical_cols
feature_names = [*cat_features, *num_features]
X_processed_df = pd.DataFrame(data=X_processed, columns=feature_names)

# Registry of models to evaluate; uncomment an entry to include it in the comparison
models = {
    # 'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    # 'Decision Tree': DecisionTreeClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    # 'k-NN': KNeighborsClassifier()
}

# One dict per (model, split) evaluation is appended here
results = []

# Evaluation helper shared by the hold-out splits below
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name, split_name):
    """Fit ``model``, score it on held-out data and record the outcome.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place.
        X_train, y_train: Features and labels used for fitting.
        X_test, y_test: Features and labels used for scoring.
        model_name: Value stored under ``'Model'`` and shown in the printout.
        split_name: Value stored under ``'Split'`` and shown in the printout.

    Returns:
        Tuple ``(y_pred, cm)``: the predictions for ``X_test`` and the
        confusion matrix of ``y_test`` against them.

    Side effects:
        Appends one summary dict to the module-level ``results`` list and
        prints the confusion matrix.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() is wall-clock time and can jump.
    fit_started = time.perf_counter()
    model.fit(X_train, y_train)
    fit_time = time.perf_counter() - fit_started

    predict_started = time.perf_counter()
    y_pred = model.predict(X_test)
    pred_time = time.perf_counter() - predict_started

    # Weighted averaging accounts for class imbalance; zero_division=0 scores a
    # class that was never predicted as 0 instead of emitting a warning.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    cm = confusion_matrix(y_test, y_pred)

    results.append({
        'Model': model_name,
        'Split': split_name,
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1': f1,
        'Fit Time (s)': fit_time,
        'Pred Time (s)': pred_time
    })
    print(f"\n{model_name} ({split_name}) Confusion Matrix:")
    print(cm)
    return y_pred, cm

# Local imports: only this evaluation block needs them.
from sklearn.base import clone
from sklearn.pipeline import Pipeline


def _leakage_free(estimator):
    """Chain an unfitted copy of ``preprocessor`` in front of ``estimator``.

    Fitting the returned pipeline learns the one-hot categories and the
    scaling statistics from the training partition only, so nothing from the
    validation/test rows leaks into the features. ``estimator`` itself is not
    copied: it is fitted in place.
    """
    encoder_and_scaler = clone(preprocessor)
    # A category that appears only in a held-out partition is encoded as
    # all zeros instead of raising an error.
    encoder_and_scaler.set_params(cat__handle_unknown='ignore')
    return Pipeline([('prep', encoder_and_scaler), ('model', estimator)])


# All splits below operate on the RAW features ``X``; preprocessing is fitted
# inside each pipeline on the training partition only. Same seeds and
# stratification as before, so the row partitions are unchanged.

# 80/20 split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
for name, model in models.items():
    evaluate_model(_leakage_free(model), X_tr, X_te, y_tr, y_te, name, '80/20')

# 80/10/10 split (train/val/test)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1111, random_state=42, stratify=y_temp)  # 0.1111*0.9 ≈ 0.1
for name, model in models.items():
    print(f"\n{name} (80/10/10 split)")
    # evaluate_model refits on every call, so the model is trained once per evaluation.
    evaluate_model(_leakage_free(model), X_train, X_val, y_train, y_val, name, '80/10/10 (val)')
    evaluate_model(_leakage_free(model), X_train, X_test, y_train, y_test, name, '80/10/10 (test)')

# 5-fold cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for name, model in models.items():
    pipeline = _leakage_free(model)
    accs, precs, recs, f1s, fit_times, pred_times = [], [], [], [], [], []
    for train_idx, test_idx in kf.split(X, y):
        X_tr, X_te = X.iloc[train_idx], X.iloc[test_idx]
        y_tr, y_te = y.iloc[train_idx], y.iloc[test_idx]
        # Timings include the preprocessing step, which now runs per fold.
        start = time.perf_counter()
        pipeline.fit(X_tr, y_tr)
        fit_time = time.perf_counter() - start
        start = time.perf_counter()
        y_pred = pipeline.predict(X_te)
        pred_time = time.perf_counter() - start
        accs.append(accuracy_score(y_te, y_pred))
        precs.append(precision_score(y_te, y_pred, average='weighted', zero_division=0))
        recs.append(recall_score(y_te, y_pred, average='weighted', zero_division=0))
        f1s.append(f1_score(y_te, y_pred, average='weighted', zero_division=0))
        fit_times.append(fit_time)
        pred_times.append(pred_time)
    # One summary row per model: metrics and timings averaged over the folds
    results.append({
        'Model': name,
        'Split': '5-fold CV',
        'Accuracy': np.mean(accs),
        'Precision': np.mean(precs),
        'Recall': np.mean(recs),
        'F1': np.mean(f1s),
        'Fit Time (s)': np.mean(fit_times),
        'Pred Time (s)': np.mean(pred_times)
    })

# Collect every recorded evaluation into one table
df_results = pd.DataFrame(results)
display(df_results)

# One grouped bar chart per metric: models on the x-axis, one bar per split
for metric_column, chart_title, axis_label in (
    ('Accuracy', 'Model Accuracy by Split', 'Accuracy'),
    ('F1', 'Model F1-score by Split', 'F1-score'),
):
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=df_results, x='Model', y=metric_column, hue='Split', ax=ax)
    ax.set_title(chart_title)
    ax.set_ylabel(axis_label)
    plt.show()