## Data setup

<div class="alert alert-block alert-warning">
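<b>⚠️</b> Don't forget to download the Kaggle API token (<code>kaggle.json</code>) associated with your account from the <a href="https://www.kaggle.com/settings">Settings page</a> and place it in this notebook's working directory, which the next cell sets as <code>KAGGLE_CONFIG_DIR</code>.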
</div>

In [None]:
import os
from pathlib import Path

os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()

data_dir = Path("data")
dataset_zip = data_dir / "playground-series-s5e2.zip"
dataset_train = data_dir / "train.csv"
dataset_extra = data_dir / "training_extra.csv"

if not dataset_train.exists() or not dataset_extra.exists():
    if not dataset_zip.exists():
        print("Dataset zip not found. Downloading from Kaggle...")
        !kaggle competitions download -c playground-series-s5e2
        print("Download complete.")
    else:
        print("Dataset zip already exists.")

    print("Unzipping the dataset...")
    !unzip -o playground-series-s5e2.zip -d data
    !rm playground-series-s5e2.zip
else:
    print("Dataset already exists. Skipping download and extraction.")


## Load data

In [None]:
import pandas as pd

# Read the main training file located by the setup cell.
data = pd.read_csv(dataset_train)

# info() prints its report itself; the bare head() below is the cell's last
# expression, so the notebook renders it as a rich table instead of plain text.
data.info()
data.head()

## Exploratory Data Analysis

Missing Data

In [None]:
# Count the nulls once and reuse the result for the absolute and relative columns.
null_counts = data.isnull().sum()
row_count = len(data)

missing_values = pd.DataFrame({
    'Column': data.columns,
    'Missing Train Values': null_counts.values,
    'Percentage of Missing Train Values': null_counts.values / row_count * 100,
})

# Leave the row identifier and the target out of the report.
is_reported = ~missing_values['Column'].isin(['id', 'Price'])
missing_values = missing_values[is_reported]
missing_values

Duplicated Data

In [None]:
# Compare rows without the `id` column, then count rows that repeat an earlier one.
rows_without_id = data.drop(columns='id')
data_duplicates = rows_without_id.duplicated().sum()
print(f"Data duplicates: {data_duplicates}")

Data Description

In [None]:
# Summary statistics of the training frame; as the cell's last expression the
# resulting table is rendered as the cell output.
data.describe()

Distribution of Data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Columns handled as categories by the plotting and imputation code.
categorical_columns = [
    'Brand',
    'Material',
    'Size',
    'Laptop Compartment',
    'Waterproof',
    'Style',
    'Color',
]

# Columns handled as numbers; note that this list includes 'Price'.
numerical_columns = [
    'Compartments',
    'Weight Capacity (kg)',
    'Price',
]

def plot_categorical_columns(data, columns):
    """Draw one count plot per column, with bars ordered from most to least frequent.

    Parameters
    ----------
    data
        Table indexable by column name whose columns support ``value_counts()``.
    columns
        Iterable of column names to plot; one figure is shown per name.
    """
    for column in columns:
        plt.figure(figsize=(10, 6))
        # Pass the frame and the column name as keywords: the first positional
        # slot of countplot meant `x` in older seaborn releases and `data` in
        # newer ones, so a positional Series is interpreted differently
        # depending on the installed version.
        sns.countplot(
            data=data,
            x=column,
            order=data[column].value_counts().index,
        )
        plt.title(f"Distribution of {column}")
        plt.show()

def plot_numeric_columns(data, columns):
    """Show a 30-bin histogram for each requested column, one figure per column.

    Parameters
    ----------
    data
        Table indexable by column name.
    columns
        Iterable of column names to plot.
    """
    for name in columns:
        plt.figure(figsize=(10, 6))
        values = data[name]
        sns.histplot(values, bins=30)
        plt.title(f"Distribution of {name}")
        plt.show()

# Count plots for the categorical columns first, then histograms for the numeric ones.
plot_categorical_columns(data, categorical_columns)
plot_numeric_columns(data, numerical_columns)


Data Correlation

In [None]:
%matplotlib inline

for i in range(len(categorical_columns)):
    for j in range(i + 1, len(categorical_columns)):
        plt.figure(figsize=(10, 6))
        sns.countplot(x=categorical_columns[i], hue=categorical_columns[j], data=data)
        plt.title(f"Countplot of {categorical_columns[i]} vs {categorical_columns[j]}")
        plt.show()


Feature Engineering

In [None]:
# Engineered columns created by encode_data(). 'IMaterial' is listed here because
# encode_data() builds it alongside the other integer-coded columns; leaving it
# out would silently drop Material from the model inputs.
new_columns = [
    'IBrand',
    'IMaterial',
    'ISize',
    'Has Laptop Compartment',
    'Is Waterproof',
    'IStyle',
    'IColor',
]

# Full model input: the two raw numeric columns plus every engineered column.
final_columns = ['Compartments', 'Weight Capacity (kg)'] + new_columns

def impute_data(df, categorical=None, numerical=None):
    """Fill missing values in place: mode for categorical columns, median for numeric ones.

    Parameters
    ----------
    df
        DataFrame to modify in place.
    categorical
        Column names to fill with their most frequent value. Defaults to the
        module-level ``categorical_columns``.
    numerical
        Column names to fill with their median. Defaults to the module-level
        ``numerical_columns``.

    Notes
    -----
    Listed columns that are absent from ``df`` are skipped, so the same call
    works on a frame that lacks some of them (for example one without
    ``'Price'``). A categorical column with no observed value at all has no
    mode and is left untouched.
    """
    cat_cols = categorical_columns if categorical is None else categorical
    num_cols = numerical_columns if numerical is None else numerical

    for col in cat_cols:
        if col not in df.columns:
            continue
        modes = df[col].mode()
        # mode() is empty when every value is missing; indexing it would raise.
        if modes.empty:
            continue
        df[col] = df[col].fillna(modes.iloc[0])

    for col in num_cols:
        if col not in df.columns:
            continue
        df[col] = df[col].fillna(df[col].median())

def encode_data(df):
    """Add numeric versions of the categorical columns to ``df`` in place.

    * Brand, Material, Style and Color each get an ``I<name>`` column holding
      the integer category code of the value.
    * Size becomes ``ISize`` with Small/Medium/Large mapped to 1/2/3.
    * The two Yes/No columns become 1/0 flags.

    The original columns are kept; nothing is returned.
    """
    size_rank = {'Small': 1, 'Medium': 2, 'Large': 3}
    yes_no_flag = {'Yes': 1, 'No': 0}

    for source in ('Brand', 'Material', 'Style', 'Color'):
        as_category = df[source].astype('category')
        df[f'I{source}'] = as_category.cat.codes

    df['ISize'] = df['Size'].map(size_rank)

    df['Has Laptop Compartment'] = df['Laptop Compartment'].map(yes_no_flag)
    df['Is Waterproof'] = df['Waterproof'].map(yes_no_flag)

# Both helpers assign columns on `data` itself rather than returning a new frame:
# missing values are filled first, then the encoded columns are added.
impute_data(data)
encode_data(data)

## Define features and target

In [None]:
# Model inputs: the columns listed in final_columns.
X = data[final_columns]
# Value to predict.
y = data['Price']

# display() uses the notebook's rich rendering, so the feature frame appears as a
# table instead of the plain-text dump that print() produces.
display(X.head(), y.head())

In [None]:
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
import math

# Depth-limited decision tree with a fixed random_state.
model = DecisionTreeRegressor(max_depth=5, random_state=1)

# With greater_is_better=False the scorer reports negated errors, so the sign is
# flipped back to obtain one positive MSE value per fold.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
fold_scores = cross_val_score(model, X, y, scoring=mse_scorer, cv=10)
mse = -fold_scores

# Square roots are taken after aggregating the per-fold MSE values.
avg_rmse = math.sqrt(np.mean(mse))
min_rmse = math.sqrt(np.min(mse))

print(f"Avg Root Mean Squared Error: {avg_rmse:.2f}")
print(f"Min Root Mean Squared Error: {min_rmse:.2f}")

In [None]:
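# NOTE: `model` is never fitted in the cells above (cross_val_score fits copies of
# the estimator), so call `model.fit(X, y)` before enabling this visualization.

# import matplotlib.pyplot as plt
# from sklearn.tree import plot_tree

# plt.figure(figsize=(100, 20))
# plot_tree(model, filled=True, feature_names=X.columns, fontsize=10, max_depth=5)
# plt.title("Decision Tree Visualization")
# plt.show()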
