## Data setup

<div class="alert alert-block alert-warning">
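<b>⚠️</b> Don't forget to download the Kaggle API token (<code>kaggle.json</code>) associated with your account from the <a href="https://www.kaggle.com/settings">Settings page</a> and place it in this notebook's working directory — the next cell sets <code>KAGGLE_CONFIG_DIR</code> to that directory.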
</div>

In [None]:
import os
from pathlib import Path

os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()

!kaggle competitions download -c playground-series-s5e2
data_dir = Path("data")
data_dir.mkdir(parents=True, exist_ok=True)
!mv playground-series-s5e2.zip data/
os.chdir(data_dir)
!unzip -o playground-series-s5e2.zipnd-series-s5e2.zip
!rm playground-series-s5e2.zip
!ls -lh
os.chdir("..")


## Load data

In [None]:
import pandas as pd

# Read both competition splits in one pass.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)
# Stack the train rows on top of the test rows under a fresh 0..n-1 index.
data = pd.concat([train_data, test_data], ignore_index=True)

data.info()
print(data.head())

## Exploratory Data Analysis

Missing Data

In [None]:
# Count the nulls once per frame and reuse the counts for both the absolute and relative columns.
train_null_counts = train_data.isna().sum()
test_null_counts = test_data.isna().sum()

missing_values_train = pd.DataFrame({
    'Column': train_null_counts.index,
    'Missing Train Values': train_null_counts.values,
    'Percentage of Missing Train Values': train_null_counts.values / len(train_data) * 100,
})

missing_values_test = pd.DataFrame({
    'Column': test_null_counts.index,
    'Missing Test Values': test_null_counts.values,
    'Percentage of Missing Test Values': test_null_counts.values / len(test_data) * 100,
})

# Outer join keeps columns that exist in only one of the two frames.
merged_missing_values = missing_values_train.merge(missing_values_test, on='Column', how='outer')
# Leave 'id' and 'Price' out of the report.
excluded_from_report = merged_missing_values['Column'].isin(['id', 'Price'])
merged_missing_values = merged_missing_values[~excluded_from_report]
merged_missing_values

Duplicated Data

In [None]:
# Drop 'id' before comparing rows, so rows that differ only in their id count as duplicates.
train_data_duplicates = train_data.drop(columns='id').duplicated().sum()
test_data_duplicates = test_data.drop(columns='id').duplicated().sum()
print(f"Train data duplicates: {train_data_duplicates}")
print(f"Test data duplicates: {test_data_duplicates}")

Data Description

In [None]:
# Summary statistics for the combined train+test frame.
# Alternative kept for reference (one block of statistics per split, side by side):
# pd.concat([train_data.describe(), test_data.describe()], axis=1, keys=['Train', 'Test'])
data.describe()

Distribution of Data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def plot_text_columns(data, columns):
    """Draw one count plot per column, bars ordered from most to least frequent value.

    Args:
        data: DataFrame holding the columns to plot.
        columns: iterable of column names; an empty iterable draws nothing.

    Each column gets its own 10x6 figure, shown immediately.
    """
    for column in columns:
        plt.figure(figsize=(10, 6))
        # The Series must be passed as `x=`: in seaborn >= 0.12 the first positional
        # parameter of countplot is `data`, so a positional Series is no longer read
        # as the variable to count.
        sns.countplot(x=data[column], order=data[column].value_counts().index)
        plt.title(f"Distribution of {column}")
        plt.show()

def plot_numeric_columns(data, columns):
    """Show a 30-bin histogram for every requested column, each in its own 10x6 figure.

    Args:
        data: DataFrame holding the columns to plot.
        columns: iterable of column names; an empty iterable draws nothing.
    """
    figure_size = (10, 6)
    bin_count = 30
    for column in columns:
        plt.figure(figsize=figure_size)
        column_values = data[column]
        sns.histplot(column_values, bins=bin_count)
        plt.title(f"Distribution of {column}")
        plt.show()

# Split columns by kind with select_dtypes rather than comparing against 'object':
# text columns stored with pandas' dedicated string dtype are not 'object' and would
# otherwise land in the numeric list.
text_column_names = data.select_dtypes(exclude='number').columns.tolist()
numeric_column_names = [
    col for col in data.select_dtypes(include='number').columns if col != 'id'
]

plot_text_columns(data, text_column_names)
# 'id' is left out of the histograms.
plot_numeric_columns(data, numeric_column_names)

def plot_correlation_matrix(data):
    """Draw an annotated heatmap of the pairwise correlations between numeric columns.

    Args:
        data: DataFrame; non-numeric columns are ignored.

    The upper triangle (diagonal included) is masked, so each pair appears once.
    """
    plt.figure(figsize=(12, 10))
    # Restrict to numeric columns first: since pandas 2.0, DataFrame.corr() no longer
    # drops string columns on its own and raises when it meets one.
    corr = data.select_dtypes(include='number').corr()
    mask = np.triu(np.ones_like(corr, dtype=bool))
    sns.heatmap(corr, mask=mask, cmap='coolwarm', annot=True, fmt=".2f", square=True)
    plt.title("Correlation Matrix")
    plt.show()
# Render the correlation heatmap for the combined train+test frame.
plot_correlation_matrix(data)

Data Imputation

In [None]:
# Columns filled with their most frequent value.
categorical_columns = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']
# Columns filled with their median.
numerical_columns = ['Weight Capacity (kg)']

def impute_data(df):
    """Fill missing values in place, using statistics of `df` itself.

    Categorical columns receive their most frequent value (the first one when
    several tie); numerical columns receive their median.

    Args:
        df: DataFrame containing every column named in `categorical_columns`
            and `numerical_columns`. It is modified in place.

    Returns:
        None.
    """
    for col in categorical_columns:
        modes = df[col].mode()
        # A column with no observed value has no mode; leave it untouched instead of
        # failing on the lookup of the first mode.
        if modes.empty:
            continue
        df[col] = df[col].fillna(modes.iloc[0])
    for col in numerical_columns:
        df[col] = df[col].fillna(df[col].median())

# Each frame is imputed in place from its own statistics, train first.
for frame_to_impute in (train_data, test_data):
    impute_data(frame_to_impute)

Data Encoding

In [None]:
new_columns = ['IBrand', 'IMaterial', 'ISize', 'Has Laptop Compartment', 'Is Waterproof', 'IStyle', 'IColor']

# Unordered text columns that are turned into integer category codes.
nominal_columns = ['Brand', 'Material', 'Style', 'Color']

def encode_data(df, categories=None):
    """Add numeric encodings of the text columns to `df` in place.

    Args:
        df: DataFrame with the columns Brand, Material, Style, Color, Size,
            Laptop Compartment and Waterproof. It is modified in place.
        categories: optional mapping from column name to the full list of labels
            for that column. When given, a label's code is its position in that
            list, so the codes are the same for every frame encoded with the same
            mapping. When omitted, the codes are derived from the labels present
            in `df` alone.

    Adds 'I<column>' integer codes for each nominal column (-1 marks a missing or
    unlisted label), 'ISize' (Small=1, Medium=2, Large=3) and the 0/1 flags
    'Has Laptop Compartment' and 'Is Waterproof'. Labels outside the fixed mappings
    become NaN.

    Returns:
        None.
    """
    for col in nominal_columns:
        if categories is not None and col in categories:
            target_dtype = pd.CategoricalDtype(categories=categories[col])
        else:
            target_dtype = 'category'
        df['I' + col] = df[col].astype(target_dtype).cat.codes

    df['ISize'] = df['Size'].map({'Small': 1, 'Medium': 2, 'Large': 3})

    df['Has Laptop Compartment'] = df['Laptop Compartment'].map({'Yes': 1, 'No': 0})
    df['Is Waterproof'] = df['Waterproof'].map({'Yes': 1, 'No': 0})

# Build one label list per column from both frames. Encoding each frame on its own
# would give the same label different codes whenever the two label sets differ.
shared_categories = {
    col: sorted(pd.concat([train_data[col], test_data[col]]).dropna().unique())
    for col in nominal_columns
}

encode_data(train_data, shared_categories)
encode_data(test_data, shared_categories)

In [None]:
# A cell renders only its last bare expression, so display both previews explicitly;
# otherwise the train preview is silently dropped.
display(train_data.head(), test_data.head())

Data Correlation

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Use the integer-encoded columns: 'Brand' and 'Material' hold text, and pairplot only
# draws numeric variables, so listing them leaves 'Price' as the only panel.
numeric_cols = ['IBrand', 'IMaterial', 'Price']

# Drop rows with a missing value in any of the plotted columns.
df_numeric = train_data[numeric_cols].dropna()

sns.pairplot(df_numeric)


## Encode categorical features and define features and targets

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Model inputs: the encoded categorical columns plus the numeric ones.
input_columns = [
    'IBrand',
    'IMaterial',
    'ISize',
    'Compartments',
    'Has Laptop Compartment',
    'Is Waterproof',
    'IStyle',
    'IColor',
    'Weight Capacity (kg)',
]

# Target first, then the feature matrix restricted to the input columns.
y = train_data['Price']
X = train_data.loc[:, input_columns]

In [None]:
# Hold out a quarter of the labelled rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=4,
)

# fit() returns the estimator itself, so construction and training can be chained.
model = DecisionTreeRegressor(max_depth=5, random_state=32).fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {mse**0.5:.2f}")

In [None]:
from sklearn.tree import export_graphviz
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
# Very wide canvas so the bottom level of the depth-5 tree stays readable.
plt.figure(figsize=(100, 20))
# Pass the feature names as a plain list: scikit-learn releases before 1.3 reject a
# pandas Index for `feature_names`.
plot_tree(model, filled=True, feature_names=list(X.columns), fontsize=10, max_depth=5)
plt.title("Decision Tree Visualization")
plt.show()


In [None]:
# Distinct encoded brand codes present in the training data.
brand_values = train_data['IBrand'].unique()
# Mean price per brand code, highest first.
avg_price_per_brand = (
    train_data['Price']
    .groupby(train_data['IBrand'])
    .mean()
    .sort_values(ascending=False)
)
avg_price_per_brand
