# KaggleX Challenge Notebook

This notebook contains the code for the KaggleX Challenge, whose goal is to predict the price of used cars from the given features. The dataset comes from the [KaggleX Cohort 4 competition on Kaggle](https://www.kaggle.com/competitions/kagglex-cohort4/data).

In [78]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer

In [101]:
# Load the data (paths are relative to the notebook's working directory)
train = pd.read_csv('../data/train.csv', index_col='id')
test = pd.read_csv('../data/test.csv', index_col='id')
sample_submission = pd.read_csv('../data/sample_submission.csv', index_col='id')

# Check for missing values
train_missing = train.isnull().sum()
test_missing = test.isnull().sum()
print(f'Missing values in train: {train_missing.sum()}')
print(f'Missing values in test: {test_missing.sum()}')

# Check uniques
train_one_unique = train.columns[train.nunique() == 1]
test_one_unique = test.columns[test.nunique() == 1]
print(f'Columns with one unique value in train: {train_one_unique.values}')
print(f'Columns with one unique value in test: {test_one_unique.values}')

# Drop columns with one unique value (clean_title).
# The columns to drop are decided on train only and then applied to both
# frames, so train and test always keep the same feature columns even if a
# column happens to be constant in just one of them. errors='ignore' covers
# columns that exist only in train.
train = train.drop(columns=train_one_unique)
test = test.drop(columns=train_one_unique, errors='ignore')

# Drop engine, colors and model features as too many unique values (not useful for model).
# The list is named once so train and test cannot drift apart.
high_cardinality_cols = ['engine', 'model', 'ext_col', 'int_col']
train = train.drop(columns=high_cardinality_cols)
test = test.drop(columns=high_cardinality_cols)

train.head(3)


Missing values in train: 0
Missing values in test: 0
Columns with one unique value in train: ['clean_title']
Columns with one unique value in test: ['clean_title']


Unnamed: 0_level_0,brand,model_year,milage,fuel_type,transmission,accident,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Ford,2018,74349,Gasoline,10-Speed A/T,None reported,11000
1,BMW,2007,80000,Gasoline,6-Speed M/T,None reported,8250
2,Jaguar,2009,91491,Gasoline,6-Speed A/T,None reported,15000


In [92]:
def print_unseen(test_values, train_values, label):
    """Print each value from test_values that does not occur in train_values.

    Parameters
    ----------
    test_values : iterable
        Distinct values found in the test column, reported in this order.
    train_values : iterable
        Distinct values found in the matching train column.
    label : str
        Prefix used in the printed message (e.g. 'Brand').
    """
    # Build the lookup once instead of recomputing unique() on every iteration.
    # Assumes no missing values (the load cell reports zero for both frames).
    known = set(train_values)
    for value in test_values:
        if value not in known:
            print(f'{label} {value} is not in train')


# Check if test transmissions are in train
transmissions = test['transmission'].unique()
print_unseen(transmissions, train['transmission'].unique(), 'Transmission')

# Check if test fuel types are in train
fuel_types = test['fuel_type'].unique()
print_unseen(fuel_types, train['fuel_type'].unique(), 'Fuel type')

# Check if test brands are in train
brands = test['brand'].unique()
print_unseen(brands, train['brand'].unique(), 'Brand')


Transmission 2 is not in train
Transmission Manual is not in train
Brand Saab is not in train
Brand smart is not in train


In [102]:
# Binarize features with only two unique values (accident)
binary_features = [col for col in train.columns if train[col].nunique() == 2]
for col in binary_features:
    # Codes follow order of first appearance in train: first value -> 0, second -> 1.
    categories = train[col].unique()
    mapper = {categories[0]: 0, categories[1]: 1}

    # map() with a pass-through default behaves like replace(mapper): known
    # values are swapped for their code, anything else is left untouched.
    # Unlike replace(), it does not depend on the silent object -> int
    # downcasting that recent pandas versions deprecate.
    train[col] = train[col].map(lambda value: mapper.get(value, value))
    test[col] = test[col].map(lambda value: mapper.get(value, value))

train.head(3)

Unnamed: 0_level_0,brand,model_year,milage,fuel_type,transmission,accident,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Ford,2018,74349,Gasoline,10-Speed A/T,0,11000
1,BMW,2007,80000,Gasoline,6-Speed M/T,0,8250
2,Jaguar,2009,91491,Gasoline,6-Speed A/T,0,15000


In [108]:
# Summary statistics of the numeric columns.
# The previous cell content (`sns.`) was an unfinished expression that fails
# with a SyntaxError; the saved output of this cell is a describe() table.
train.describe()

Unnamed: 0,model_year,milage,accident,price
count,54273.0,54273.0,54273.0,54273.0
mean,2015.091979,72746.175667,0.264902,39218.44
std,5.588909,50469.490448,0.441285,72826.34
min,1974.0,100.0,0.0,2000.0
25%,2012.0,32268.0,0.0,15500.0
50%,2016.0,66107.0,0.0,28000.0
75%,2019.0,102000.0,1.0,45000.0
max,2024.0,405000.0,1.0,2954083.0


In [106]:
# Vectorize train features
# Split the target off first so the vectorizer is fitted on predictors only;
# leaving `price` in the records would encode the target as a feature column.
train_target = train['price']
feature_records = train.drop(columns='price').to_dict(orient='records')

vectorizer = DictVectorizer(separator='_', sparse=False)
train_features = vectorizer.fit_transform(feature_records)

# Keep the original `id` index so rows stay aligned with train / train_target.
train_encoded = pd.DataFrame(
    train_features,
    columns=vectorizer.feature_names_,
    index=train.index,
)
train_encoded.head(3)

Unnamed: 0,accident,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,brand_Cadillac,...,transmission_Automatic CVT,transmission_CVT Transmission,transmission_CVT-F,transmission_F,transmission_M/T,transmission_SCHEDULED FOR OR IN PRODUCTION,transmission_Transmission Overdrive Switch,transmission_Transmission w/Dual Shift Mode,transmission_Variable,transmission_–
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# 