In [None]:
#%pip install ISLP

In [177]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import sklearn.model_selection as skm
from ISLP import load_data, confusion_table
from ISLP.models import ModelSpec as MS
import pandas as pd


from sklearn.tree import (DecisionTreeClassifier as DTC,
                          DecisionTreeRegressor as DTR,
                          plot_tree,
                          export_text)
from sklearn.metrics import (accuracy_score,
                             log_loss)
from sklearn.ensemble import \
     (RandomForestRegressor as RF,
      GradientBoostingRegressor as GBR)
from ISLP.bart import BART

In [178]:
# Load the raw training data and report its dimensions and column names.
train_data = pd.read_csv('train.csv')
print(train_data.shape)
print(train_data.columns)

# 'Name': count the distinct values, then drop the column. The original
# author judged the number of distinct names too large to be a useful feature.
name_column = train_data['Name']
name_list = name_column.unique().tolist()
print(f"Number of unique names: {len(name_list)}")
train_data = train_data.drop(columns=['Name'])
print(train_data.shape)

# 'Location': list and count the distinct values, then drop the column as well.
location_column = train_data['Location']
location_list = location_column.unique().tolist()
print(location_list)
print(f"Number of unique locations: {len(location_list)}")
train_data = train_data.drop(columns=['Location'])
print(train_data.shape)

# 'New_Price': dropped because, per the original author's note, few rows
# carry a value for it.
train_data = train_data.drop(columns=['New_Price'])
print(train_data.shape)

# Build the working frame D from the remaining columns using ISLP's ModelSpec
# (imported as MS), without an intercept column.
model = MS(train_data.columns, intercept=False)
D = model.fit_transform(train_data)

def replace_data(column, value):
    r"""Return a copy of ``column`` with unusable entries replaced by ``str(value)``.

    An entry is unusable when ``float()`` cannot convert it (for example the
    placeholder ``\N``, an empty string or ``None``) or when it converts to
    NaN. All other entries are kept exactly as they are.

    The input is never modified; callers must use the return value, e.g.
    ``D[col] = replace_data(D[col], fill)``.

    Parameters
    ----------
    column : pandas.Series or other iterable
        Values to clean.
    value : object
        Fill value; it is inserted as ``str(value)`` so the result can be
        converted afterwards with ``pd.to_numeric``.

    Returns
    -------
    pandas.Series or list
        For a Series input: a new object-dtype Series with the same index and
        name. For any other iterable: a new list.
    """
    import math  # local import: only this helper needs it

    fill = str(value)

    def _usable(entry):
        # Only the conversion errors float() can raise are treated as
        # "not numeric"; anything else should surface to the caller.
        try:
            return not math.isnan(float(entry))
        except (TypeError, ValueError):
            return False

    # Iterate over the values themselves so the result does not depend on the
    # index labels being 0..n-1.
    cleaned = [entry if _usable(entry) else fill for entry in column]

    if isinstance(column, pd.Series):
        # Object dtype keeps kept entries and string fills side by side.
        return pd.Series(cleaned, index=column.index, name=column.name, dtype=object)
    return cleaned

def calculate_average(data):
    r"""Return the arithmetic mean of the numeric entries of ``data``.

    Entries that ``float()`` cannot convert (for example the placeholder
    ``\N``, an empty string or ``None``) and entries that convert to NaN are
    skipped, so a single missing value cannot turn the whole average into NaN.

    Parameters
    ----------
    data : iterable
        Values to average, e.g. a pandas Series or a list. Values are read by
        iteration, so the index labels of a Series do not matter.

    Returns
    -------
    float
        Mean of the usable entries.

    Raises
    ------
    ZeroDivisionError
        If ``data`` contains no usable numeric entry.
    """
    import math  # local import: only this helper needs it

    total = 0.0
    count = 0
    for raw in data:
        try:
            number = float(raw)
        except (TypeError, ValueError):
            # Not a number (placeholder text, empty string, None, ...).
            continue
        if math.isnan(number):
            # NaN would poison the running total; treat it as missing.
            continue
        total += number
        count += 1

    if count == 0:
        # Same exception type as the plain division would raise, but with a
        # message that says what went wrong.
        raise ZeroDivisionError("calculate_average: no numeric values to average")
    return total / count
    

(4470, 16)
Index(['ID', 'Name', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type',
       'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Colour',
       'Seats', 'No. of Doors', 'New_Price', 'Price'],
      dtype='object')
Number of unique names: 207
(4470, 15)
['Coimbatore', 'Kochi', 'Hyderabad', 'Kolkata', 'Bangalore', 'Delhi', 'Pune', 'Chennai', 'Mumbai', 'Ahmedabad', 'Jaipur', '\\N']
Number of unique locations: 12
(4470, 14)
(4470, 13)


In [None]:
# Preprocess 'Year'.
# The column average is truncated to an integer and used as the fill value.
avg_year = int(calculate_average(D['Year']))
print("Average year:", avg_year)

# Fill the entries replace_data treats as missing, then convert the column
# to a numeric dtype in the same step.
D['Year'] = pd.to_numeric(replace_data(D['Year'], avg_year))

print(D.shape)

In [None]:
# Preprocess 'Kilometers_Driven'.
# The column average is truncated to an integer and used as the fill value.
average_km = int(calculate_average(D['Kilometers_Driven']))
print("Average Kilometers_Driven:", average_km)

# Fill the entries replace_data treats as missing, then convert to numeric.
D['Kilometers_Driven'] = pd.to_numeric(
    replace_data(D['Kilometers_Driven'], average_km)
)
print(D.shape)

In [None]:
# One-hot encode 'Fuel_Type': one 0/1 indicator column per distinct value.
fuel_types = D['Fuel_Type'].unique().tolist()
print(fuel_types)
for fuel in fuel_types:
    # Vectorized equality test instead of a per-row Python lambda; the
    # boolean result is cast to int64 so the column holds 0/1 as before.
    D[f'Fuel_Type_{fuel}'] = (D['Fuel_Type'] == fuel).astype('int64')
D = D.drop(columns=['Fuel_Type'])  # the indicators replace the original column
print(D.columns)
print(D.shape)

In [None]:
# One-hot encode 'Transmission': one 0/1 indicator column per distinct value.
print(D.shape)
transmission_types = D['Transmission'].unique().tolist()
print(transmission_types)
for transmission in transmission_types:
    # Vectorized equality test instead of a per-row Python lambda; the
    # boolean result is cast to int64 so the column holds 0/1 as before.
    D[f'Transmission_{transmission}'] = (D['Transmission'] == transmission).astype('int64')
D = D.drop(columns=['Transmission'])  # the indicators replace the original column
print(D.columns)
# Spot check of one indicator; this line requires that 'Manual' is one of the
# values found in the column.
print(D['Transmission_Manual'])
print(D.shape)

In [None]:
# One-hot encode 'Owner_Type': one 0/1 indicator column per distinct value.
owner_types = D['Owner_Type'].unique().tolist()
print(owner_types)
for owner in owner_types:
    # Vectorized equality test instead of a per-row Python lambda; the
    # boolean result is cast to int64 so the column holds 0/1 as before.
    D[f'Owner_Type_{owner}'] = (D['Owner_Type'] == owner).astype('int64')
D = D.drop(columns=['Owner_Type'])  # the indicators replace the original column
print(D.columns)
print(D.shape)

In [None]:
# Preprocess 'Mileage'.
print(D['Mileage'])

# Keep only the text before the first space of each entry (the numeric part).
D['Mileage'] = D['Mileage'].str.split(' ').str.get(0)

# The (untruncated) column average is used as the fill value.
mileage_average = calculate_average(D['Mileage'])
print("Average Mileage:", mileage_average)

# Fill the entries replace_data treats as missing, then convert to numeric.
D['Mileage'] = pd.to_numeric(replace_data(D['Mileage'], mileage_average))
print(D.shape)

In [None]:
# Preprocess 'Engine'.
print(D['Engine'])

# Keep only the text before the first space of each entry (the numeric part).
D['Engine'] = D['Engine'].str.split(' ').str.get(0)

# The column average (engine volume) is truncated to an integer and used as
# the fill value.
average_engine_volumne = int(calculate_average(D['Engine']))
print("Average Engine Volume:", average_engine_volumne)

# Fill the entries replace_data treats as missing, then convert to numeric.
D['Engine'] = pd.to_numeric(replace_data(D['Engine'], average_engine_volumne))
print(D.shape)

In [None]:
# Preprocess 'Power'.
print(D['Power'])

# Keep only the text before the first space of each entry (the numeric part).
D['Power'] = D['Power'].str.split(' ').str.get(0)

# The (untruncated) column average is used as the fill value.
average_power = calculate_average(D['Power'])
print("Average Power:", average_power)

# Fill the entries replace_data treats as missing, then convert to numeric.
D['Power'] = pd.to_numeric(replace_data(D['Power'], average_power))
print(D.shape)

In [None]:
# One-hot encode 'Colour': one 0/1 indicator column per distinct value.
color_types = D['Colour'].unique().tolist()
print(color_types)
for color in color_types:
    # Vectorized equality test instead of a per-row Python lambda; the
    # boolean result is cast to int64 so the column holds 0/1 as before.
    D[f'Colour_{color}'] = (D['Colour'] == color).astype('int64')
D = D.drop(columns=['Colour'])  # the indicators replace the original column
print(D.columns)
print(D.shape)

In [None]:
# Preprocess 'Seats'.
print(D['Seats'])

# The column average is truncated to an integer and used as the fill value.
average_seats = int(calculate_average(D['Seats']))
print("Average Seats:", average_seats)

# Fill the entries replace_data treats as missing, then convert to numeric.
D['Seats'] = pd.to_numeric(replace_data(D['Seats'], average_seats))
print(D.shape)


In [None]:
# Preprocess 'No. of Doors'.
# The column average is truncated to an integer and used as the fill value.
average_doors = int(calculate_average(D['No. of Doors']))
print("Average No. of Doors:", average_doors)

# Fill the entries replace_data treats as missing, then convert to numeric.
D['No. of Doors'] = pd.to_numeric(replace_data(D['No. of Doors'], average_doors))

print(D.shape)

In [None]:
# Finally, preprocess the response variable 'Price'.
# NOTE(review): this imputes the response itself with the column average;
# confirm that rows without a price should be kept rather than dropped.
average_price = calculate_average(D['Price'])
print("Average Price:", average_price)

# Fill the entries replace_data treats as missing, then convert to numeric.
D['Price'] = pd.to_numeric(replace_data(D['Price'], average_price))
print(D.shape)

In [None]:
# Persist the processed frame D as CSV, without writing the row index.
D.to_csv(path_or_buf='train_processed.csv', index=False)
