In [None]:
#%pip install ISLP

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import sklearn.model_selection as skm
from ISLP import load_data, confusion_table
from ISLP.models import ModelSpec as MS
import pandas as pd


from sklearn.tree import (DecisionTreeClassifier as DTC,
                          DecisionTreeRegressor as DTR,
                          plot_tree,
                          export_text)
from sklearn.metrics import (accuracy_score,
                             log_loss)
from sklearn.ensemble import \
     (RandomForestRegressor as RF,
      GradientBoostingRegressor as GBR)
from ISLP.bart import BART



In [2]:
# load the training data test.csv
test_data = pd.read_csv('test.csv')
test_data.head()
print(test_data.shape)
print(test_data.columns)
# Remove some features that are supposed to be not useful for prediction

# Access the 'Name' column data
name_column = test_data['Name']

# check how many different names are there in the name column
name_list = name_column.unique().tolist()
print(f"Number of unique names: {len(name_list)}") #207 => too much, ignore name feature

# drop the Name feature
test_data = test_data.drop(columns=['Name'])
print(test_data.shape)

# check how many different locations are there in the Location column
location_column = test_data['Location']
location_list = location_column.unique().tolist()
print(location_list)
print(f"Number of unique locations: {len(location_list)}") 
test_data = test_data.drop(columns=['Location'])# =12, also drop location feature
print(test_data.shape)

# Drop the feature "New-Price" because there is not many data
test_data = test_data.drop(columns=['New_Price'])
print(test_data.shape)



(1491, 15)
Index(['ID', 'Name', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type',
       'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Colour',
       'Seats', 'No. of Doors', 'New_Price'],
      dtype='object')
Number of unique names: 161
(1491, 14)
['Pune', 'Chennai', '\\N', 'Mumbai', 'Coimbatore', 'Delhi', 'Bangalore', 'Kolkata', 'Jaipur', 'Ahmedabad', 'Hyderabad', 'Kochi']
Number of unique locations: 12
(1491, 13)
(1491, 12)


In [3]:
# clean the data
# loop for all columns to remove rows with NaN values
model = MS(test_data.columns, intercept=False)
D = model.fit_transform(test_data)
D_raw = D.copy()
print(D.shape)
nan_index = []
for col in D.columns:
    for i in range(D.shape[0]):
        if D[col][i] == '\\N':
            nan_index.append(i)
nan_index = list(set(nan_index)) # remove duplicates
print(len(nan_index))
# remove the rows that have NaN in any column
for idx in nan_index:
    D = D.drop(index=idx)
print(D.shape)

(1491, 12)
21
(1470, 12)


In [4]:
# Preprocess the year data
D['Year'] = pd.to_numeric(D['Year'])# covert to numeric
# normalize the year data to 0.5 to 1.5
min_year = D['Year'].min()*np.ones(D.shape[0])
max_year = D['Year'].max()*np.ones(D.shape[0])

print(min_year[0], max_year[0])

#normalize to 0.5 to 1.5
D['Year'] = 0.5*np.ones(D.shape[0]) + (D['Year'] - min_year) / (max_year - min_year)
print(D['Year'].min(), D['Year'].max())

print(D['Year'])
print(D.shape)

1998.0 2019.0
0.5 1.5
0       1.166667
1       0.976190
3       1.404762
4       1.166667
5       1.261905
          ...   
1486    1.119048
1487    1.119048
1488    1.261905
1489    1.119048
1490    1.404762
Name: Year, Length: 1470, dtype: float64
(1470, 12)


In [5]:
# Preprocess the Kilometers_Driven data
D['Kilometers_Driven'] = pd.to_numeric(D['Kilometers_Driven'])# covert to numeric
print(D['Kilometers_Driven'].min(), D['Kilometers_Driven'].max())
# normalize the Kilometers_Driven data to 0.5 to 1.5
min_km = D['Kilometers_Driven'].min()*np.ones(D.shape[0])
max_km = D['Kilometers_Driven'].max()*np.ones(D.shape[0])

print(min_km[0], max_km[0])

#normalize to 0.5 to 1.5
D['Kilometers_Driven'] = 0.5*np.ones(D.shape[0]) + (D['Kilometers_Driven'] - min_km) / (max_km - min_km)
print(D['Kilometers_Driven'])
# print(D_raw['Kilometers_Driven'])

1000 720000
1000.0 720000.0
0       0.586787
1       0.622392
3       0.543115
4       0.606096
5       0.623143
          ...   
1486    0.609875
1487    0.557719
1488    0.550904
1489    0.600139
1490    0.518940
Name: Kilometers_Driven, Length: 1470, dtype: float64


In [6]:
# Fuel type preprocessing
fuel_types = D['Fuel_Type'].unique().tolist() # check how many different fuel types
print(fuel_types)
for fuel in fuel_types: # encode each fuel type into a separate binary feature
    D[f'Fuel_Type_{fuel}'] = D['Fuel_Type'].apply(lambda x: 1 if x == fuel else 0)
D = D.drop(columns=['Fuel_Type']) # drop the original Fuel_Type column
print(D.columns) # print to check
# print(D['Fuel_Type_Diesel']) # print to check

['Diesel', 'Petrol', 'CNG', 'LPG']
Index(['ID', 'Year', 'Kilometers_Driven', 'Transmission', 'Owner_Type',
       'Mileage', 'Engine', 'Power', 'Colour', 'Seats', 'No. of Doors',
       'Fuel_Type_Diesel', 'Fuel_Type_Petrol', 'Fuel_Type_CNG',
       'Fuel_Type_LPG'],
      dtype='object')


In [7]:
# Process the 'Transmission' column
print(D.shape)
transmission_types = D['Transmission'].unique().tolist() # check how many different transmission types
print(transmission_types)
for transmission in transmission_types: # encode each transmission type into a separate binary feature
    D[f'Transmission_{transmission}'] = D['Transmission'].apply(lambda x: 1 if x == transmission else 0)
D = D.drop(columns=['Transmission']) # drop the original Transmission column
print(D.columns) # print to check
print(D['Transmission_Manual']) # print to check
print(D.shape)

(1470, 15)
['Manual', 'Automatic']
Index(['ID', 'Year', 'Kilometers_Driven', 'Owner_Type', 'Mileage', 'Engine',
       'Power', 'Colour', 'Seats', 'No. of Doors', 'Fuel_Type_Diesel',
       'Fuel_Type_Petrol', 'Fuel_Type_CNG', 'Fuel_Type_LPG',
       'Transmission_Manual', 'Transmission_Automatic'],
      dtype='object')
0       1
1       0
3       0
4       1
5       0
       ..
1486    1
1487    0
1488    0
1489    1
1490    1
Name: Transmission_Manual, Length: 1470, dtype: int64
(1470, 16)


In [8]:
#Process the 'Owner_Type' column
owner_types = D['Owner_Type'].unique().tolist() # check how many different owner types
print(owner_types)
for owner in owner_types: # encode each owner type into a separate binary feature
    D[f'Owner_Type_{owner}'] = D['Owner_Type'].apply(lambda x: 1 if x == owner else 0)
D = D.drop(columns=['Owner_Type']) # drop the original Owner_Type column
# print(D.columns) # print to check
# print(D['Owner_Type_Second']) # print to check


['First', 'Second', 'Third', 'Fourth & Above']


In [9]:
print(D['Owner_Type_Second']) # print to check
print(D.shape)

0       0
1       1
3       0
4       0
5       0
       ..
1486    0
1487    0
1488    1
1489    0
1490    0
Name: Owner_Type_Second, Length: 1470, dtype: int64
(1470, 19)


In [10]:
#Preprocess Mileage
print(D['Mileage'])
D['Mileage'] = D['Mileage'].str.split(' ').str[0] # extract the numeric part
D['Mileage'] = pd.to_numeric(D['Mileage'])# covert to numeric
print(D['Mileage'])
# normalize the Mileage data to 0.5 to 1.5
min_mileage = D['Mileage'].min()*np.ones(D.shape[0])
max_mileage = D['Mileage'].max()*np.ones(D.shape[0])
print(min_mileage[0], max_mileage[0])
#normalize to 0.5 to 1.5
D['Mileage'] = 0.5*np.ones(D.shape[0]) + (D['Mileage'] - min_mileage) / (max_mileage - min_mileage)
print(D['Mileage'])

0        17.8 kmpl
1       16.07 kmpl
3       14.84 kmpl
4        17.0 kmpl
5       12.55 kmpl
           ...    
1486    13.93 kmpl
1487    18.33 kmpl
1488    16.55 kmpl
1489    12.05 kmpl
1490     18.6 kmpl
Name: Mileage, Length: 1470, dtype: object
0       17.80
1       16.07
3       14.84
4       17.00
5       12.55
        ...  
1486    13.93
1487    18.33
1488    16.55
1489    12.05
1490    18.60
Name: Mileage, Length: 1470, dtype: float64
0.0 33.54
0       1.030710
1       0.979129
3       0.942457
4       1.006857
5       0.874180
          ...   
1486    0.915325
1487    1.046512
1488    0.993441
1489    0.859273
1490    1.054562
Name: Mileage, Length: 1470, dtype: float64


In [11]:
#Process Engine
print(D['Engine'])
D['Engine'] = D['Engine'].str.split(' ').str[0] # extract the numeric part
D['Engine'] = pd.to_numeric(D['Engine'])# covert to numeric
print(D['Engine'])
# normalize the Engine data to 0.5 to 1.5
min_engine = D['Engine'].min()*np.ones(D.shape[0])
max_engine = D['Engine'].max()*np.ones(D.shape[0])
print(min_engine[0], max_engine[0])
#normalize to 0.5 to 1.5
D['Engine'] = 0.5*np.ones(D.shape[0]) + (D['Engine'] - min_engine) / (max_engine - min_engine)
print(D['Engine'])

0       1399 CC
1       1995 CC
3       1598 CC
4       1497 CC
5       2982 CC
         ...   
1486    2179 CC
1487    1968 CC
1488    1968 CC
1489    2179 CC
1490    1197 CC
Name: Engine, Length: 1470, dtype: object
0       1399
1       1995
3       1598
4       1497
5       2982
        ... 
1486    2179
1487    1968
1488    1968
1489    2179
1490    1197
Name: Engine, Length: 1470, dtype: int64
624.0 5461.0
0       0.660223
1       0.783440
3       0.701364
4       0.680484
5       0.987492
          ...   
1486    0.821480
1487    0.777858
1488    0.777858
1489    0.821480
1490    0.618462
Name: Engine, Length: 1470, dtype: float64


In [12]:
# Process Power
print(D['Power'])
D['Power'] = D['Power'].str.split(' ').str[0] # extract the numeric part
# there are null values in Power column, we need to handle them
# Check how many null values are there
null_power_idx = []
for i in range(len(D['Power'])):
    try:
        power_value = float(D['Power'].iloc[i])
    except:
        null_power_idx.append(i)
print(null_power_idx)
print(f"Number of null Power values: {len(null_power_idx)}")

# D = D[D['Power'] != 'null'] # remove rows with null Power values
# D['Power'] = pd.to_numeric(D['Power'])# covert to numeric
# print(D['Power'])
# # normalize the Power data to 0.5 to 1.5
# min_power = D['Power'].min()*np.ones(D.shape[0])
# max_power = D['Power'].max()*np.ones(D.shape[0])
# print(min_power[0], max_power[0])
# #normalize to 0.5 to 1.5
# D['Power'] = 0.5*np.ones(D.shape[0]) + (D['Power'] - min_power) / (max_power - min_power)
# print(D['Power'])

0           67 bhp
1          181 bhp
3       103.52 bhp
4          118 bhp
5        168.5 bhp
           ...    
1486    138.03 bhp
1487     167.7 bhp
1488    147.51 bhp
1489       120 bhp
1490     81.83 bhp
Name: Power, Length: 1470, dtype: object
[45, 131, 222, 279, 303, 422, 462, 482, 539, 554, 561, 747, 755, 826, 834, 852, 895, 1183, 1251, 1401]
Number of null Power values: 20


In [13]:
# remove rows with null Power values
D = D.drop(index=null_power_idx)
print(D.shape)

(1450, 19)


In [14]:
# Process Colour column
color_types = D['Colour'].unique().tolist() # check how many different color types
print(color_types)
for color in color_types: # encode each color type into a separate binary feature
    D[f'Colour_{color}'] = D['Colour'].apply(lambda x: 1 if x == color else 0)
D = D.drop(columns=['Colour']) # drop the original Colour column
print(D.columns) # print to check
print(D.shape)

['Black/Silver', 'Others', 'White']
Index(['ID', 'Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power',
       'Seats', 'No. of Doors', 'Fuel_Type_Diesel', 'Fuel_Type_Petrol',
       'Fuel_Type_CNG', 'Fuel_Type_LPG', 'Transmission_Manual',
       'Transmission_Automatic', 'Owner_Type_First', 'Owner_Type_Second',
       'Owner_Type_Third', 'Owner_Type_Fourth & Above', 'Colour_Black/Silver',
       'Colour_Others', 'Colour_White'],
      dtype='object')
(1450, 21)


In [15]:
# Process the Seats column
print(D['Seats'])
D['Seats'] = pd.to_numeric(D['Seats'])# covert to numeric
print(D['Seats'])
# normalize the Seats data to 0.5 to 1.5
min_seats = D['Seats'].min()*np.ones(D.shape[0])
max_seats = D['Seats'].max()*np.ones(D.shape[0])
print(min_seats[0], max_seats[0])
#normalize to 0.5 to 1.5
D['Seats'] = 0.5*np.ones(D.shape[0]) + (D['Seats'] - min_seats) / (max_seats - min_seats)
print(D.shape)
print(D['Seats'])

0       5
1       4
3       5
4       5
5       7
       ..
1486    7
1487    5
1488    5
1489    8
1490    5
Name: Seats, Length: 1450, dtype: object
0       5
1       4
3       5
4       5
5       7
       ..
1486    7
1487    5
1488    5
1489    8
1490    5
Name: Seats, Length: 1450, dtype: int64
2.0 10.0
(1450, 21)
0       0.875
1       0.750
3       0.875
4       0.875
5       1.125
        ...  
1486    1.125
1487    0.875
1488    0.875
1489    1.250
1490    0.875
Name: Seats, Length: 1450, dtype: float64


In [16]:
# Process No. of Doors column
print(D['No. of Doors'])
D['No. of Doors'] = pd.to_numeric(D['No. of Doors'])# covert to numeric
print(D['No. of Doors'])
# normalize the No. of Door data to 0.5 to 1.5
min_doors = D['No. of Doors'].min()*np.ones(D.shape[0])
max_doors = D['No. of Doors'].max()*np.ones(D.shape[0])
print(min_doors[0], max_doors[0])
#normalize to 0.5 to 1.5
D['No. of Doors'] = 0.5*np.ones(D.shape[0]) + (D['No. of Doors'] - min_doors) / (max_doors - min_doors)

print(D.shape)

0       4
1       4
3       4
4       4
5       5
       ..
1486    5
1487    4
1488    4
1489    5
1490    4
Name: No. of Doors, Length: 1450, dtype: int64
0       4
1       4
3       4
4       4
5       5
       ..
1486    5
1487    4
1488    4
1489    5
1490    4
Name: No. of Doors, Length: 1450, dtype: int64
2.0 5.0
(1450, 21)


In [18]:
# we save processed D to a csv file
D.to_csv('test_processed.csv', index=False)
