In [75]:
#%pip install ISLP

In [86]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import sklearn.model_selection as skm
from ISLP import load_data, confusion_table
from ISLP.models import ModelSpec as MS
import pandas as pd


from sklearn.tree import (DecisionTreeClassifier as DTC,
                          DecisionTreeRegressor as DTR,
                          plot_tree,
                          export_text)
from sklearn.metrics import (accuracy_score,
                             log_loss)
from sklearn.ensemble import \
     (RandomForestRegressor as RF,
      GradientBoostingRegressor as GBR)
from ISLP.bart import BART

In [87]:
# load the training data test.csv
test_data = pd.read_csv('test.csv')
test_data.head()
print(test_data.shape)
print(test_data.columns)
# Remove some features that are supposed to be not useful for prediction

# Access the 'Name' column data
name_column = test_data['Name']

# check how many different names are there in the name column
name_list = name_column.unique().tolist()
print(f"Number of unique names: {len(name_list)}") #207 => too much, ignore name feature

# drop the Name feature
test_data = test_data.drop(columns=['Name'])
print(test_data.shape)

# check how many different locations are there in the Location column
location_column = test_data['Location']
location_list = location_column.unique().tolist()
print(location_list)
print(f"Number of unique locations: {len(location_list)}") 
test_data = test_data.drop(columns=['Location'])# =12, also drop location feature
print(test_data.shape)

# Drop the feature "New-Price" because there is not many data
test_data = test_data.drop(columns=['New_Price'])
print(test_data.shape) #1491, 12 -> cannot drop more

model = MS(test_data.columns, intercept=False)
D = model.fit_transform(test_data)

(1491, 15)
Index(['ID', 'Name', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type',
       'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Colour',
       'Seats', 'No. of Doors', 'New_Price'],
      dtype='object')
Number of unique names: 161
(1491, 14)
['Pune', 'Chennai', '\\N', 'Mumbai', 'Coimbatore', 'Delhi', 'Bangalore', 'Kolkata', 'Jaipur', 'Ahmedabad', 'Hyderabad', 'Kochi']
Number of unique locations: 12
(1491, 13)
(1491, 12)


In [88]:
# Preprocess the year data
year_sum = 0
count = 0
for year in D['Year']:
    try:
        year = int(year)
        year_sum += year
        count += 1
    except:
        continue

# calculate the average year
avg_year = year_sum // count
print("Average year:", avg_year)

# replace the missing year with the average year
for year in D['Year']:
    try:
        year = int(year)
    except:
        # assign the average year
        year = avg_year
        # set to D
        D.loc[D['Year'] == year, 'Year'] = str(year)

# convert D['Year'] to int
D['Year'] = pd.to_numeric(D['Year'])
#normalize to 0.5 to 1.5
min_year = D['Year'].min()*np.ones(D.shape[0])
max_year = D['Year'].max()*np.ones(D.shape[0])
D['Year'] = 0.5*np.ones(D.shape[0]) + (D['Year'] - min_year) / (max_year - min_year)
print(D['Year'].min(), D['Year'].max())

#print(D['Year'])
print(D.shape)

Average year: 2013
0.5 1.5
(1491, 12)


In [89]:
# Preprocess the Kilometers_Driven data
# first calculate the average km
km_sum = 0
km_count = 0

# calculate the average year
for km in D['Kilometers_Driven']:
    try:
        km = int(km)
        km_sum += km
        km_count += 1
    except:
        continue
average_km = km_sum // km_count
print("Average Kilometers_Driven:", average_km)

# replace the missing km with the average km
for km in D['Kilometers_Driven']:
    try:
        km = int(km)
    except:
        # set to D
        D.loc[D['Kilometers_Driven'] == km, 'Kilometers_Driven'] = str(average_km)

D['Kilometers_Driven'] = pd.to_numeric(D['Kilometers_Driven'])# covert to numeric


# normalize the Kilometers_Driven data to 0.5 to 1.5
min_km = D['Kilometers_Driven'].min()*np.ones(D.shape[0])
max_km = D['Kilometers_Driven'].max()*np.ones(D.shape[0])

print(min_km[0], max_km[0])

#normalize to 0.5 to 1.5
D['Kilometers_Driven'] = 0.5*np.ones(D.shape[0]) + (D['Kilometers_Driven'] - min_km) / (max_km - min_km)
print(D['Kilometers_Driven'])



Average Kilometers_Driven: 57289
1000.0 720000.0
0       0.586787
1       0.622392
2       0.578288
3       0.543115
4       0.606096
          ...   
1486    0.609875
1487    0.557719
1488    0.550904
1489    0.600139
1490    0.518940
Name: Kilometers_Driven, Length: 1491, dtype: float64


In [90]:
# Fuel type preprocessing
fuel_types = D['Fuel_Type'].unique().tolist() # check how many different fuel types
print(fuel_types)
for fuel in fuel_types: # encode each fuel type into a separate binary feature
    D[f'Fuel_Type_{fuel}'] = D['Fuel_Type'].apply(lambda x: 1 if x == fuel else 0)
D = D.drop(columns=['Fuel_Type']) # drop the original Fuel_Type column
print(D.columns) # print to check
print(D.shape)

['Diesel', 'Petrol', 'CNG', 'LPG']
Index(['ID', 'Year', 'Kilometers_Driven', 'Transmission', 'Owner_Type',
       'Mileage', 'Engine', 'Power', 'Colour', 'Seats', 'No. of Doors',
       'Fuel_Type_Diesel', 'Fuel_Type_Petrol', 'Fuel_Type_CNG',
       'Fuel_Type_LPG'],
      dtype='object')
(1491, 15)


In [91]:
# Process the 'Transmission' column
print(D.shape)
transmission_types = D['Transmission'].unique().tolist() # check how many different transmission types
print(transmission_types)
for transmission in transmission_types: # encode each transmission type into a separate binary feature
    D[f'Transmission_{transmission}'] = D['Transmission'].apply(lambda x: 1 if x == transmission else 0)
D = D.drop(columns=['Transmission']) # drop the original Transmission column
print(D.columns) # print to check
print(D['Transmission_Manual']) # print to check
print(D.shape)

(1491, 15)
['Manual', 'Automatic', '\\N']
Index(['ID', 'Year', 'Kilometers_Driven', 'Owner_Type', 'Mileage', 'Engine',
       'Power', 'Colour', 'Seats', 'No. of Doors', 'Fuel_Type_Diesel',
       'Fuel_Type_Petrol', 'Fuel_Type_CNG', 'Fuel_Type_LPG',
       'Transmission_Manual', 'Transmission_Automatic', 'Transmission_\N'],
      dtype='object')
0       1
1       0
2       0
3       0
4       1
       ..
1486    1
1487    0
1488    0
1489    1
1490    1
Name: Transmission_Manual, Length: 1491, dtype: int64
(1491, 17)


In [92]:
#Process the 'Owner_Type' column
owner_types = D['Owner_Type'].unique().tolist() # check how many different owner types
print(owner_types)
for owner in owner_types: # encode each owner type into a separate binary feature
    D[f'Owner_Type_{owner}'] = D['Owner_Type'].apply(lambda x: 1 if x == owner else 0)
D = D.drop(columns=['Owner_Type']) # drop the original Owner_Type column
# print(D.columns) # print to check
# print(D['Owner_Type_Second']) # print to check


['First', 'Second', 'Third', '\\N', 'Fourth & Above']


In [93]:
#Preprocess Mileage
print(D['Mileage'])
D['Mileage'] = D['Mileage'].str.split(' ').str[0] # extract the numeric part

# compute the average mileage
mileage_sum = 0
mileage_count = 0
for mileage in D['Mileage']:
    try:
        mileage = float(mileage)
        mileage_sum += mileage
        mileage_count += 1
    except:
        continue

average_mileage = mileage_sum / mileage_count
print("Average Mileage:", average_mileage)

# replace the missing mileage with the average mileage
for mileage in D['Mileage']:
    try:
        mileage = float(mileage)
    except:
        # set to D
        D.loc[D['Mileage'] == mileage, 'Mileage'] = str(average_mileage)

D['Mileage'] = pd.to_numeric(D['Mileage'])# covert to numeric
print(D['Mileage'])
# normalize the Mileage data to 0.5 to 1.5
min_mileage = D['Mileage'].min()*np.ones(D.shape[0])
max_mileage = D['Mileage'].max()*np.ones(D.shape[0])
print(min_mileage[0], max_mileage[0])
#normalize to 0.5 to 1.5
D['Mileage'] = 0.5*np.ones(D.shape[0]) + (D['Mileage'] - min_mileage) / (max_mileage - min_mileage)
print(D['Mileage'])


0        17.8 kmpl
1       16.07 kmpl
2        12.4 kmpl
3       14.84 kmpl
4        17.0 kmpl
           ...    
1486    13.93 kmpl
1487    18.33 kmpl
1488    16.55 kmpl
1489    12.05 kmpl
1490     18.6 kmpl
Name: Mileage, Length: 1491, dtype: object
Average Mileage: 18.104346076458743
0       17.80
1       16.07
2       12.40
3       14.84
4       17.00
        ...  
1486    13.93
1487    18.33
1488    16.55
1489    12.05
1490    18.60
Name: Mileage, Length: 1491, dtype: float64
0.0 33.54
0       1.030710
1       0.979129
2       0.869708
3       0.942457
4       1.006857
          ...   
1486    0.915325
1487    1.046512
1488    0.993441
1489    0.859273
1490    1.054562
Name: Mileage, Length: 1491, dtype: float64


In [94]:
#Process Engine
print(D['Engine'])
D['Engine'] = D['Engine'].str.split(' ').str[0] # extract the numeric part

sum_engine = 0
count_engine = 0
for engine in D['Engine']:
    try:
        engine = int(engine)
        sum_engine += engine
        count_engine += 1
    except:
        continue
average_engine = sum_engine // count_engine
print("Average Engine:", average_engine)
# replace the missing engine with the average engine
for engine in D['Engine']:
    try:
        engine = int(engine)
    except:
        # set to D
        D.loc[D['Engine'] == engine, 'Engine'] = str(average_engine)

D['Engine'] = pd.to_numeric(D['Engine'])# covert to numeric
print(D['Engine'])
# normalize the Engine data to 0.5 to 1.5
min_engine = D['Engine'].min()*np.ones(D.shape[0])
max_engine = D['Engine'].max()*np.ones(D.shape[0])
print(min_engine[0], max_engine[0])
#normalize to 0.5 to 1.5
D['Engine'] = 0.5*np.ones(D.shape[0]) + (D['Engine'] - min_engine) / (max_engine - min_engine)
print(D['Engine'])

0       1399 CC
1       1995 CC
2       2698 CC
3       1598 CC
4       1497 CC
         ...   
1486    2179 CC
1487    1968 CC
1488    1968 CC
1489    2179 CC
1490    1197 CC
Name: Engine, Length: 1491, dtype: object
Average Engine: 1635
0       1399
1       1995
2       2698
3       1598
4       1497
        ... 
1486    2179
1487    1968
1488    1968
1489    2179
1490    1197
Name: Engine, Length: 1491, dtype: int64
624.0 5461.0
0       0.660223
1       0.783440
2       0.928778
3       0.701364
4       0.680484
          ...   
1486    0.821480
1487    0.777858
1488    0.777858
1489    0.821480
1490    0.618462
Name: Engine, Length: 1491, dtype: float64


In [98]:
# Process Power
D['Power'] = D['Power'].str.split(' ').str[0] # extract the numeric part
# calculate the average power
power_sum = 0
power_count = 0
for power in D['Power']:
    try:
        power = float(power)
        power_sum += power
        power_count += 1
    except:
        continue
average_power = power_sum / power_count
print("Average Power:", average_power)
# replace the missing power with the average power
for power in D['Power']:
    try:
        power = float(power)
    except:
        # set to D
        D.loc[D['Power'] == power, 'Power'] = str(average_power)

D['Power'] = pd.to_numeric(D['Power'])# covert to numeric
print(D['Power'])
# normalize the Power data to 0.5 to 1.5
min_power = D['Power'].min()*np.ones(D.shape[0])
max_power = D['Power'].max()*np.ones(D.shape[0])
print(min_power[0], max_power[0])
#normalize to 0.5 to 1.5
D['Power'] = 0.5*np.ones(D.shape[0]) + (D['Power'] - min_power) / (max_power - min_power)
print(D.shape)


AttributeError: Can only use .str accessor with string values!

In [None]:
# Process Colour column
color_types = D['Colour'].unique().tolist() # check how many different color types
print(color_types)
for color in color_types: # encode each color type into a separate binary feature
    D[f'Colour_{color}'] = D['Colour'].apply(lambda x: 1 if x == color else 0)
D = D.drop(columns=['Colour']) # drop the original Colour column
print(D.columns) # print to check
print(D.shape)

In [None]:
# Process the Seats column
# calculate the average seats
seats_sum = 0
seats_count = 0
for seats in D['Seats']:
    try:
        seats = int(seats)
        seats_sum += seats
        seats_count += 1
    except:
        continue

average_seats = seats_sum / seats_count
print("Average Seats:", average_seats)
# replace the missing seats with the average seats
for seats in D['Seats']:
    try:
        seats = int(seats)
    except:
        # set to D
        D.loc[D['Seats'] == seats, 'Seats'] = str(average_seats)


D['Seats'] = pd.to_numeric(D['Seats'])# covert to numeric
print(D['Seats'])
# normalize the Seats data to 0.5 to 1.5
min_seats = D['Seats'].min()*np.ones(D.shape[0])
max_seats = D['Seats'].max()*np.ones(D.shape[0])
print(min_seats[0], max_seats[0])
#normalize to 0.5 to 1.5
D['Seats'] = 0.5*np.ones(D.shape[0]) + (D['Seats'] - min_seats) / (max_seats - min_seats)
print(D.shape)
print(D['Seats'])

In [None]:
# Process No. of Doors column
print(D['No. of Doors'])
D['No. of Doors'] = pd.to_numeric(D['No. of Doors'])# covert to numeric
print(D['No. of Doors'])
# normalize the No. of Door data to 0.5 to 1.5
min_doors = D['No. of Doors'].min()*np.ones(D.shape[0])
max_doors = D['No. of Doors'].max()*np.ones(D.shape[0])
print(min_doors[0], max_doors[0])
#normalize to 0.5 to 1.5
D['No. of Doors'] = 0.5*np.ones(D.shape[0]) + (D['No. of Doors'] - min_doors) / (max_doors - min_doors)

print(D.shape)

In [None]:
# we save processed D to a csv file
D.to_csv('test_processed.csv', index=False)
