In [70]:
import numpy as np
import pandas as pd

In [71]:
# Load the raw training data and report its size and column names.
# Placeholder string that marks a missing value in train.csv.
MISSING_MARKER = '\\N'

train_data = pd.read_csv('train.csv')
print(train_data.shape)
print(train_data.columns)

# --- Remove features that are not used for prediction ---------------------

# 'Name': count the distinct values to decide whether it is worth encoding.
# The count is taken before the '\N' rows are removed, so the placeholder
# itself may be part of the total.
name_column = train_data['Name']
name_list = name_column.unique().tolist()
print(f"Number of unique names: {len(name_list)}")  # 207 => too many levels, ignore the name feature
train_data = train_data.drop(columns=['Name'])
print(train_data.shape)

# 'Location': 12 distinct values in the recorded run, one of which is the
# '\N' placeholder. The feature is dropped as well.
location_column = train_data['Location']
location_list = location_column.unique().tolist()
print(location_list)
print(f"Number of unique locations: {len(location_list)}")
train_data = train_data.drop(columns=['Location'])
print(train_data.shape)

# 'New_Price': dropped because too few rows carry a value for it.
train_data = train_data.drop(columns=['New_Price'])
print(train_data.shape)

# 'ID': row identifier, not a predictor.
train_data = train_data.drop(columns=['ID'])

# Remove every row that still holds the '\N' placeholder in any remaining
# column. This runs after the column drops above, so placeholders that only
# occurred in the discarded columns do not cost any rows. A single row-wise
# mask replaces re-filtering the frame once per column; .copy() hands the
# following cells an independent frame to add columns to.
has_missing_marker = train_data.isin([MISSING_MARKER]).any(axis=1)
train_data = train_data.loc[~has_missing_marker].copy()

print("traind data shape after removing \\N rows:", train_data.shape)
    

(4470, 16)
Index(['ID', 'Name', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type',
       'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Colour',
       'Seats', 'No. of Doors', 'New_Price', 'Price'],
      dtype='object')
Number of unique names: 207
(4470, 15)
['Coimbatore', 'Kochi', 'Hyderabad', 'Kolkata', 'Bangalore', 'Delhi', 'Pune', 'Chennai', 'Mumbai', 'Ahmedabad', 'Jaipur', '\\N']
Number of unique locations: 12
(4470, 14)
(4470, 13)
traind data shape after removing \N rows: (4419, 12)


In [72]:
# Fixed reference year used to turn the model year into an age. It is a
# constant (not read from the clock) so re-running the notebook reproduces
# the same 'Age' values.
REFERENCE_YEAR = 2025


def one_hot_encode(frame, column):
    """Replace a categorical column with one 0/1 indicator column per level.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing `column`. It is not modified.
    column : str
        Name of the categorical column to expand.

    Returns
    -------
    pandas.DataFrame
        A new frame without `column`, with one int64 column named
        ``f'{column}_{level}'`` appended for every distinct level, in order
        of first appearance. Every level gets its own column (no reference
        level is dropped).
    """
    source = frame[column]
    indicators = {
        f'{column}_{level}': (source == level).astype('int64')
        for level in source.unique().tolist()
    }
    return frame.drop(columns=[column]).assign(**indicators)


def leading_token(values):
    """Return the text before the first space of each string in `values`.

    Used to separate a numeric reading from the text that follows it.
    `values` is a pandas Series of strings; the result is still a Series of
    strings and must be converted to numbers by the caller.
    """
    return values.str.split(' ').str[0]


# Replace 'Year' by the car's age relative to REFERENCE_YEAR. The reference
# is cast to float so 'Age' keeps the float64 dtype it had when it was
# computed from an np.ones vector.
train_data['Year'] = pd.to_numeric(train_data['Year'])
train_data['Age'] = float(REFERENCE_YEAR) - train_data['Year']
train_data = train_data.drop(columns=['Year'])

# Kilometers_Driven only needs a numeric dtype.
train_data['Kilometers_Driven'] = pd.to_numeric(train_data['Kilometers_Driven'])
print(train_data.shape)

# Expand the categorical features into indicator columns, one feature at a
# time so the shape can be checked after each step.
for categorical in ('Fuel_Type', 'Transmission', 'Owner_Type'):
    train_data = one_hot_encode(train_data, categorical)
    print(train_data.shape)

# Mileage and Engine: keep the leading numeric token and convert it.
for measured in ('Mileage', 'Engine'):
    train_data[measured] = pd.to_numeric(leading_token(train_data[measured]))
    print(train_data.shape)

# Power is handled like Mileage/Engine, but some rows carry the literal text
# 'null' instead of a number; those rows are removed before the conversion
# (83 rows in the recorded run). The explicit copy gives an independent frame
# for the assignment that follows.
train_data['Power'] = leading_token(train_data['Power'])
train_data = train_data[train_data['Power'] != 'null'].copy()
train_data['Power'] = pd.to_numeric(train_data['Power'])
print(train_data.shape)

# Colour is encoded after the Power filter, so only colours that are still
# present in the remaining rows get an indicator column.
train_data = one_hot_encode(train_data, 'Colour')
print(train_data.shape)

# Seats, No. of Doors and the response variable Price only need a numeric dtype.
for numeric in ('Seats', 'No. of Doors', 'Price'):
    train_data[numeric] = pd.to_numeric(train_data[numeric])
    print(train_data.shape)
print(train_data.columns)

(4419, 12)
(4419, 15)
(4419, 16)
(4419, 19)
(4419, 19)
(4419, 19)
(4336, 19)
(4336, 21)
(4336, 21)
(4336, 21)
(4336, 21)
Index(['Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats',
       'No. of Doors', 'Price', 'Age', 'Fuel_Type_Diesel', 'Fuel_Type_Petrol',
       'Fuel_Type_CNG', 'Fuel_Type_LPG', 'Transmission_Manual',
       'Transmission_Automatic', 'Owner_Type_First', 'Owner_Type_Second',
       'Owner_Type_Third', 'Owner_Type_Fourth & Above', 'Colour_Others',
       'Colour_Black/Silver', 'Colour_White'],
      dtype='object')


In [73]:
# Persist the processed training table as CSV; the row index is not written.
processed_csv_path = 'train_processed.csv'
train_data.to_csv(processed_csv_path, index=False)