In [None]:
import numpy as np
import pandas as pd

In [51]:
# load the training data test.csv
test_data = pd.read_csv('test.csv')
test_data.head()
print(test_data.shape)

# Remove some features that are supposed to be not useful for prediction
# drop the Name feature
test_data = test_data.drop(columns=['Name']) # Drop Name feature
test_data = test_data.drop(columns=['Location'])# Drop Location feature
test_data = test_data.drop(columns=['New_Price']) # Drop New_Price feature
print(test_data.shape) #1491, 12 -> cannot drop more

def replace_data(column, value):
    data = column
    for i in range(len(column)):
        try:
            data_value = float(column[i])
        except:
            data[i] = str(value)
            continue
        if data[i] == '\\N' or data[i] == '':
            data[i] = str(value)
            continue
    return data

def calculate_average (data):
    sum = 0
    count = 0
    for i in range(len(data)):
        if data[i] != '\\N' and data[i] != '':
            try:
                sum += float(data[i])
                count += 1
            except:
                continue
    average = sum / count
    return average

# preprocess the data
# Use year to calculate the age of the car
test_data['Year'] = pd.to_numeric(test_data['Year'])
current_year = 2025*np.ones(test_data.shape[0]) # current year is 2025
test_data['Age'] = current_year - test_data['Year'] # add Age feature instead of Year
test_data = test_data.drop(columns=['Year']) # drop year column

print(test_data.shape)

# Preprocess the Kilometers_Driven data
average_km = int(calculate_average(test_data['Kilometers_Driven']))
print("Average Kilometers_Driven:", average_km)

# replace the missing km with the average km
test_data['Kilometers_Driven'] = replace_data(test_data['Kilometers_Driven'], average_km)
test_data['Kilometers_Driven'] = pd.to_numeric(test_data['Kilometers_Driven'])# covert to numeric
print(test_data.shape)

# Fuel type preprocessing
fuel_types = test_data['Fuel_Type'].unique().tolist() # check how many different fuel types
print(fuel_types)
for fuel in fuel_types: # encode each fuel type into a separate binary feature
    test_data[f'Fuel_Type_{fuel}'] = test_data['Fuel_Type'].apply(lambda x: 1 if x == fuel else 0)
test_data = test_data.drop(columns=['Fuel_Type']) # drop the original Fuel_Type column
print(test_data.shape)

# Process the 'Transmission' column
transmission_types = test_data['Transmission'].unique().tolist() # check how many different transmission types
# remove the \\N type
if '\\N' in transmission_types:
    transmission_types.remove('\\N')

# Assign a random type between automatic and manual if the type is \N
for i in range(len(test_data['Transmission'])):
    if test_data['Transmission'].iloc[i] == '\\N':
        test_data['Transmission'].iloc[i] = np.random.choice(transmission_types)

# encode each transmission type into a separate binary feature
for transmission in transmission_types:
    test_data[f'Transmission_{transmission}'] = test_data['Transmission'].apply(lambda x: 1 if x == transmission else 0)

test_data = test_data.drop(columns=['Transmission']) # drop the original Transmission column
print(test_data.shape)

#---------------Process the 'Owner_Type' column---------------------------------------------
#------------------------------------------------------------------------------------------
owner_types = test_data['Owner_Type'].unique().tolist() # check how many different owner types

# remove the \\N type
if '\\N' in owner_types:
    owner_types.remove('\\N')

# Assign a random type between automatic and manual if the type is \N
for i in range(len(test_data['Owner_Type'])):
    if test_data['Owner_Type'].iloc[i] == '\\N':
        test_data['Owner_Type'].iloc[i] = np.random.choice(owner_types)

for owner in owner_types: # encode each owner type into a separate binary feature
    test_data[f'Owner_Type_{owner}'] = test_data['Owner_Type'].apply(lambda x: 1 if x == owner else 0)
test_data = test_data.drop(columns=['Owner_Type']) # drop the original Owner_Type column
print(test_data.shape)

#Preprocess Mileage
test_data['Mileage'] = test_data['Mileage'].str.split(' ').str[0] # extract the numeric part
mileage_average = calculate_average(test_data['Mileage'])
print("Average Mileage:", mileage_average)
# replace the missing mileage with the average mileage
test_data['Mileage'] = replace_data(test_data['Mileage'], mileage_average)
test_data['Mileage'] = pd.to_numeric(test_data['Mileage'])# covert to numeric
print(test_data.shape)

#Process Engine
test_data['Engine'] = test_data['Engine'].str.split(' ').str[0] # extract the numeric part
# calculate the average engine power
average_engine_volumne = int(calculate_average(test_data['Engine']))
print("Average Engine Volume:", average_engine_volumne)
# replace the missing engine volume with the average engine volume
test_data['Engine'] = replace_data(test_data['Engine'], average_engine_volumne)
test_data['Engine'] = pd.to_numeric(test_data['Engine'])# covert to numeric
print(test_data.shape)

# Process Power
test_data['Power'] = test_data['Power'].str.split(' ').str[0] # extract the numeric part
# calculate the average power
average_power = calculate_average(test_data['Power'])
print("Average Power:", average_power)
# replace the missing power with the average power
test_data['Power'] = replace_data(test_data['Power'], average_power)
test_data['Power'] = pd.to_numeric(test_data['Power'])# covert to numeric
print(test_data.shape)

# Process Colour column - Use specific order to match training data
color_types = ['Others', 'Black/Silver', 'White']  # Define order to match training data

# Get current colors in the data and handle missing values
current_colors = test_data['Colour'].unique().tolist()
if '\\N' in current_colors:
    available_colors = [c for c in current_colors if c != '\\N']
    # Assign a random color to missing values
    for i in range(len(test_data['Colour'])):
        if test_data['Colour'].iloc[i] == '\\N':
            test_data['Colour'].iloc[i] = np.random.choice(available_colors)

# Encode each color type in the specified order
for color in color_types: 
    test_data[f'Colour_{color}'] = test_data['Colour'].apply(lambda x: 1 if x == color else 0)
test_data = test_data.drop(columns=['Colour']) # drop the original Colour column
print(test_data.shape)

# Process the Seats column
# calculate the average seats
average_seats = int(calculate_average(test_data['Seats']))
print("Average Seats:", average_seats)
# replace the missing seats with the average seats
test_data['Seats'] = replace_data(test_data['Seats'], average_seats)
test_data['Seats'] = pd.to_numeric(test_data['Seats'])# covert to numeric
print(test_data.shape)

# Process No. of Doors column
#calculate the average no. of doors
average_doors = int(calculate_average(test_data['No. of Doors']))
print("Average No. of Doors:", average_doors)
# replace the missing no. of doors with the average no. of doors
test_data['No. of Doors'] = replace_data(test_data['No. of Doors'], average_doors)
test_data['No. of Doors'] = pd.to_numeric(test_data['No. of Doors'])# covert to numeric
print(test_data.shape)
print(test_data.columns) # print to check


(1491, 15)
(1491, 12)
(1491, 12)
Average Kilometers_Driven: 57289
(1491, 12)
['Diesel', 'Petrol', 'CNG', 'LPG']
(1491, 15)
(1491, 16)
(1491, 19)
Average Mileage: 18.104346076458743
(1491, 19)
Average Engine Volume: 1635
(1491, 19)
Average Power: 114.26195622435026
(1491, 19)
(1491, 21)
Average Seats: 5
(1491, 21)
Average No. of Doors: 4
(1491, 21)
Index(['ID', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats',
       'No. of Doors', 'Age', 'Fuel_Type_Diesel', 'Fuel_Type_Petrol',
       'Fuel_Type_CNG', 'Fuel_Type_LPG', 'Transmission_Manual',
       'Transmission_Automatic', 'Owner_Type_First', 'Owner_Type_Second',
       'Owner_Type_Third', 'Owner_Type_Fourth & Above', 'Colour_Others',
       'Colour_Black/Silver', 'Colour_White'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[i] = str(value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[i] = str(value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[i] = str(value)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behav

In [50]:
# we save processed test_data to a csv file
test_data.to_csv('test_processed.csv', index=False)