In [2]:
import pandas as pd
from datetime import datetime

# Load dataset
data = pd.read_csv('train.csv')

# a) Handling Missing Values and empty strings in all the columns
missing_values = data.isnull().sum()

# Impute missing values and handle empty strings for numerical columns
numerical_columns = ['Year', 'Mileage', 'Engine', 'Power', 'New_Price']
categorical_columns = ['Fuel_Type', 'Transmission']

for column in numerical_columns:
    # Identify the  handle non-numeric characters by changing them with NaN
    data[column] = pd.to_numeric(data[column], errors='coerce')
    data[column].fillna(data[column].mean(), inplace=True)

for column in categorical_columns:
    data[column].fillna(data[column].mode()[0], inplace=True)

# b) Removing Units, No need to clear the units again, we have already completed it while handling missing values.

# c) Change the categorical variables (“Fuel_Type” and “Transmission”) into numerical one hot/Encoding for Categorical Variables
data = pd.get_dummies(data, columns=['Fuel_Type', 'Transmission'], prefix=['Fuel_Type', 'Transmission'])

# d) Create one more New Feature - Current Age of the Car
current_year = datetime.now().year
data['Current_Age'] = current_year - data['Year']

# Display the first few rows of the changed dataset values.
print(data.head())

# Store the processed dataset.
data.to_csv('processed_train.csv', index=False)


   Unnamed: 0                              Name    Location  Year  \
0           1  Hyundai Creta 1.6 CRDi SX Option        Pune  2015   
1           2                      Honda Jazz V     Chennai  2011   
2           3                 Maruti Ertiga VDI     Chennai  2012   
3           4   Audi A4 New 2.0 TDI Multitronic  Coimbatore  2013   
4           6            Nissan Micra Diesel XV      Jaipur  2013   

   Kilometers_Driven Owner_Type  Mileage  Engine  Power  Seats  New_Price  \
0              41000      First      NaN     NaN    NaN    5.0        NaN   
1              46000      First      NaN     NaN    NaN    5.0        NaN   
2              87000      First      NaN     NaN    NaN    7.0        NaN   
3              40670     Second      NaN     NaN    NaN    5.0        NaN   
4              86999      First      NaN     NaN    NaN    5.0        NaN   

   Price  Fuel_Type_Diesel  Fuel_Type_Electric  Fuel_Type_Petrol  \
0  12.50                 1                   0        