<a href="https://colab.research.google.com/github/Numanur/data-science-ml/blob/main/Data_preprocessing_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# **1. Make sure no missing data**

In [65]:
car = pd.read_csv("/content/drive/MyDrive/Colab Datasets/car-sales-extended-missing-data.csv")
car.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [61]:
#Check if any value is missing or NaN
print(car.isnull().sum())


Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64


In [62]:
car_price = car.dropna()
car_price

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
994,BMW,Blue,163322.0,3.0,31666.0
995,Toyota,Black,35820.0,4.0,32042.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


# **2. Make sure all are numerical data**

In [71]:
x = car_price.drop("Price", axis=1)
y = car_price["Price"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

In [66]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(x_train, y_train)#here we need to convert the data to numerical

ValueError: could not convert string to float: 'BMW'

#**3. Convert the string to numerical data**

**3.1: Convert the categorical data**

In [68]:
car_price["Doors"].value_counts()

Unnamed: 0_level_0,count
Doors,Unnamed: 1_level_1
4.0,659
5.0,61
3.0,53


So "Door" is also a category here.

In [70]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")
transformed_x = transformer.fit_transform(x)

In [13]:
pd.DataFrame(transformed_x)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
768,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,163322.0
769,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
770,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
771,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


**Now the string values or categories are converted to numerical data**

In [73]:
#now fit into the model
np.random.seed(1)
x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, test_size = 0.2)
model.fit(x_train, y_train)

In [74]:
model.score(x_train, y_train)

0.8933311493436329

In [75]:
model.score(x_test, y_test)

0.333470988194001

#**Now we will fill the missing data**

In [77]:
car_missing_data = pd.read_csv("/content/drive/MyDrive/Colab Datasets/car-sales-extended-missing-data.csv")
car_missing_data.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


**Check how many data are missing**

In [78]:
car_missing_data.isna().sum()

Unnamed: 0,0
Make,49
Colour,50
Odometer (KM),50
Doors,50
Price,50


**Option 1: FIlling missing values with Pandas**

In [79]:
car_missing_data["Make"].fillna("missing", inplace=True)
car_missing_data["Colour"].fillna("missing", inplace=True)
car_missing_data["Odometer (KM)"].fillna(car_missing_data["Odometer (KM)"].mean(), inplace=True)
car_missing_data["Doors"].fillna(4, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  car_missing_data["Make"].fillna("missing", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  car_missing_data["Colour"].fillna("missing", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which

In [80]:
car_missing_data.isna().sum()

Unnamed: 0,0
Make,0
Colour,0
Odometer (KM),0
Doors,0
Price,50


In [81]:
car_missing_data.dropna(inplace=True)

In [82]:
car_missing_data.isna().sum()

Unnamed: 0,0
Make,0
Colour,0
Odometer (KM),0
Doors,0
Price,0


**So my dataset has no missing values**

#**Encode data to numerical**

In [84]:
x = car_missing_data.drop("Price", axis=1)
y = car_missing_data["Price"]

In [85]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")
transformed_x = transformer.fit_transform(car_missing_data)

In [86]:
pd.DataFrame(transformed_x)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,35431.0,15323.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0,19943.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,84714.0,28343.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,154365.0,13434.0
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0,14043.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
945,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0,32042.0
946,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,155144.0,5716.0
947,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0,31570.0
948,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,215883.0,4001.0


#**Split Data**

In [87]:
np.random.seed(4)
x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.3)

In [88]:
model.fit(x_train, y_train)

In [89]:
model.score(x_train, y_train)

0.9999186918076239

In [90]:
model.score(x_test, y_test)

0.9990565618587859

In [91]:
y_pred = model.predict(x_test)

In [92]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
# Metrics
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Print results
print(f"R² Score: {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

R² Score: 0.9991
Mean Absolute Error (MAE): 60.9654
Mean Squared Error (MSE): 71704.8060
Root Mean Squared Error (RMSE): 267.7775


#**Option 2: Missing value handling with Scikit-Learn**

#1. Import csv data

In [93]:
car_missing_data = pd.read_csv("/content/drive/MyDrive/Colab Datasets/car-sales-extended-missing-data.csv")
car_missing_data.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


#2. Check if Null or missing values

In [94]:
car_missing_data.isna().sum()

Unnamed: 0,0
Make,49
Colour,50
Odometer (KM),50
Doors,50
Price,50


#3. Remove Price missing rows

In [95]:
car_missing_data.dropna(subset=["Price"], inplace=True)
car_missing_data.isna().sum()

Unnamed: 0,0
Make,47
Colour,46
Odometer (KM),48
Doors,47
Price,0


#4. Split into X and Y

In [97]:
x = car_missing_data.drop("Price", axis=1)#here we have missing data
y = car_missing_data["Price"]

#5. Fill the value with Scikit-Learn

In [100]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
#Fill the categorical data with missing and numerical with mean
cat_imputer= SimpleImputer(strategy = "constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

#Define the columns
cat_features = ["Make", "Colour"]
door_feature = ["Doors"]
num_feature = ["Odometer (KM)"]

#Create the imputer
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_feature", door_imputer, door_feature),
    ("num_imputer", num_imputer, num_feature)

])

#Transform the X data
filled_x = imputer.fit_transform(x)


#6. Convert this data to Dataframe

In [103]:
car_sales_filled = pd.DataFrame(filled_x, columns=["Make", "Colour", "Doors", "Odometer (KM)"])
car_sales_filled.head()

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Honda,White,4.0,35431.0
1,BMW,Blue,5.0,192714.0
2,Honda,White,4.0,84714.0
3,Toyota,White,4.0,154365.0
4,Nissan,Blue,3.0,181577.0


#7. Now Encode this dataset into proper representation

In [105]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")
transformed_x = transformer.fit_transform(car_sales_filled)


#8. Now fit this to a model

In [140]:
np.random.seed(9)
x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.3)
model = RandomForestRegressor()
model.fit(x_train, y_train)

#9. Check the score

In [123]:
model.score(x_train, y_train)

0.8777294925125443

In [141]:
model.score(x_test, y_test)

0.2969926175335985