# Main Libraries

In [1]:
# Reading Data & Visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

# Preprocessing and Train_Test & Hyper Parameter 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

# Building model
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

# XGBoost
import xgboost as xgb

import tensorflow as tf

### Reading Data

In [2]:
# Load the vehicle dataset from a local CSV file and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path — the notebook will
# only run where this exact file location exists.
csv_path = r"D:\Courses language programming\5_Machine Learning\Dataset For Machine Learning\Vehicle_DataSet\car data.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [3]:
# Summarize the DataFrame: number of rows, column names, non-null counts, dtypes and memory usage

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [4]:
# Count the missing (None / NaN) entries in each column

data.isna().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

#### 1 ==> The data has no missing values

In [5]:
# Remove the Car_Name column from the dataset.
# The result is reassigned instead of using inplace=True, and errors="ignore"
# is passed, so re-running this cell is a no-op rather than a KeyError once the
# column has already been dropped. `columns=` alone selects the axis, so the
# redundant `axis=1` is omitted.
data = data.drop(columns=["Car_Name"], errors="ignore")

data.head(10)

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0
5,2018,9.25,9.83,2071,Diesel,Dealer,Manual,0
6,2015,6.75,8.12,18796,Petrol,Dealer,Manual,0
7,2015,6.5,8.61,33429,Diesel,Dealer,Manual,0
8,2016,8.75,8.89,20273,Diesel,Dealer,Manual,0
9,2015,7.45,8.92,42367,Diesel,Dealer,Manual,0


### Encode the categorical columns ==> [Seller_Type, Fuel_Type, Transmission]

In [6]:
# Print how often each category occurs in every text-valued column
for column_name in ("Transmission", "Fuel_Type", "Seller_Type"):
    print(data[column_name].value_counts())

Transmission
Manual       261
Automatic     40
Name: count, dtype: int64
Fuel_Type
Petrol    239
Diesel     60
CNG         2
Name: count, dtype: int64
Seller_Type
Dealer        195
Individual    106
Name: count, dtype: int64


In [7]:
# Replace the text categories with integer codes.
columns = ["Fuel_Type", "Seller_Type", "Transmission"]

# A single LabelEncoder refit on every column only remembers the classes of the
# last column it saw. Keeping one fitted encoder per column preserves each
# category -> code mapping (via `.classes_` / `.inverse_transform`) so the
# codes can be interpreted or applied to new data later.
label_encoders = {}

for col in columns:
    La = LabelEncoder()
    data[col] = La.fit_transform(data[col])
    label_encoders[col] = La

data.head(10)

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,2014,3.35,5.59,27000,2,0,1,0
1,2013,4.75,9.54,43000,1,0,1,0
2,2017,7.25,9.85,6900,2,0,1,0
3,2011,2.85,4.15,5200,2,0,1,0
4,2014,4.6,6.87,42450,1,0,1,0
5,2018,9.25,9.83,2071,1,0,1,0
6,2015,6.75,8.12,18796,2,0,1,0
7,2015,6.5,8.61,33429,1,0,1,0
8,2016,8.75,8.89,20273,1,0,1,0
9,2015,7.45,8.92,42367,1,0,1,0


# Splitting Data

In [8]:
# Separate the regression target from the remaining (feature) columns
target_column = "Selling_Price"

Y = data[target_column]
X = data.drop(columns=[target_column])

print(X.shape, Y.shape)

(301, 7) (301,)


In [9]:
# Use 70% of the rows for training and keep the rest for evaluation.
# The fixed random_state makes the shuffle identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    train_size=0.7,
    random_state=42,
)

# Building Model

## Model 1 ==> Linear Regression

In [10]:
# Fit a linear regression on the training split (fit() returns the estimator itself)
lin_reg = LinearRegression().fit(x_train, y_train)

# R^2 measured on the same rows the model was trained on
Train_prediction = lin_reg.predict(x_train)
TrainScore_prediction = r2_score(y_true=y_train, y_pred=Train_prediction)

print(f"The r2_score Prediction Of Train is {TrainScore_prediction}")

The r2_score Prediction Of Train is 0.8761249192923499


In [11]:
# R^2 of the linear model on the held-out test split
Test_prediction = lin_reg.predict(x_test)
TestScore_prediction = r2_score(y_true=y_test, y_pred=Test_prediction)

print(f"The r2_score Prediction Of Test is {TestScore_prediction}")

The r2_score Prediction Of Test is 0.8773175030337514


## Model 2 ==> Random Forest

In [12]:
# Random forest with 50 trees.
# random_state is fixed so the bootstrap samples and feature choices are the
# same on every run; without it the reported scores change between executions,
# unlike the seeded train/test split and the other seeded models in this notebook.
RF_reg = RandomForestRegressor(n_estimators=50, max_depth=100, random_state=42)
RF_reg.fit(x_train, y_train)

# R^2 measured on the same rows the forest was trained on
Train_prediction_RF = RF_reg.predict(x_train)
TrainScore_prediction_RF = r2_score(y_train, Train_prediction_RF)

print(f"The r2_score Prediction Of Train is {TrainScore_prediction_RF}")

The r2_score Prediction Of Train is 0.9829253703656697


In [13]:
# R^2 of the random forest on the held-out test split
Test_prediction_RF = RF_reg.predict(x_test)
TestScore_prediction_RF = r2_score(y_true=y_test, y_pred=Test_prediction_RF)

print(f"The r2_score Prediction Of Test is {TestScore_prediction_RF}")

The r2_score Prediction Of Test is 0.9574011095785784


## Model 3 ==> AdaBoostRegressor

In [14]:
# AdaBoost over decision-tree regressors.
# The ensemble resamples the training data with its own random generator, so
# seeding only the base tree does not make the run reproducible; random_state
# is therefore also set on the AdaBoostRegressor itself.
Adaboost_reg = AdaBoostRegressor(estimator=DecisionTreeRegressor(max_depth=100, 
                                                                    min_samples_split=5,
                                                                    min_samples_leaf=6, 
                                                                    random_state=42),
                              n_estimators=100,
                              learning_rate=0.2,
                              random_state=42)


Adaboost_reg.fit(x_train, y_train)

# .score() returns the R^2 of the predictions for a regressor
print(f"The predict Score Train is ==> {Adaboost_reg.score(x_train, y_train)}")
print("%----------------------------------------------------------%")
print(f"The predict Score Test is ==> {Adaboost_reg.score(x_test, y_test)}")

The predict Score Train is ==> 0.9936839756501196
%----------------------------------------------------------%
The predict Score Test is ==> 0.9657657254622236


## Model 4 ==> XGBoost

In [15]:
# Gradient-boosted trees.
# The previous `missing=0.5` argument told XGBoost to treat every feature value
# equal to 0.5 as a missing entry. The dataset has no missing values, so that
# sentinel could only hide legitimate 0.5 values; the default (NaN) is used instead.
model_xgb = xgb.XGBRegressor(n_estimators=100, max_depth=20,
                             learning_rate=0.2,
                             min_child_weight=0.1, random_state=42)

model_xgb.fit(x_train, y_train)

# .score() returns the R^2 of the predictions for a regressor
print(f"The predict Score Train is ==> {model_xgb.score(x_train, y_train)}")
print("%----------------------------------------------------------%")
print(f"The predict Score Test is ==> {model_xgb.score(x_test, y_test)}")

The predict Score Train is ==> 0.999999974026856
%----------------------------------------------------------%
The predict Score Test is ==> 0.9616384501433007


In [16]:
# Disabled experiment: a small Keras Sequential network with a single output unit.
# Everything below is commented out, so nothing in this cell is executed.
# NOTE(review): metrics=["accuracy"] looks like a classification metric; for the
# continuous Selling_Price target an error metric is presumably intended —
# confirm before re-enabling this cell.
# model = tf.keras.models.Sequential([
                
#                 tf.keras.layers.Flatten(),
#                 tf.keras.layers.Dense(64, activation="linear"),
#                 tf.keras.layers.Dense(64, activation="linear"),
#                 tf.keras.layers.Dense(1)
                
#             ])

# model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.01), 
#               loss=tf.keras.losses.mean_absolute_error,
#               metrics=["accuracy"])

# model.fit(x_train, y_train, epochs=30)