# CAR PRICE PREDICTION

Import Library

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import xticks
from sklearn.metrics import mean_squared_error, mean_absolute_error
# suppress warnings
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

Membaca File CSV

In [None]:
# Load the raw listings from the CSV file into a DataFrame.
car=pd.read_csv('quikr_car.csv')

# ===================| PRE-PROCESSING |=======================

## Sebelum Pre-Processing

In [None]:
# Display the DataFrame as loaded, before any cleaning step has run.
car

In [None]:
# Summary statistics of the raw data.
car.describe()

In [None]:
# Column dtypes and non-null counts of the raw data.
car.info()

In [None]:
# (rows, columns) of the raw data.
car.shape

## PRE-PROCESSING DATA

### CLEANING DATA

In [None]:
# Keep an independent copy of the untouched data so cleaning can be redone.
backup = car.copy(deep=True)

### Menghapus Duplicated

In [None]:
# Number of rows that are exact repeats of an earlier row.
car.duplicated().sum()

In [None]:
# Keep the first occurrence of every row and discard its exact repeats.
car = car.loc[~car.duplicated()]

### Menghapus Data yang kosong (Handling Missing Value)

In [None]:
# Drop every row that has a missing value in any column.
car = car.dropna(axis=0, how='any')

### Menghapus nilai year yang bukan angka

In [None]:
# Keep only rows whose `year` text consists solely of digits.
year_is_numeric = car['year'].str.isnumeric()
car = car.loc[year_is_numeric]

### Mengganti Dtype year dari object ke int

In [None]:
# Convert the (now purely numeric) year strings to integers.
car = car.assign(year=car['year'].astype(int))

### Menghapus Ask For Price Di Kolom Price

In [None]:
# Remove listings whose price is the placeholder text instead of a number.
has_price = car['Price'].ne("Ask For Price")
car = car.loc[has_price]

### Menghapus tanda koma di kolom price dan mengganti Dtype dari object ke int

In [None]:
# Strip the thousands separators from Price, then convert it to integers.
price_digits = car['Price'].str.replace(',', '', regex=False)
car = car.assign(Price=price_digits.astype(int))

### Mengambil angka di kolom kms_driven (bagian sebelum spasi) dan menghapus tanda koma

In [None]:
# Keep the part of kms_driven before the first space and strip its commas.
# The column is still text after this step.
kms_first_token = car['kms_driven'].str.split(' ').str[0]
car = car.assign(kms_driven=kms_first_token.str.replace(',', '', regex=False))

### Menghapus nilai kms_driven yang bukan angka

In [None]:
# Keep only rows whose cleaned kms_driven text consists solely of digits.
kms_is_numeric = car['kms_driven'].str.isnumeric()
car = car.loc[kms_is_numeric]

### Mengganti Dtype kms_driven dari object ke int

In [None]:
# Convert the digit-only kms_driven strings to integers.
car = car.assign(kms_driven=car['kms_driven'].astype(int))

### Menghapus baris dengan fuel_type yang kosong

In [None]:
# Keep only rows that have a fuel_type value.
car = car.loc[car['fuel_type'].notna()]

### Mengambil 3 kata pertama di kolom name

In [None]:
# Reduce each name to its first three space-separated words.
name_words = car['name'].str.split(' ')
car = car.assign(name=name_words.str[:3].str.join(' '))

### Menghapus harga ekstrem (Price >= 6.000.000) dan setel ulang index

In [None]:
# Keep only listings priced below 6e6, then renumber the rows from 0.
below_price_cap = car['Price'].lt(6e6)
car = car.loc[below_price_cap].reset_index(drop=True)

In [None]:
# Renumber the rows from 0 and discard the old index. The preceding price
# filter already resets the index, so this leaves the frame unchanged.
car=car.reset_index(drop=True)

# Setelah Pre-Processing

In [None]:
# Display the DataFrame after all cleaning steps.
car

In [None]:
# Summary statistics of the cleaned data.
car.describe()

In [None]:
# Column dtypes and non-null counts of the cleaned data.
car.info()

In [None]:
# (rows, columns) of the cleaned data.
car.shape

#  =======| Visualisasikan Data Setelah Pre-Processing |=======

In [None]:
# Subset of columns used for the overview plots (everything except `name`).
plot_columns = ['company', 'year', 'Price', 'kms_driven', 'fuel_type']
kategori = car.loc[:, plot_columns]
kategori

### Visualisasi numeric variable

In [None]:
# pairplot builds its own figure, so a preceding plt.figure(...) only opens a
# second, empty figure. Size the grid through `height` (inches per facet)
# instead: three numeric columns at height=5 give roughly the intended 15x15.
sns.pairplot(kategori, height=5)
plt.show()

### check relasi antara company dengan price

In [None]:
# Price distribution per company; x labels are slanted so they do not overlap.
fig, ax = plt.subplots(figsize=(15, 7))
sns.boxplot(data=car, x='company', y='Price', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=40, ha='right')
plt.show()

### check relasi antara year dengan price

In [None]:
# One point per listing, grouped by year, to show how price varies with age.
fig, ax = plt.subplots(figsize=(20, 10))
sns.swarmplot(data=car, x='year', y='Price', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=40, ha='right')
plt.show()

### check relasi antara kms_driven dengan price

In [None]:
# Scatter of price against kilometres driven.
sns.relplot(
    data=car,
    x='kms_driven',
    y='Price',
    kind='scatter',
    height=7,
    aspect=1.5,
)

### check relasi antara fuel type dengan price

In [None]:
# Price distribution per fuel type.
_, fuel_ax = plt.subplots(figsize=(14, 7))
sns.boxplot(data=car, x='fuel_type', y='Price', ax=fuel_ax)

### Check Relasi Antara company dan fuel type dengan count

In [None]:
# Number of listings per company, split by fuel type.
plt.subplots(figsize=(15,7))
# Pass the column as `x=`: current seaborn accepts plot variables by keyword
# only, so a positional 'company' would collide with `data=`.
ax=sns.countplot(x='company',data=car,hue='fuel_type')
ax.set_xticklabels(ax.get_xticklabels(),rotation=40,ha='right')
plt.show()

### check relasi antara harga dengan fuel type, year and company

In [None]:
# Price per company, coloured by fuel type and sized by year.
# Despite its name, `ax` holds the grid object returned by relplot, whose
# set_xticklabels accepts the rotation settings directly.
ax = sns.relplot(
    data=car,
    x='company',
    y='Price',
    hue='fuel_type',
    size='year',
    height=7,
    aspect=2,
)
ax.set_xticklabels(rotation=40, ha='right')

# =================| ML ( Multiple Linear Regression) |======================

### Model

In [None]:
# Fill any remaining gaps in numeric columns with that column's mean.
# numeric_only=True is required because `car` also holds text columns
# (name, company, fuel_type); averaging those raises on current pandas.
numeric_means = car.mean(numeric_only=True)
car = car.fillna(numeric_means)

# Features are every column except the target; the target is Price.
X = car.drop(columns=['Price'])
y = car['Price']

In [None]:
from sklearn.model_selection import train_test_split
# Preview of an 80/20 split with a fixed seed; the result is only displayed,
# not assigned to any variable.
train_test_split(X,y,test_size=0.2,random_state=0)

In [None]:
# 80/20 train/test split with a fixed seed for reproducibility.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
x_train, x_test, y_train, y_test = split_parts

### Linear Regression

Label = Price

Features = Name, Company, Year, Kms_driven, fuel_type

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [None]:
# Fit an encoder on the full feature set so it learns every category present
# in the data, not only those that land in one particular training split.
categorical_columns = ['name', 'company', 'fuel_type']
ohe = OneHotEncoder()
ohe.fit(X.loc[:, categorical_columns])

#mengubah data kategorikal menjadi data numerik (one-hot encoding)

In [None]:
# Categories learned by the fitted encoder, one array per encoded column.
ohe.categories_

In [None]:
# One-hot encode the three text columns using the categories learned from the
# full data; the remaining columns are passed through unchanged.
category_encoder = OneHotEncoder(categories=ohe.categories_)
column_trans = make_column_transformer(
    (category_encoder, ['name', 'company', 'fuel_type']),
    remainder='passthrough',
)

In [None]:
# Ordinary least-squares regressor with default settings.
lr=LinearRegression()

In [None]:
# Chain the column transformer and the regressor so raw rows can be fed directly.
pipe=make_pipeline(column_trans,lr)

In [None]:
# Fit the whole pipeline on the training split.
pipe.fit(x_train,y_train)

In [None]:
# Predicted prices for the held-out test split.
y_pred=pipe.predict(x_test)

In [None]:
# Measure how the test R^2 varies with the train/test split by refitting on
# 1000 different seeds; scores[i] is the test R^2 obtained with random_state=i.
# NOTE: the loop rebinds x_train/x_test/y_train/y_test, lr, pipe and y_pred, so
# once it finishes they describe the last split tried (random_state=999).
scores = []
for i in range(1000):
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=i
    )
    lr = LinearRegression()
    pipe = make_pipeline(column_trans, lr)
    pipe.fit(x_train, y_train)
    y_pred = pipe.predict(x_test)
    # pipe.score(x_test, y_test) would predict on x_test a second time;
    # r2_score on the predictions already computed gives the same R^2.
    scores.append(r2_score(y_test, y_pred))

In [None]:
# scores[i] was computed with random_state=i, so the position of the best
# score is also the seed of the split that produced it.
best_state = int(np.argmax(scores))

# Rebuild that split and refit, so that the split variables, `pipe` and
# `y_pred` used by the following cells belong to the same split as the
# reported score. The search loop leaves them at the last split it tried.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=best_state
)
lr = LinearRegression()
pipe = make_pipeline(column_trans, lr)
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)

# NOTE(review): the best of many splits is an optimistic figure; the mean of
# the per-split scores would be a fairer estimate of test performance.
scores = scores[best_state]

# =====| Visualisasi Data Setelah Proses ML (Linear Regression) |=======

In [None]:
# Actual vs. predicted prices for the current test split.
fig, scatter_ax = plt.subplots()
scatter_ax.scatter(y_test, y_pred)
fig.suptitle('y_test vs y_pred', fontsize=20)
scatter_ax.set_xlabel('y_test', fontsize=18)
scatter_ax.set_ylabel('y_pred', fontsize=16)

# ====================| Training Accuracy  |=======================

In [None]:
# Score of the current pipeline on its own training rows. For a regressor
# this is R^2, not a classification accuracy.
train_r2 = pipe.score(x_train, y_train)
print('Training Accuracy :', train_r2)

# ====================| Testing  Accuracy |=======================

In [None]:
# `scores` holds the single best test score selected from the split search.
print('Testing accuracy :',scores)

##Koefisien##

# ====================| Testing Prediction |=======================

In [None]:
# Read one car description from the user. The text values must match
# categories the encoder was fitted on, so stray whitespace is stripped.
companyy = input("company : ").strip()
modell = input("model : ").strip()
yearr = int(input("Tahun : "))
kms_drivenn = int(input("kms : "))
fuel_typee = input("Fuel : ").strip()

# Build the single-row frame column by column: packing the mixed values into
# one np.array would coerce year and kms_driven to strings.
query = pd.DataFrame({
    'name': [modell],
    'company': [companyy],
    'year': [yearr],
    'kms_driven': [kms_drivenn],
    'fuel_type': [fuel_typee],
})
pipe.predict(query)

company : Honda
