In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re


from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble  import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRFRegressor

## Loading the Data

In [None]:
# NOTE(review): absolute Google Drive location under /content/drive — presumably
# requires Drive to be mounted before this runs (e.g. on Colab); confirm, or point
# this at a local copy of the CSV.
path = "/content/drive/MyDrive/Data Science/Project-31 Build Car Prices Prediction App/Car details.csv"

In [None]:
df = pd.read_csv(path)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
# checking null values
df.isna().sum()

In [None]:
# Drop every row that has at least one missing value (mutates df in place).
df.dropna(inplace=True)

## Feature Engineering

#### Torque

In [None]:
df['torque'].value_counts()

In [None]:
def clean_turque(x):
  """Extract the engine speed (RPM) from a raw torque description.

  The RPM is taken as the last (up to) four ASCII digits of the string, so
  thousands separators and ranges work: '12.7@ 2,700(kgm@ rpm)' -> 2700 and
  '250Nm@ 1500-2500rpm' -> 2500 (upper end of the range).

  A '+/-500'-style tolerance is removed first; otherwise its digits would be
  mistaken for the RPM ('110Nm@ 4800+/-500rpm' must give 4800, not 500).

  Parameters
  ----------
  x : str
      Raw value of the 'torque' column.

  Returns
  -------
  numpy.int32
      The parsed RPM.

  Raises
  ------
  ValueError
      If the string contains no digits at all.

  NOTE(review): a string with no separate RPM figure (e.g. '400Nm') still
  returns the trailing digits of the torque value itself — confirm whether
  such rows exist in the data.
  """
  # Remove the tolerance suffix so only the nominal RPM digits remain.
  text = re.sub(r'\+/-\s*[0-9,]+', '', x)

  # ASCII digits only: str.isnumeric() also accepts characters such as '²'
  # or '½' that int() cannot parse.
  digits = [ch for ch in text if ch in '0123456789']
  if not digits:
    raise ValueError(f"no digits found in torque string: {x!r}")

  return np.int32(int(''.join(digits[-4:])))

In [None]:
df['RPM'] = df['torque'].apply(clean_turque).astype('int32')

In [None]:
def get_torque(x):
  """Extract the torque figure from a raw torque string, in whole Nm.

  The figure is the first number before the first '@', 'at', '(' or '/'.
  Its unit is resolved in this order (case-insensitive):

  1. a unit written next to the figure ('190Nm', '22.4 kgm');
  2. a unit legend elsewhere in the string ('12.7@ 2,700(kgm@ rpm)',
     '48@ 3,000+/-500(NM@ rpm)');
  3. no unit anywhere: values below 100 are assumed to be kgm, larger ones Nm.

  kgm values are converted to Nm using the full decimal value, then the
  result is truncated to an int.

  Parameters
  ----------
  x : str
      Raw value of the 'torque' column.

  Returns
  -------
  int
      Torque in Nm, truncated towards zero.

  Raises
  ------
  ValueError
      If no number precedes the first separator.
  """
  kgm_to_nm = 9.80665  # 1 kgf·m in N·m (standard gravity)

  lowered = x.lower()
  # Everything before the first separator holds the torque figure.
  head = re.split(r'@|at|\(|/', lowered, maxsplit=1)[0]

  number = re.search(r'[0-9]+(?:\.[0-9]+)?', head.replace(',', ''))
  if number is None:
    raise ValueError(f"no torque figure found in: {x!r}")
  value = float(number.group())

  # Look for a unit next to the figure first, then anywhere in the string.
  unit = None
  for scope in (head, lowered):
    if 'kgm' in scope:
      unit = 'kgm'
      break
    if 'nm' in scope:
      unit = 'nm'
      break

  if unit is None:
    # Heuristic for unit-less entries: car torque in Nm has three digits,
    # so anything smaller is taken to be kgm.
    unit = 'kgm' if value < 100 else 'nm'

  if unit == 'kgm':
    return int(value * kgm_to_nm)
  return int(value)
 

In [None]:
df['Torque'] = df['torque'].apply(get_torque)
df.drop('torque', axis=1, inplace=True)
df.head()

## Seller_type, transmission, fuel, owner

In [None]:
df['seller_type'].value_counts()

In [None]:
df['fuel'].value_counts()

In [None]:
df['transmission'].value_counts()

In [None]:
# Encode ownership history as a small integer code: First -> 0, Second -> 1,
# while Third, Fourth & Above and Test Drive Car all share code 2.
# Series.map turns any label missing from this dict into NaN.
df['owner'] = df['owner'].map({'First Owner':0, 'Second Owner':1, 'Third Owner':2, 'Fourth & Above Owner':2,'Test Drive Car':2})
df['owner'].value_counts()

In [None]:
# One-hot encode the owner codes. No drop_first here, so one indicator column
# is produced for every code present in the data.
df = pd.get_dummies(data=df, columns=['owner'])
df.head()

In [None]:
# One-hot encode the remaining categorical columns; drop_first=True omits the
# indicator for the first level of each column.
df = pd.get_dummies(data=df, columns=['seller_type', 'transmission', 'fuel'], drop_first=True)
df.head()

### Mileage

In [None]:
df['mileage'].value_counts()

In [None]:
def mileage_clean(x):
  """Return the numeric part of a mileage string as a float.

  Only the text before the first space is parsed, e.g. '23.4 kmpl' -> 23.4.
  """
  number_text, _, _ = x.partition(' ')
  return float(number_text)

In [None]:
df['mileage'] = df['mileage'].apply(mileage_clean)

### Engine

In [None]:
df['engine'].value_counts()

In [None]:
# Parse the text before the first space as an int — assumes every entry looks
# like '<number> <unit>'; TODO confirm against the value counts shown earlier.
df['engine'] = df['engine'].apply(lambda x: int(x.split(' ')[0]))

### max_power

In [None]:
df['max_power'].value_counts()

In [None]:
# Parse the text before the first space as a float — assumes every entry starts
# with a number; an entry with an empty or non-numeric first token would raise
# ValueError here. TODO confirm against the value counts shown earlier.
df['max_power'] = df['max_power'].apply(lambda x: np.float64(x.split(' ')[0]))

In [None]:
df.head()

In [None]:
# Correlation heatmap of the engineered features.
# 'name' is still a free-text column at this point: pandas >= 2.0 no longer skips
# non-numeric columns in DataFrame.corr() and raises instead, so drop it explicitly.
# On older pandas the result is the same as before, since 'name' was silently ignored.
plt.figure(figsize=(14, 10), dpi=200)
sns.heatmap(df.drop(columns=['name'], errors='ignore').corr(), annot=True, cmap='viridis', linewidths=0.5)
plt.show()

In [None]:
df.info()

In [None]:
# Split the frame into independent variables (x) and the dependent variable (y).
# 'name' is excluded from the features; y is taken as a NumPy array.
x = df.drop(["name", 'selling_price'], axis=1)
y = df['selling_price'].values

In [None]:
# y = np.log(y)

In [None]:
# Split the unscaled data into training and testing sets: 20% of the rows are
# held out for testing, with a fixed random_state so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=101)

In [None]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

## Building the models

In [None]:
models = [LinearRegression, RandomForestRegressor, DecisionTreeRegressor]

In [None]:
def get_score(model):
  """Fit a fresh estimator on the unscaled training split and print its scores.

  Reads the notebook-level x_train, y_train, x_test and y_test.

  Parameters
  ----------
  model : class
      Estimator class (not an instance); it is constructed with default
      arguments.

  Returns
  -------
  The fitted estimator instance.
  """
  model = model()

  # Seed estimators that expose random_state (random forest, decision tree) so
  # the printed scores are reproducible; others (e.g. LinearRegression) are
  # left untouched.
  if hasattr(model, 'get_params') and 'random_state' in model.get_params():
    model.set_params(random_state=101)

  model.fit(x_train, y_train)

  print(type(model).__name__)
  print('The training Dataset Score is', model.score(x_train, y_train))
  print('The testing Dataset Score is', model.score(x_test, y_test))
  return model

In [None]:
# Fit every candidate class on the unscaled split and keep the fitted
# estimators, keyed by class name.
all_models = {}

for i in models:
  model = get_score(i)
  all_models[type(model).__name__] = model

## After scaling the data with MinMaxScaler

In [None]:
# Fit the scaler on the training rows only, so the min/max of the held-out rows
# do not leak into the features the models are trained on. x_train comes from the
# earlier split; the next split reuses the same test_size and random_state, so it
# selects the same rows.
scaler = MinMaxScaler().fit(x_train)

# Transform a fresh copy of the feature frame rather than the current `x`: this
# cell rebinds `x` to the scaled array, and re-running it must not scale twice.
x = scaler.transform(df.drop(["name", 'selling_price'], axis=1))

In [None]:
# Split the scaled features into training and testing sets, using the same
# test_size and random_state as the unscaled split.
x_train_, x_test_, y_train_, y_test_ = train_test_split(x, y, test_size=0.2, random_state=101)

In [None]:
def get_score_2(model):
  """Fit a fresh estimator on the scaled training split and print its scores.

  Reads the notebook-level x_train_, y_train_, x_test_ and y_test_.

  Parameters
  ----------
  model : class
      Estimator class (not an instance); it is constructed with default
      arguments.

  Returns
  -------
  The fitted estimator instance.
  """
  model = model()

  # Seed estimators that expose random_state (random forest, decision tree) so
  # the printed scores are reproducible; others (e.g. LinearRegression) are
  # left untouched.
  if hasattr(model, 'get_params') and 'random_state' in model.get_params():
    model.set_params(random_state=101)

  model.fit(x_train_, y_train_)

  print(type(model).__name__)
  print('The training Dataset Score is', model.score(x_train_, y_train_))
  print('The testing Dataset Score is', model.score(x_test_, y_test_))
  return model

In [None]:
# Fit every candidate class on the scaled split and keep the fitted
# estimators, keyed by class name.
all_models_2 = {}

for i in models:
  model = get_score_2(i)
  all_models_2[type(model).__name__] = model

In [None]:
# Pick the random forest from all_models, i.e. the one fitted on the unscaled split.
model = all_models['RandomForestRegressor']

In [None]:
# x_test is the unscaled test split, matching the features this model was fitted on.
prediction = model.predict(x_test)

In [None]:
metrics.r2_score(y_test, prediction)

In [None]:
metrics.mean_absolute_error(y_test, prediction)

In [None]:
metrics.mean_squared_error(y_test, prediction)

In [None]:
y_test[10], prediction[10]