# Import modules and load data

In [None]:
import datetime as dt
import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
# NOTE(review): this silences every warning for the rest of the session,
# including pandas/scikit-learn deprecation and copy warnings.
warnings.filterwarnings('ignore')

In [None]:
# Location of the raw CSV. Set the CAR_PRICE_CSV environment variable to run
# the notebook on another machine; otherwise the original path is used.
DATA_PATH = os.environ.get(
    'CAR_PRICE_CSV',
    '//Users/riteshkumar/Downloads/ML projects/Car price prediction/car_price_prediction.csv',
)
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

# Exploring the data

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Summary statistics of the columns pandas currently treats as numeric.
df.describe()


In [None]:
# Column dtypes and non-null counts; used to spot columns stored with the wrong type.
df.info()

## Notes:
- The `ID` column is not useful
- `Engine volume` and `Mileage` are stored as `object` instead of a numerical data type
- `Cylinders` can be stored as an integer

In [None]:
# Number of rows that repeat an earlier row exactly.
df.duplicated(keep='first').sum()

In [None]:
# Missing-value count per column.
df.isnull().sum()

In [None]:
# Five most frequent values of every column, gathered in a new DataFrame
# (one column per feature, most frequent value first).
# Building the frame from a dict of Series pads columns that have fewer than
# TOP_N distinct values with NaN, so the result is always a rectangular table
# no matter how many distinct values each column holds.
TOP_N = 5
most_frequent_items = {
    column: pd.Series(df[column].value_counts().head(TOP_N).index.tolist(), dtype=object)
    for column in df.columns
}
most_frequent_items_df = pd.DataFrame(most_frequent_items)
most_frequent_items_df


# Data cleaning


In [None]:
# Remove rows that exactly repeat an earlier row.
df = df.drop_duplicates()

In [None]:
# Drop the `ID` column, which the notes above flag as not useful.
# errors='ignore' keeps the cell safe to re-run once the column is gone.
df = df.drop(columns='ID', errors='ignore')

In [None]:
# Share (in percent of all rows) of the five most common `Levy` values.
df['Levy'].value_counts().head().div(len(df)).mul(100)

30% of the values in this column are `-`.

In [None]:
# Treat the '-' placeholder as a levy of 0, then store the column as float.
df['Levy'] = df['Levy'].replace('-', 0).astype(float)

In [None]:
# Keep the text before the first space and parse it as a float.
# Casting to str first lets the cell be re-run after the column is already
# numeric (the .str accessor would otherwise fail on a float column).
df['Engine volume'] = df['Engine volume'].astype(str).str.split(' ').str[0].astype(float)

In [None]:
# Keep the text before the first space and parse it as a float.
# Casting to str first lets the cell be re-run after the column is already
# numeric (the .str accessor would otherwise fail on a float column).
df['Mileage'] = df['Mileage'].astype(str).str.split(' ').str[0].astype(float)

In [None]:
# Use an explicit 64-bit integer: plain `int` maps to a platform-dependent
# width, and the numerical-column selection further down only accepts
# int64/float64, so a 32-bit result would silently drop this column there.
df['Cylinders'] = df['Cylinders'].astype('int64')

In [None]:
# Replace the production year with the car's age relative to the current year.
now_year = dt.datetime.now().year
# Guarded so the cell can be re-run after 'Prod. year' has already been removed.
if 'Prod. year' in df.columns:
    df['Age'] = now_year - df['Prod. year']
    df = df.drop(columns='Prod. year')

In [None]:
# Split the columns by kind for the outlier filter, the plots and the encoding.
# 'number' matches every numeric dtype, so an integer column is not skipped
# just because it is stored with a width other than 64 bits.
numerical_columns = df.select_dtypes(include='number').columns
categorical_columns = df.select_dtypes(include=['object']).columns

In [None]:
# Remove rows that are outliers in any numerical column (1.5 * IQR rule).
Q1 = df[numerical_columns].quantile(0.25)
Q3 = df[numerical_columns].quantile(0.75)
IQR = Q3 - Q1
lowerBound = Q1 - 1.5 * IQR
upperBound = Q3 + 1.5 * IQR
# True for every row with at least one value outside [lowerBound, upperBound].
is_outlier_row = ((df[numerical_columns] < lowerBound) | (df[numerical_columns] > upperBound)).any(axis=1)
# .copy() makes the filtered frame independent, so later column assignments
# do not raise SettingWithCopyWarning (hidden here by the global warning filter).
df = df[~is_outlier_row].copy()

# Exploratory Data Analysis


In [None]:
# Histogram (with KDE) of every numerical column on a two-column grid.
n_hist_cols = 2
# Ceiling division: always enough rows, so plt.subplot cannot be asked for a
# slot outside the grid when the number of numerical columns changes.
n_hist_rows = max(1, -(-len(numerical_columns) // n_hist_cols))

plt.figure(figsize=(10, 8))

for i, column in enumerate(numerical_columns):
    plt.subplot(n_hist_rows, n_hist_cols, i + 1)
    sns.histplot(df[column], bins=20, alpha=0.5, label=column, kde=True)
    plt.legend()
    plt.xlabel(column)
plt.tight_layout()
plt.show()

In [None]:
# Count plot of the five most frequent values of every categorical column,
# laid out on a three-column grid.
n_count_cols = 3
# Ceiling division: always enough rows, so plt.subplot cannot be asked for a
# slot outside the grid when the number of categorical columns changes.
n_count_rows = max(1, -(-len(categorical_columns) // n_count_cols))

plt.figure(figsize=(10, 8))

for i, column in enumerate(categorical_columns):
    plt.subplot(n_count_rows, n_count_cols, i + 1)
    sns.countplot(x = df[column],order = df[column].value_counts().index[0:5])
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Price distribution (box plot) for the five most frequent values of every
# categorical column, laid out on a four-column grid.
n_box_cols = 4
# Ceiling division: always enough rows, so plt.subplot cannot be asked for a
# slot outside the grid when the number of categorical columns changes.
n_box_rows = max(1, -(-len(categorical_columns) // n_box_cols))

plt.figure(figsize=(10, 8))

for i, column in enumerate(categorical_columns):
    plt.subplot(n_box_rows, n_box_cols, i + 1)
    sns.boxplot(y=df['Price'],x = df[column],order = df[column].value_counts().index[0:5])
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Integer-encode every categorical column.
# One fitted encoder is kept per column: re-fitting a single shared encoder
# would discard the label mapping of all but the last column, and the codes
# could then no longer be translated back with
# label_encoders[column].inverse_transform(...).
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder
df.sample(5)

In [None]:
# Pairwise Pearson correlation between all columns of the frame.
corr = df.corr(method='pearson')
corr

In [None]:
# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr, annot=True, ax=ax)
plt.show()

# Results:
- Most cars have 4 cylinders
- `Toyota` and `Hyundai` are the most common manufacturers
- Most of the cars have a petrol engine and an automatic gearbox
- Car price tends to be affected by the age of the car (its production year)
- Engine volume and wheel type have an effect on car price

In [None]:
# Feature matrix: everything except the target and the columns left out of the model.
excluded_columns = ['Price', 'Color', 'Doors', 'Cylinders', 'Drive wheels']
x = df.drop(columns=excluded_columns)
# Target vector.
y = df['Price']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a random forest on the training split and predict the held-out prices.
# A fixed random_state makes the fitted forest, and therefore the reported
# metrics, reproducible across runs (the split above is already seeded with 42).
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [None]:
# Hold-out performance: R² first, then mean squared error.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(r2)
print(mse)

              If you find this notebook useful, I'd be thankful for your upvote.
