## Importing libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
import warnings
# NOTE(review): with no category or module filter, this silences every warning for the rest of
# the session, including any deprecation or chained-assignment warnings raised by later cells.
warnings.filterwarnings('ignore')

## 1. Load data

In [None]:
# Read the car price CSV (path is relative to the notebook's working directory) into a DataFrame
csv_path = 'data/car_price_dataset.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows (five is also the default for head())
df.head(n=5)

In [None]:
# Check the shape of your data: a (number of rows, number of columns) tuple
df.shape

In [None]:
# Statistical info Hint: look up .describe()
# With default arguments this reports count, mean, std, min, quartiles and max per column.
df.describe()

In [None]:
# Check the dtypes of the input data; info() also prints the non-null count of each column
df.info()

In [None]:
# Check the column names as loaded from the CSV
df.columns

## 2. Exploratory Data Analysis

EDA is an essential step for inspecting the data, so that we better understand the nature of the given data.

### Renaming

Now we would like to rename some of the following column names, so it's easy to write the code...

In [None]:
# Column names before renaming
df.columns

In [None]:
# Rename the raw 'name' column to 'brand' (a later cell keeps only the first word of each entry).
# The result is reassigned instead of using inplace=True, so the cell does not mutate the
# existing frame in place.
df = df.rename(columns={'name': 'brand'})

In [None]:
# Column names after renaming
df.columns

### 2.1 Univariate analysis

Single variable exploratory data analysis

#### Countplot

#### Distribution plot

In [None]:
# Distribution of the 'year' column
sns.displot(x='year', data=df)

### 2.2 Multivariate analysis

Multiple variable exploratory data analysis

#### Boxplot

In [None]:
# Box plot of 'year' for each 'transmission' category.
# x carries the transmission categories and y the years, so the axis labels must follow
# the same assignment (they were previously swapped).
sns.boxplot(x=df["transmission"], y=df["year"])
plt.xlabel("transmission")
plt.ylabel("year")

#### Scatterplot

In [None]:
# Scatter of 'year' (y-axis) against 'selling_price' (x-axis), coloured by 'transmission'
sns.scatterplot(data=df, x='selling_price', y='year', hue='transmission')

#### Correlation Matrix

Let's use a correlation matrix to find the factors that are strongly related to the selling price.  It is also useful for checking whether certain features are too correlated with each other.

In [None]:
# Correlation heatmap over the numeric columns only (categorical/text columns are not shown).
# The text columns are filtered out explicitly: from pandas 2.0 on, DataFrame.corr() raises on
# non-numeric columns instead of silently skipping them.
plt.figure(figsize=(15, 8))
numeric_df = df.select_dtypes(include="number")
sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm")

#### Tips: Label encoding

Now we would like to change the text labels in the `owner` column (e.g. "First Owner", "Second Owner") to integers, since machine learning algorithms do not understand text.   Also, the correlation matrix and other similar computational tools require label encoding.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Demonstrate LabelEncoder on 'owner' without overwriting the column: the explicit
# owner_mapping applied in a later cell matches on the original text labels, so replacing
# them with integer codes here would turn that mapping into a silent no-op.
le = LabelEncoder()
owner_codes = le.fit_transform(df["owner"])

# Distinct integer codes produced by the encoder, in order of first appearance
pd.unique(owner_codes)

In [None]:
# we can call le.classes_ to know what it maps to:
# the integer code i produced by the encoder stands for the label le.classes_[i]
le.classes_

In [None]:
# Ordinal codes for the ownership labels, numbered from 1 in the order listed
ownership_order = ['First Owner', 'Second Owner', 'Third Owner', 'Fourth & Above', 'Test Drive']
owner_mapping = {label: code for code, label in enumerate(ownership_order, start=1)}

# Swap every value of 'owner' that equals one of the mapping keys for its code.
# NOTE(review): values that match no key are left untouched, so this only has an effect while
# the column still holds text labels spelled exactly like the keys — confirm against the data.
df['owner'] = df['owner'].replace(owner_mapping)

In [None]:
# Remove rows with 'CNG' and 'LPG' in the 'fuel' column.
# .copy() makes the filtered result an independent DataFrame, so the column assignments in
# the following cells do not write into a slice of the original frame.
df = df[~df['fuel'].isin(['CNG', 'LPG'])].copy()

In [None]:
# Keep the first whitespace-separated token of each mileage entry and store it as a float
mileage_tokens = df['mileage'].str.split()
df['mileage'] = mileage_tokens.str.get(0).astype(float)

In [None]:
# Strip the literal ' CC' suffix from the engine entries, then convert to float
engine_text = df['engine'].str.replace(' CC', '', regex=False)
df['engine'] = engine_text.astype(float)

In [None]:
# Strip the literal ' bhp' suffix; entries that are still not numeric afterwards become NaN
# because of errors='coerce', so the column ends up as float
power_text = df['max_power'].str.replace(' bhp', '', regex=False)
df['max_power'] = pd.to_numeric(power_text, errors='coerce')

In [None]:
# Keep only the first word of each entry in 'brand'.
# The vectorised .str accessor passes missing values through as NaN, whereas calling
# x.split()[0] on every element raises as soon as one entry is missing or empty.
df['brand'] = df['brand'].str.split().str[0]

In [None]:
# Discard the 'torque' column
df = df.drop('torque', axis='columns')

In [None]:
# Inspect the first ten rows after the cleaning steps above
df.head(n=10)

In [None]:
# Correlation heatmap over the numeric columns only (categorical/text columns are not shown).
# The text columns are filtered out explicitly: from pandas 2.0 on, DataFrame.corr() raises on
# non-numeric columns instead of silently skipping them.
plt.figure(figsize=(15, 8))
numeric_df = df.select_dtypes(include="number")
sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm")

#### Predictive Power Score

This is another way to check the predictive power of a feature.  Unlike correlation, `pps` is obtained from an actual prediction.  For more details:
    
- The score is calculated using only 1 feature trying to predict the target column. This means there are no interaction effects between the scores of various features. Note that this is in contrast to feature importance
- The score is calculated on the test sets of a 4-fold crossvalidation (number is adjustable via `ppscore.CV_ITERATIONS`)
- All rows which have a missing value in the feature or the target column are dropped
- In case that the dataset has more than 5,000 rows the score is only calculated on a random subset of 5,000 rows with a fixed random seed (`ppscore.RANDOM_SEED`). You can adjust the number of rows or skip this sampling via the API. However, in most scenarios the results will be very similar.
- There is no grid search for optimal model parameters

We can install by doing <code>pip install ppscore</code>

In [None]:
import ppscore as pps

# Work on a separate frame that leaves out the 'brand' and 'year' columns
dfcopy = df.drop(columns=['brand', 'year'])

# seaborn.heatmap does not accept tidy data, so pivot the (x, y, ppscore) rows into a wide
# matrix: one column per x and one row per y
pps_scores = pps.matrix(dfcopy)[['x', 'y', 'ppscore']]
matrix_df = pps_scores.pivot(columns='x', index='y', values='ppscore')

# plot
plt.figure(figsize=(15, 8))
sns.heatmap(matrix_df, vmin=0, vmax=1, cmap="Blues", linewidths=0.5, annot=True)