In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Car Price Prediction
### Problem
The given dataset has 26 columns and 206 rows that can be used to predict the value of a car. Build a model to do so.
Other goals are:
> - Which variables are significant in predicting the price of a car
> - How well those variables describe the price of a car
---
---

# 1:Importing modules
---

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt

# 2:Reading the data with briefing
---

In [None]:
# Load the car price CSV from the Kaggle input directory and preview the first three rows.
data=pd.read_csv("../input/car-data/CarPrice_Assignment.csv")
data.head(3)

In [None]:
# Preview the last three rows.
data.tail(3)

In [None]:
# Summarise column dtypes and non-null counts.
data.info()

In [None]:
# Descriptive statistics for the columns.
data.describe()

## Inference
Here we have described the data and taken a look at the type of features


# 3:Preprocessing data
---

In [None]:
# Count missing values per column.
data.isnull().sum()

In [None]:
# Work on an independent copy so the raw `data` frame stays untouched.
# (A plain `temp = data` would only create a second name for the same object,
# and every later column change would silently alter `data` as well.)
temp = data.copy()
temp['CarName'].unique()

In [None]:
## Add car company name
# The company is the first space-separated token of 'CarName'.
# The vectorised .str accessor replaces a per-row lambda and leaves a missing
# name as NaN instead of raising.
company_name = temp['CarName'].str.split(' ').str[0]


In [None]:
# Store the extracted company names as a new 'Company Name' column.
temp['Company Name'] = company_name 

In [None]:
# Preview the first three rows of the working frame.
temp.head(3)

In [None]:
# Rearrange the columns so that 'Company Name' sits immediately before 'CarName'.
# The position is looked up by name, so re-running this cell leaves the order
# unchanged instead of moving whichever column happens to be last.
cols = [column for column in temp.columns if column != 'Company Name']
cols.insert(cols.index('CarName'), 'Company Name')
# .copy() makes `temp` an independent frame, so later column assignments do not
# trigger pandas' SettingWithCopyWarning.
temp = temp[cols].copy()

In [None]:
# Preview the first four rows of the working frame.
temp.head(4)

In [None]:
# List the distinct company names.
temp['Company Name'].unique()

In [None]:
### Correcting the errors as some company names are spelled wrong.
# All corrections are applied in a single pass and the result is assigned back;
# an in-place replace on a column selection is not guaranteed to reach `temp`.
company_fixes = {
    "maxda": "mazda",
    "Nissan": "nissan",
    "porcshce": "porsche",
    "toyouta": "toyota",
    "vokswagen": "volkswagen",
    "vw": "volkswagen",  # was mapped to the misspelling "vokswagen"
}
temp['Company Name'] = temp['Company Name'].replace(company_fixes)
temp['Company Name'].unique()

In [None]:
# A warning was shown above, so check for duplicated rows: select the rows
# flagged by duplicated() and sum them column-wise.
temp.loc[temp.duplicated()].sum()

## 3.1:Convert string numbers into int

In [None]:
# List the distinct cylinder-count values before converting them.
temp['cylindernumber'].unique()

In [None]:
# Map the spelled-out cylinder counts to integers in one pass and store the
# column as int32. The result is assigned back: astype() returns a new object,
# so calling it without an assignment changes nothing.
cylinder_words = {"two": 2, "three": 3, "four": 4, "five": 5, "six": 6, "eight": 8, "twelve": 12}
temp['cylindernumber'] = temp['cylindernumber'].replace(cylinder_words).astype('int32')
temp['cylindernumber'].head(3)

In [None]:
# List the distinct door-count values before converting them.
temp['doornumber'].unique()

In [None]:
# Map the spelled-out door counts to integers and store the column as int32.
# The converted column is assigned back; a bare temp.astype(...) would discard it.
door_words = {"two": 2, "four": 4}
temp['doornumber'] = temp['doornumber'].replace(door_words).astype('int32')
temp.dtypes

In [None]:
# A warning was shown above, so check again for duplicated rows: select the
# rows flagged by duplicated() and sum them column-wise.
temp.loc[temp.duplicated()].sum()

In [None]:
# Preview the first five rows after the conversions.
temp.head(5)

# 4:Data Visualization
---

In [None]:
# Draw one 40-bin histogram per column via DataFrame.hist on a large canvas.
temp.hist(figsize = (35,30), bins = 40)
plt.show()

In [None]:
# Spearman rank correlation between the numeric features, drawn as a
# lower-triangle heatmap.
f, ax = plt.subplots(figsize=(30, 25))
# Restrict to numeric columns explicitly: recent pandas versions raise on text
# columns in corr() instead of silently dropping them.
mat = temp.select_dtypes(include='number').corr(method='spearman')
# Hide the upper triangle (and diagonal), which only mirrors the lower one.
mask = np.triu(np.ones_like(mat, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(mat, mask=mask, cmap=cmap, vmax=1, center=0, square=True,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
ax.set_title('Spearman correlation of numeric features', fontsize=18)
plt.show()


In [None]:
# Horizontal boxplots of the features on a shared axis.
plt.figure(figsize=(20, 20))
ax = sns.boxplot(data=temp, orient="h")
# The title previously read 'Bank Data Boxplots', but this is the car data set.
ax.set_title('Car Data Boxplots', fontsize=18)
# Log scale: the features span very different magnitudes and the small ones
# are not well represented on a linear axis.
ax.set_xscale("log")
plt.show()

In [None]:
# Boxplot of the car price.
plt.figure(figsize=(10, 10))
sns.boxplot(y="price", data=temp)
plt.title('Car price Boxplots', fontsize=18)

# Histogram of the car price with a KDE overlay.
# histplot is axes-level and draws into the figure created here; the
# figure-level displot opens its own figure and would leave this one blank.
plt.figure(figsize=(10, 10))
sns.histplot(x="price", data=temp, kde=True)
plt.title('Car price distplot', fontsize=18)
plt.show()

## Inference
### Correlations 

```
> Car length : Wheelbase
> Car width : Car length, Wheelbase
> Car height : Door number, Carlength, Wheelbase
> Curb weight : Car width, Car length, Wheelbase
> Engine size : Curb weight, Car width,Car length, Wheelbase
> Horsepower: Engine size, Curb weight, Car width,Car length, Wheelbase
> Price : Cylinder number, Bore ratio, Horsepower, Engine size, Curb weight, Car width,Car length, Wheelbase
> Price : It is also inversely related to citympg and highwaympg
```

## 4.1:Close up Visualisation of  Data
```
- CompanyName
- Symboling
- fueltype
- enginetype
- carbody
- doornumber
- enginelocation
- fuelsystem
- cylindernumber
- aspiration
- drivewheel

```

In [None]:
# Histograms of the categorical features, one panel per feature, filled
# row by row across a 3x3 grid.
histogram_panels = [
    ("Company Name", 'Company name Histogram'),
    ("aspiration", 'Aspiration Histogram'),
    ("fueltype", 'Fuel type Histogram'),
    ("enginetype", 'Engine type Histogram'),
    ("carbody", 'Car body Histogram'),
    ("enginelocation", 'Engine location Histogram'),
    ("fuelsystem", 'Fuel system Histogram'),  # was mislabelled 'Car body Histogram'
    ("symboling", 'Symboling Histogram'),
    ("drivewheel", 'Drive wheel Histogram'),
]

f, ax = plt.subplots(nrows=3, ncols=3, figsize=(30, 30))
for axis, (column, title) in zip(ax.flat, histogram_panels):
    sns.histplot(x=column, data=temp, ax=axis)
    axis.set_title(title, fontsize=18)

# Rotate the many company names so they stay legible. The labels come from the
# plot itself, so they cannot get out of step with the bars.
ax[0, 0].tick_params(axis='x', labelrotation=90)
plt.show()


## 4.2:Category vs price Visualization

In [None]:
# Price distribution per category, one boxplot panel per categorical feature,
# filled row by row across a 3x3 grid.
boxplot_panels = [
    ("Company Name", 'Company name  vs Price '),
    ("aspiration", 'Aspiration  vs Price'),
    ("fueltype", 'Fuel type vs Price'),
    ("enginetype", 'Engine type vs Price'),
    ("carbody", 'Car body vs Price'),
    ("enginelocation", 'Engine location vs Price'),
    ("fuelsystem", 'Fuel system vs Price'),  # was mislabelled 'Car body vs Price'
    ("symboling", 'Symboling vs Price'),
    ("drivewheel", 'Drive wheel vs Price'),
]

f, ax = plt.subplots(nrows=3, ncols=3, figsize=(30, 30))
for axis, (column, title) in zip(ax.flat, boxplot_panels):
    sns.boxplot(x=column, y="price", data=temp, ax=axis)
    axis.set_title(title, fontsize=18)

# Rotate the many company names so they stay legible. The labels come from the
# plot itself, so they cannot get out of step with the boxes.
ax[0, 0].tick_params(axis='x', labelrotation=90)
plt.show()


In [None]:
# Bar plots of price against each categorical feature, one panel per feature,
# filled row by row across a 3x3 grid.
barplot_panels = [
    ("Company Name", 'Company name  vs Price '),
    ("aspiration", 'Aspiration  vs Price'),
    ("fueltype", 'Fuel type vs Price'),
    ("enginetype", 'Engine type vs Price'),
    ("carbody", 'Car body vs Price'),
    ("enginelocation", 'Engine location vs Price'),
    ("fuelsystem", 'Fuel system vs Price'),  # was mislabelled 'Car body vs Price'
    ("symboling", 'Symboling vs Price'),
    ("drivewheel", 'Drive wheel vs Price'),
]

f, ax = plt.subplots(nrows=3, ncols=3, figsize=(30, 30))
for axis, (column, title) in zip(ax.flat, barplot_panels):
    sns.barplot(x=column, y="price", data=temp, ax=axis, palette="PuBuGn")
    axis.set_title(title, fontsize=18)

# Rotate the many company names so they stay legible. The labels come from the
# plot itself, so they cannot get out of step with the bars.
ax[0, 0].tick_params(axis='x', labelrotation=90)
plt.show()
 

## Inference
### Based on the count
```
> Most common car : Toyota
> Most common aspiration : std
> Most fuel type : Gas
> Most common engine type : ohc
> Most common body type : Sedan
> Most common engine location : Front
> Most common fuel system : mpfi
> Most common symboling type: 0
> Most common drive wheel type : fwd 
```   

<br>

### Price vs categories

```
> Companies with most costly cars: bmw, buick, jaguar, porsche
> Aspirations : turbo most costly
> Fuel type : Gas most costly
> Engine type : dohcv most costly
> Body type : Hardtop and convertible most costly
> Engine location : rear location most costly
> Fuel system : mpfi most costly
> Symboling type: -1 and 3 types most costly
> Drive wheel type : rwd most costly 
```   

## 4.3:Visualising numerical data

In [None]:
# Scatter plots of price against each numeric feature: a 3x3 grid for the size
# and engine features, then a separate two-panel figure for the mileage features.
scatter_grid = [
    ('doornumber', "Doornumber vs Price"),
    ('cylindernumber', "Cylindernumber vs Price"),
    ('carlength', "Car length vs Price"),
    ('carwidth', "Car width vs Price"),
    ('carheight', "Car height vs Price"),
    ('curbweight', "Curbweight vs Price"),
    ('boreratio', "Boreratio vs Price"),
    ('horsepower', "Horsepower vs Price"),
    ('wheelbase', "Wheelbase vs Price"),
]
scatter_mpg = [
    ('citympg', "Citympg vs Price"),
    ('highwaympg', "Highwaympg vs Price"),
]

f, ax = plt.subplots(nrows=3, ncols=3, figsize=(30, 30))
for axis, (column, title) in zip(ax.flat, scatter_grid):
    sns.scatterplot(x=column, y='price', data=temp, ax=axis)
    axis.set_title(title, fontsize=18)

f, ax = plt.subplots(ncols=2, figsize=(20, 10))
for axis, (column, title) in zip(ax, scatter_mpg):
    sns.scatterplot(x=column, y='price', data=temp, ax=axis)
    axis.set_title(title, fontsize=18)
plt.show()

In [None]:
# Line plots of price against each numeric feature: a 3x3 grid for the size
# and engine features, then a separate two-panel figure for the mileage features.
line_grid = [
    ('doornumber', "Doornumber vs Price"),
    ('cylindernumber', "Cylindernumber vs Price"),
    ('carlength', "Car length vs Price"),
    ('carwidth', "Car width vs Price"),
    ('carheight', "Car height vs Price"),
    ('curbweight', "Curbweight vs Price"),
    ('boreratio', "Boreratio vs Price"),
    ('horsepower', "Horsepower vs Price"),
    ('wheelbase', "Wheelbase vs Price"),
]
line_mpg = [
    ('citympg', "Citympg vs Price"),
    ('highwaympg', "Highwaympg vs Price"),
]

f, ax = plt.subplots(nrows=3, ncols=3, figsize=(30, 30))
for axis, (column, title) in zip(ax.flat, line_grid):
    sns.lineplot(x=column, y='price', data=temp, ax=axis)
    axis.set_title(title, fontsize=18)

f, ax = plt.subplots(ncols=2, figsize=(20, 10))
for axis, (column, title) in zip(ax, line_mpg):
    sns.lineplot(x=column, y='price', data=temp, ax=axis)
    axis.set_title(title, fontsize=18)
plt.show()

## Inference
### Numeric data
```
> Cylinder number has positive correlation with the price
> Car length has positive correlation with the price 
> Car width has positive correlation with the price
> Car height has no significant correlation with the price
> Curb weight has positive correlation with the price
> Horse power has positive correlation with the price
> Wheel base has positive correlation with the price
> Bore ratio has positive correlation with the price
> Citympg has negative correlation with the price
> Highwaympg has negative correlation with the price
```

In [None]:
### Make categories based on average price
# Give every car the mean price of its company. transform('mean') returns a
# Series aligned with temp's own index, so no copy, merge or positional
# alignment is needed.
company_avg_price = temp.groupby('Company Name')['price'].transform('mean')
# Left-closed bins: [0, 10000) Budget, [10000, 20000) Medium, [20000, 40000) Luxury.
temp['range'] = pd.cut(
    company_avg_price,
    bins=[0, 10000, 20000, 40000],
    labels=['Budget', 'Medium', 'Luxury'],
    right=False,
)
# NOTE(review): this feature is derived from the target ('price') over all rows,
# so it carries price information into the features - confirm this is intended.
temp.head()

In [None]:
### Convert categories into numerical columns for regression
# Helper that one-hot encodes a single column
def convert_label(label,data):
    """Replace column `label` of `data` with its indicator (dummy) columns.

    The indicator columns produced by pd.get_dummies are appended to the right
    of the frame and the original `label` column is then dropped. A new frame
    is returned; the frame passed in is left unchanged.
    """
    indicators = pd.get_dummies(data[label])
    widened = pd.concat((data, indicators), axis=1)
    return widened.drop(columns=[label])
# One-hot encode each categorical column in turn, in this fixed order.
categorical_columns = [
    'fueltype',
    'carbody',
    'enginetype',
    'aspiration',
    'drivewheel',
    'enginelocation',
    'fuelsystem',
    'range',
    'Company Name',
]
for column in categorical_columns:
    temp = convert_label(column, temp)

In [None]:
# Row and column counts after encoding.
temp.shape

In [None]:
# Preview the encoded frame.
temp.head()

# 5:Train-Test Split and feature scaling
---



In [None]:
# List the column names available for scaling and modelling.
temp.columns

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the continuous columns listed in `header` with a default MinMaxScaler.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed further down, so the held-out rows influence the scaling. Consider
# fitting on the training rows only.
# NOTE(review): 'price' (the regression target) is part of `header`, so it is
# scaled too; the metrics and predictions computed later are in scaled units.
scaler = MinMaxScaler()
header = ['wheelbase', 'curbweight','enginesize','horsepower','citympg','highwaympg','carlength','carwidth','price']
temp[header] = scaler.fit_transform(temp[header])
temp.head()

In [None]:
# Target frame: the 'price' column on its own, on a fresh 0..n-1 index.
labels = temp[['price']].reset_index(drop=True)
labels.head()

In [None]:
# Feature frame: everything except the target and the columns excluded from modelling.
excluded_columns = ['price', 'carheight', 'CarName', 'car_ID']
dataset = temp.drop(columns=excluded_columns)
dataset.head()

In [None]:
from sklearn.model_selection import train_test_split 
# Hold out 20% of the rows for testing; random_state is fixed so the split is reproducible.
X_raw,X_test,y_raw,y_test  = train_test_split(dataset,
                                              labels,
                                              test_size=0.2,
                                              random_state = 1)



In [None]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the training portion of the split.
model = LinearRegression()
model.fit(X_raw,y_raw)


In [None]:
# Predict on the held-out rows and display the model's score on them.
pred=model.predict(X_test)
model.score(X_test,y_test)

In [None]:
from sklearn.metrics import mean_squared_error
# Mean squared error on the held-out rows. 'price' was min-max scaled earlier,
# so this value is in scaled units, not in original price units.
mean_squared_error(y_test, pred)


In [None]:
# Collapse the prediction array to one dimension for the submission frame.
# NOTE(review): pred is presumably shaped (n, 1) because the target was passed
# as a one-column DataFrame - confirm.
predictions= pred.flatten()

In [None]:
# The model was trained on a min-max scaled target, so the raw predictions lie
# on the scaled axis. Map them back to original price units before exporting:
# for the default (0, 1) range, scaled = (x - data_min_) / data_range_.
price_pos = header.index('price')
price_predictions = predictions * scaler.data_range_[price_pos] + scaler.data_min_[price_pos]

# 'Id' is the row label of each held-out sample in `labels`.
my_submission = pd.DataFrame({'Id': y_test.index, 'Price': price_predictions})
# you could use any filename. We choose submission here
my_submission.to_csv('./submission.csv', index=False)