In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the vehicles dataset from a CSV file; the path is relative to the
# directory the notebook is run from.
cars_data = pd.read_csv('data/vehicles.csv')

In [None]:
# Print a summary of the raw frame (columns, dtypes, non-null counts) to see
# which columns contain missing values before any cleaning.
cars_data.info()

In [None]:
# Show the first five raw rows as a quick sanity check of the loaded data.
cars_data.head(n=5)

In [None]:
# Drop rows in which more than half of the entries are NaN.
# `thresh` is the minimum number of non-null values a row needs in order to be
# kept, so it is derived from the column count (half, rounded up) rather than
# hard-coded; the rule then stays correct whatever the width of the frame.
min_non_null = (cars_data.shape[1] + 1) // 2
cars_data_cleaned = cars_data.dropna(thresh=min_non_null)

In [None]:
# Show the first five rows that survived the NaN filter.
cars_data_cleaned.head(n=5)

In [None]:
# Display the cleaned rows keyed by their `id` column. `set_index` returns a
# new frame and the result is not assigned, so `cars_data_cleaned` itself keeps
# its original index after this cell.
cars_data_cleaned.set_index(keys='id')

In [None]:
# Count the rows for every (manufacturer, type) pair, then draw one stacked bar
# per manufacturer whose segments are coloured by vehicle type.
cars_by_manufacturer_type = (
    cars_data_cleaned
    .groupby(['manufacturer', 'type'])['type']
    .count()
    .rename('count')
    .reset_index()
)

fig, ax = plt.subplots(figsize=(18, 10))
sns.histplot(
    data=cars_by_manufacturer_type,
    x='manufacturer',
    weights='count',   # each row contributes its pre-aggregated count
    hue='type',
    multiple='stack',
    ax=ax,
)
ax.tick_params(axis='x', labelrotation=90)  # manufacturer names overlap if horizontal
ax.set_xlabel('Manufacturer', fontsize=16)
ax.set_ylabel('Number of cars sold', fontsize=16)
ax.set_title('Total number of cars sold per each manufacturer', fontsize=16)
plt.show()

In [None]:
# Plot the number of cars per state, with each state's bar split by manufacturer.
# Rows are first counted per (manufacturer, state) pair; the plot then stacks
# those counts along the state axis and colours the segments by manufacturer.
cars_by_manufacturer_state = (
    cars_data_cleaned
    .groupby(['manufacturer', 'state'])['manufacturer']
    .count()
    .rename('count')
    .reset_index()
)

fig, ax = plt.subplots(figsize=(18, 14))
sns.histplot(
    data=cars_by_manufacturer_state,
    x='state',
    weights='count',   # each row contributes its pre-aggregated count
    hue='manufacturer',
    multiple='stack',
    ax=ax,
)
ax.tick_params(axis='x', labelrotation=90)
# The x-axis shows states (manufacturer is the hue), so label it accordingly.
ax.set_xlabel('State', fontsize=16)
ax.set_ylabel('Number of cars sold', fontsize=16)
ax.set_title('Total number of cars sold in each state per each manufacturer', fontsize=16)
plt.show()

In [None]:
# One-feature linear model: price explained by the odometer reading alone.
# Rows missing either value are dropped before fitting. With
# `fit_intercept=False` no intercept is estimated, so the single coefficient
# shown at the end is the only fitted parameter.
odo_price_data = cars_data.loc[:, ['odometer', 'price']].dropna()
odometer = odo_price_data.drop(columns='price')   # kept 2-D, as scikit-learn expects for X
price = odo_price_data['price']
lin_model = linear_model.LinearRegression(fit_intercept=False).fit(odometer, price)
lin_model.coef_

In [None]:
# Two-feature linear model: price explained by odometer and year.
# Only rows with all three values present are used, and the model is again
# fitted without an intercept term.
odo_year_price_data = cars_data.loc[:, ['odometer', 'year', 'price']].dropna()
features = odo_year_price_data.drop(columns='price')
price = odo_year_price_data['price']
lin_model_2d = linear_model.LinearRegression(fit_intercept=False).fit(features, price)
lin_model_2d.coef_

In [None]:
# Attach both models' predictions to the same rows so they can be compared
# side by side with the actual price.
single_feature_input = odo_year_price_data[['odometer']]
two_feature_input = odo_year_price_data[['odometer', 'year']]

odo_year_price_data['prediction'] = lin_model.predict(single_feature_input)
odo_year_price_data['prediction_2d'] = lin_model_2d.predict(two_feature_input)
odo_year_price_data

In [None]:
# Mean squared error of the one-feature model against the actual prices.
mean_squared_error(
    y_true=odo_year_price_data['price'],
    y_pred=odo_year_price_data['prediction'],
)

In [None]:
# Mean squared error of the two-feature model against the actual prices.
mean_squared_error(
    y_true=odo_year_price_data['price'],
    y_pred=odo_year_price_data['prediction_2d'],
)

In [None]:
# Linear regression of price on three categorical columns, each expanded into
# indicator (dummy) columns. Rows missing any of the four columns are dropped.
feature_columns = ['manufacturer', 'condition', 'paint_color']
cars_features_data = cars_data[feature_columns + ['price']].dropna()

features_data = cars_features_data[feature_columns]
price_data = cars_features_data[['price']]   # 2-D target, so coef_ is 2-D as well

# One indicator frame per categorical column; every category keeps its column.
condition_dummies, paint_color_dummies, manufacturer_dummies = (
    pd.get_dummies(features_data[column], dtype=int)
    for column in ('condition', 'paint_color', 'manufacturer')
)

# Put the indicators next to the raw columns, then remove the raw text columns
# so that only numeric inputs reach the model.
data_w_dummies = pd.concat(
    [features_data, condition_dummies, paint_color_dummies, manufacturer_dummies],
    axis=1,
).drop(columns=feature_columns)

lin_reg = linear_model.LinearRegression(fit_intercept=False)
lin_reg.fit(data_w_dummies, price_data)

lin_reg.coef_

In [None]:
# Linear regression on odometer plus one-hot encoded state, paint colour and
# manufacturer, evaluated on a held-out 30% test split.
#
# NOTE: this rebinds `cars_data_cleaned`, replacing the NaN-filtered frame
# built earlier in the notebook with the encoded modelling frame.

# Fixed seed so the split, and therefore the reported MSE and every later model
# fitted on X_train, is reproducible across kernel restarts.
RANDOM_STATE = 42

categorical_columns = ['state', 'paint_color', 'manufacturer']

cars_data_cleaned = cars_data[['manufacturer', 'state', 'paint_color', 'price', 'odometer']].dropna()

# Encode all categoricals in one call. `drop_first=True` drops one level per
# column so the indicators are not perfectly collinear with the intercept.
# Resulting column order: price, odometer, state_*, paint_color_*, manufacturer_*.
cars_data_cleaned = pd.get_dummies(
    cars_data_cleaned,
    columns=categorical_columns,
    dtype=int,
    drop_first=True,
)

lin_reg = LinearRegression()

X = cars_data_cleaned.drop('price', axis=1)
y = cars_data_cleaned['price']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

lin_reg.fit(X_train, y_train)

y_pred = lin_reg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)

print(mse)


In [None]:
# Fit a Ridge regression with scikit-learn's default settings on the training
# split and print its coefficients rounded to two decimals.
ridge_model = Ridge()
ridge_model.fit(X_train, y_train)
ridge_model_coefs = ridge_model.coef_
print(f'Ridge Coefs: {np.round(ridge_model_coefs, 2)}')

In [None]:
# Fit a Lasso regression with scikit-learn's default settings on the training
# split and print its coefficients rounded to two decimals.
lasso_model = Lasso()
lasso_model.fit(X_train, y_train)
lasso_model_coefs = lasso_model.coef_
print(f'Lasso Coefs: {np.round(lasso_model_coefs, 2)}')

In [None]:
# Explore how the Ridge coefficients change with the regularisation strength.
# One model is fitted per alpha on the training split; `coef_list[i]` holds the
# coefficients obtained with `alphas[i]`.
alphas = [0.001, 1.0, 10.0, 100.0]

coef_list = []

for a in alphas:
    ridge = Ridge(alpha=a)
    ridge.fit(X_train, y_train)
    coef_list.append(list(ridge.coef_))

# Report the coefficients of the most strongly regularised model, paired with
# their feature names. The alpha in the message is read from `alphas` so the
# text cannot drift out of sync with the list above.
print(f'For alpha = {alphas[-1]:g} we have the following coefficients:')
list(zip(X_train.columns, coef_list[-1]))

In [None]:
# Hyperparameter evaluation with Ridge regression.
# A 5-fold cross-validated grid search over alpha, scored by negative mean
# squared error (scikit-learn maximises scores, so values closer to 0 are better).

alphas = {'alpha': [1e-2, 1, 5, 10, 20, 30, 50, 100]}
gridSearch = GridSearchCV(ridge_model, alphas, scoring='neg_mean_squared_error', cv=5)

# Search on the training split only: fitting on the full X, y would let the
# held-out test rows influence the chosen alpha.
gridSearch.fit(X_train, y_train)
print("Best value for lambda : ",gridSearch.best_params_)
print("Best score for cost function: ", gridSearch.best_score_)

In [None]:
# Hyperparameter evaluation with Lasso regression (disabled: the grid search below is commented out)
# alphas = {'alpha':[1e-2,1,5,10,20,30,50,100]}
# gridSearch = GridSearchCV(lasso_model, alphas, scoring='neg_mean_squared_error', cv=5)
# gridSearch.fit(X,y)
# print("Best value for lambda : ",gridSearch.best_params_)
# print("Best score for cost function: ", gridSearch.best_score_)

In [None]:
# Predict prices for the held-out test rows with the default-alpha Ridge model
# fitted earlier, and display the resulting array.
ridge_predict = ridge_model.predict(X=X_test)
ridge_predict

In [None]:
# Distribution of the Ridge predictions on the test split.
# `sns.distplot` is deprecated in seaborn; `histplot` with a KDE overlay on a
# density scale is the documented replacement and draws the same kind of figure.
dist_ax = sns.histplot(ridge_predict, kde=True, stat='density')
dist_ax.set(
    xlabel='Predicted price',
    ylabel='Density',
    title='Distribution of Ridge price predictions',
)
plt.show()