# In this kernel I conduct a simplified EDA and predict vehicle prices using several simple regression models.
Models such as KNN, linear regression and simple tree regression will be used.

Changelist commits - 
1. First EDA & data cleaning - commit
2. Starting to work and predicting the price using a simple regression - commit
3. Adding categorical features - commit
4. Adding a KNN regression - commit

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os
# List the contents of the ../input directory.
print(os.listdir("../input"))

In [None]:
# Load the Craigslist vehicle listings CSV into a DataFrame.
vehicles_df = pd.read_csv('../input/craigslistVehicles.csv')

In [None]:
# Print the column dtypes and non-null counts.
vehicles_df.info()

In [None]:
# Preview the first rows of the raw data.
vehicles_df.head()

# **Lets drop all useless columns at this stage (images, links etc)**

In [None]:
# Remove the link and coordinate columns; no later cell in this notebook reads them.
vehicles_df = vehicles_df.drop(
    ['city_url', 'image_url', 'lat', 'long'],
    axis='columns',
)

In [None]:
# (rows, columns) after the column drop.
vehicles_df.shape

# Let's drop duplicates, rows with many NaNs and illogical prices

In [None]:
# drop_duplicates returns a new frame rather than modifying in place, so the
# result has to be assigned back; otherwise the duplicated listings (same
# 'url') stay in vehicles_df.
vehicles_df = vehicles_df.drop_duplicates(subset='url')
vehicles_df.shape

Finding the number of NaNs per row and dropping rows that reach the 95% quantile (rows with 9 or more missing values are dropped)

In [None]:
# 95th percentile of the per-row count of missing values.
vehicles_df.isnull().sum(axis=1).quantile(.95)

# 9 missing values per row or more are being dropped (~9300 rows dropped)

In [None]:
# Keep only the rows that have fewer than 9 missing fields.
vehicles_df = vehicles_df.loc[
    vehicles_df.isna().sum(axis='columns') < 9
]
vehicles_df.shape

# Now we drop all prices that are equal to 0 (approximately 45k cars!) 

# +

# all crazy high irrelevant prices of cars - above 100k (~460 prices) - some of those are just wrong due to an addition of 0 in comparison to the description

In [None]:
# Discard listings whose price is exactly 0.
vehicles_df = vehicles_df.loc[vehicles_df['price'] != 0]
vehicles_df.shape

In [None]:
# Narrow box plot of the price distribution to expose the extreme values.
plt.figure(figsize=(3, 6))
sns.boxplot(data=vehicles_df, y='price');

In [None]:
# Keep listings priced strictly below 100000.
vehicles_df = vehicles_df.loc[vehicles_df['price'] < 100_000]
vehicles_df.shape

# Looking at the relevant years -

In [None]:
# Number of listings per value of 'year'; tick labels are rotated so they stay readable.
plt.figure(figsize=(15, 9))
ax = sns.countplot(data=vehicles_df, x='year')
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=90,
    ha="right",
    fontsize=10,
);

# We decide to keep only cars with year above the year of 1985 (~18k)

In [None]:
# Keep listings whose year is strictly after 1985 (rows with a missing year fail the test and are dropped).
vehicles_df = vehicles_df.loc[vehicles_df['year'] > 1985]
vehicles_df.shape

# Odometer / Mileage ("A typical mileage before overhaul for trucks is around 700K - 1000K miles") - dropping all mileage above 500k miles (the cutoff used in the code below) - such readings are usually due to wrongly adding a 0 to the final result (~1150 cars).

In [None]:
# 99.9th percentile of the odometer readings.
vehicles_df.odometer.quantile(.999)

In [None]:
# Drop odometer readings above 500000 while keeping rows whose odometer is missing.
vehicles_df = vehicles_df.loc[
    vehicles_df['odometer'].isna() | (vehicles_df['odometer'] <= 500000)
]
vehicles_df.shape

In [None]:
# Narrow box plot of the odometer distribution after the cutoff.
plt.figure(figsize=(3, 6))
sns.boxplot(data=vehicles_df, y='odometer');

In [None]:
# (rows, columns) once all cleaning filters have been applied.
vehicles_df.shape

# Now we can start working on the columns that could predict price -
(final shape after cleaning - 474166, 18)

In [None]:
# Pairwise scatter plots of the columns, coloured by 'condition'.
sns.set(color_codes=True, style="ticks")
sns.pairplot(data=vehicles_df, hue="condition");

# Start with a simple Linear Regression
using only numeric features

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import train_test_split as split
import warnings
from sys import modules

In [None]:
# The two numeric feature columns plus the price target used by the first model.
vehicles_df_to_learn = vehicles_df[['odometer','year','price']]

for now we will have to drop rows with odometer as Nan (just for simplicity)

In [None]:
# Remove every row that has a missing value in any of the selected columns.
vehicles_df_to_learn = vehicles_df_to_learn.dropna(axis=0, how='any')
vehicles_df_to_learn.shape

In [None]:
# Hold out 40% of the rows for testing; the fixed random_state makes the split reproducible.
vehicles_df_train, vehicles_df_test = split(vehicles_df_to_learn, train_size=0.6, random_state=4222)

In [None]:
# Target vector and feature matrix for the training split.
y_train = vehicles_df_train['price']
X_train = vehicles_df_train[['odometer', 'year']]

In [None]:
# Linear regression estimator with an intercept term, to be fitted on the training split.
cars_lm = LinearRegression(fit_intercept=True)

In [None]:
# Fit the linear model on the training features and prices.
cars_lm.fit(X_train, y_train)

In [None]:
# Report the fitted parameters. coef_ holds one weight per feature
# (odometer, year), so print the whole array rather than only coef_[0].
print("The model intercept is: {}".format(cars_lm.intercept_))
print("The model coefficients are: {}".format(cars_lm.coef_))

In [None]:
# Attach the in-sample predictions as a new column.
# assign() builds a new frame instead of writing into a selection taken from
# vehicles_df_train (which triggers pandas' SettingWithCopyWarning), and
# predicting from the two feature columns only keeps this cell re-runnable
# after 'Price_prediction' has been added.
X_train = X_train.assign(
    Price_prediction=cars_lm.predict(X_train[['odometer', 'year']])
)
X_train.head()

In [None]:
# Root-mean-squared error of the predictions on the training split.
train_mse = MSE(y_train, X_train['Price_prediction'])
cars_train_rmse = np.sqrt(train_mse)
print("RMSE = {:.2f}".format(cars_train_rmse))

In [None]:
# Separate LinearRegression instance used by the test-split cells that follow.
cars_lm_test = LinearRegression()

In [None]:
# Target vector and feature matrix for the held-out split.
y_test = vehicles_df_test['price']
X_test = vehicles_df_test[['odometer', 'year']]

In [None]:
# Fit on the TRAINING split. Fitting on (X_test, y_test) and then scoring on
# those same rows reports an in-sample error, not held-out performance.
# Only the two feature columns are selected because X_train may already carry
# a 'Price_prediction' column at this point.
cars_lm_test.fit(X_train[['odometer', 'year']], y_train)

In [None]:
# Attach the predictions for the test rows as a new column.
# assign() builds a new frame instead of writing into a selection taken from
# vehicles_df_test (which triggers pandas' SettingWithCopyWarning), and
# predicting from the two feature columns only keeps this cell re-runnable
# after 'price_prediction' has been added.
X_test = X_test.assign(
    price_prediction=cars_lm_test.predict(X_test[['odometer', 'year']])
)
X_test.head()

In [None]:
# Root-mean-squared error of the predictions on the test split.
test_mse = MSE(y_test, X_test['price_prediction'])
cars_test_rmse = np.sqrt(test_mse)
print("RMSE = {:.2f}".format(cars_test_rmse))

Continuing with the simplicity - adding categorical parameters and lets see if the prediction improves:

condition

title_status

transmission

In [None]:
# Numeric columns plus the three candidate categorical columns, selected to inspect their missing-value counts.
vehicles_df_to_learn2 = vehicles_df[['odometer','year','price', 'transmission', 'title_status', 'condition']]

In [None]:
# Print the non-null count of each candidate column.
vehicles_df_to_learn2.info()

We can see that `condition` has 200000 NaNs, and therefore we will not include this parameter for now.

In [None]:
# Select the numeric columns plus transmission and title_status, then drop
# every row with a missing value in any of them.
vehicles_df_to_learn2 = (
    vehicles_df[['odometer', 'year', 'price', 'transmission', 'title_status']]
    .dropna()
)
vehicles_df_to_learn2.shape

In [None]:
# Preview the frame before the categorical columns are encoded.
vehicles_df_to_learn2.head()

In [None]:
# One 0/1 indicator column per transmission category. A vectorised comparison
# replaces the row-by-row apply(lambda ...) and yields the same integer values.
for category in ('automatic', 'manual', 'other'):
    vehicles_df_to_learn2['transmission_' + category] = (
        vehicles_df_to_learn2['transmission'] == category
    ).astype('int64')

In [None]:
# Move the existing row labels into an 'index' column and renumber the rows from 0.
vehicles_df_to_learn2 = vehicles_df_to_learn2.reset_index(drop=False)
vehicles_df_to_learn2.head()

Here is the second "Pythonic" way:

In [None]:
# One indicator column per title_status value, keyed by the ORIGINAL row labels
# stored in the 'index' column. Without set_index('index') the dummies would be
# keyed by the fresh 0..n-1 positions, and the later merge on 'index' would
# attach them to the wrong listings and drop the rows that find no match.
dum = pd.get_dummies(
    vehicles_df_to_learn2.set_index('index')['title_status']
).reset_index()

In [None]:
# Preview the title_status indicator columns.
dum.head()

In [None]:
# Join the title_status indicator columns on the 'index' key, then discard the
# key together with the raw categorical columns.
vehicles_df_to_learn2 = vehicles_df_to_learn2.merge(dum, on='index')
vehicles_df_to_learn2 = vehicles_df_to_learn2.drop(
    ['index', 'transmission', 'title_status'],
    axis='columns',
)

In [None]:
# Preview the fully encoded frame.
vehicles_df_to_learn2.head()

In [None]:
# Split the encoded frame with the same proportions and seed as before, then
# fit a linear model on the numeric and indicator features.
vehicles_df_train2, vehicles_df_test2 = split(
    vehicles_df_to_learn2,
    train_size=0.6,
    random_state=4222,
)
y_train2 = vehicles_df_train2['price']
X_train2 = vehicles_df_train2[[
    'odometer', 'year',
    'transmission_automatic', 'transmission_manual', 'transmission_other',
    'clean', 'lien', 'missing', 'parts only', 'rebuilt', 'salvage',
]]
cars_lm2 = LinearRegression(fit_intercept=True)
cars_lm2.fit(X_train2, y_train2)

In [None]:
# Report the fitted parameters; coef_ has one weight per feature, so print the
# whole array rather than only coef_[0].
print("The model intercept is: {}".format(cars_lm2.intercept_))
print("The model coefficients are: {}".format(cars_lm2.coef_))
# assign() returns a new frame instead of writing into a selection taken from
# vehicles_df_train2 (SettingWithCopyWarning). Dropping a pre-existing
# 'Price_prediction' column before predicting keeps the cell re-runnable.
X_train2 = X_train2.assign(
    Price_prediction=cars_lm2.predict(
        X_train2.drop(columns='Price_prediction', errors='ignore')
    )
)
cars_train_rmse2 = np.sqrt(MSE(y_train2, X_train2['Price_prediction']))
print("RMSE = {:.2f}".format(cars_train_rmse2))

only ~ 0.1% less mistake than previously
lets do the same actions on the test data set

In [None]:
# Evaluate the model that was fitted on the training split (cars_lm2) on the
# held-out rows. Fitting a fresh model on (X_test2, y_test2) and scoring it on
# those same rows would report an in-sample error instead of test performance.
X_test2 = vehicles_df_test2[[
    'odometer', 'year',
    'transmission_automatic', 'transmission_manual', 'transmission_other',
    'clean', 'lien', 'missing', 'parts only', 'rebuilt', 'salvage',
]]
y_test2 = vehicles_df_test2['price']
cars_lm_test2 = cars_lm2  # name kept for any later cell that refers to it
# assign() returns a new frame, avoiding a write into a selection of vehicles_df_test2.
X_test2 = X_test2.assign(price_prediction=cars_lm_test2.predict(X_test2))
cars_test_rmse2 = np.sqrt(MSE(y_test2, X_test2['price_prediction']))
print("RMSE = {:.2f}".format(cars_test_rmse2))

Actually a bit worse than the original test on the numerical-only dataset

Continuing with the simplicity - applying a quick KNN regression:

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import pairwise_distances
from sklearn import neighbors
from math import sqrt
from sklearn.metrics import mean_squared_error 

create train and test sets:

In [None]:
# 60/40 split of the numeric-only frame; 'price' is the target and every other
# column is a feature.
vehicles_df_knn_train, vehicles_df_knn_test = split(
    vehicles_df_to_learn,
    train_size=0.6,
    random_state=4222,
)
y_first = vehicles_df_knn_train['price']
X_first = vehicles_df_knn_train.drop(columns='price')

y_second = vehicles_df_knn_test['price']
X_second = vehicles_df_knn_test.drop(columns='price')

Preprocessing – Scaling the features

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))

# Learn each feature's min/max from the training rows only.
X_first_scaled = scaler.fit_transform(X_first)
X_first = pd.DataFrame(X_first_scaled)

# Apply the SAME training-derived scaling to the test rows. Calling
# fit_transform here would re-fit the scaler on the test data, so train and
# test would be scaled differently and test statistics would leak in.
X_second_scaled = scaler.transform(X_second)
X_second = pd.DataFrame(X_second_scaled)

 Let us have a look at the error rate for different k values

In [None]:
# Test-split RMSE of a KNN regressor for every k from 1 to 20.
rmse_val2 = []
for K in range(1, 21):
    model = KNeighborsRegressor(n_neighbors=K)
    # fit() returns the estimator itself, so the prediction can be chained.
    pred = model.fit(X_first, y_first).predict(X_second)
    error = sqrt(mean_squared_error(y_second, pred))
    rmse_val2.append(error)
    print('RMSE value for k= ' , K , 'is:', error)

In [None]:
# Plot RMSE against k (elbow curve). Position i of rmse_val2 holds the error
# for k = i + 1, so the index is set to the real k values; the default 0-based
# index would shift every point one step to the left on the x-axis.
curve = pd.DataFrame(rmse_val2, index=range(1, len(rmse_val2) + 1))
curve.plot()

# we can  see K neighbours 4-7 being the best predictor in terms of error
The rmse is also significantly lower than the simple linear regression

Lets try and run the same model on additional categorical parameters:

In [None]:
# Preview the encoded frame that feeds the second KNN run.
vehicles_df_to_learn2.head()

In [None]:
# 60/40 split of the encoded frame; 'price' is the target and every other
# column is a feature.
vehicles_df_knn_train2, vehicles_df_knn_test2 = split(
    vehicles_df_to_learn2,
    train_size=0.6,
    random_state=4222,
)
y_first2 = vehicles_df_knn_train2['price']
X_first2 = vehicles_df_knn_train2.drop(columns='price')

y_second2 = vehicles_df_knn_test2['price']
X_second2 = vehicles_df_knn_test2.drop(columns='price')

In [None]:
scaler = MinMaxScaler(feature_range=(0, 1))

# Learn each feature's min/max from the training rows only.
X_first_scaled2 = scaler.fit_transform(X_first2)
X_first2 = pd.DataFrame(X_first_scaled2)

# Apply the SAME training-derived scaling to the test rows. Calling
# fit_transform here would re-fit the scaler on the test data, so train and
# test would be scaled differently and test statistics would leak in.
X_second_scaled2 = scaler.transform(X_second2)
X_second2 = pd.DataFrame(X_second_scaled2)

In [None]:
# Test-split RMSE of a KNN regressor on the encoded features for k = 3..7.
rmse_val3 = []
for i, K in enumerate(range(3, 8)):
    model2 = neighbors.KNeighborsRegressor(n_neighbors=K)
    # fit() returns the estimator itself, so the prediction can be chained.
    pred2 = model2.fit(X_first2, y_first2).predict(X_second2)
    error2 = sqrt(mean_squared_error(y_second2, pred2))
    rmse_val3.append(error2)
    print('RMSE value for k= ' , K , 'is:', error2)

We still see K neighbours 4-6 being the best predictors; however, adding the features actually produced a larger error

# The end for now.. Next steps would be adding more features and checking RMSE. Also run additional regression models to create a better prediction.