In [None]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [None]:
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python-Levenshtein-0.12.2.tar.gz (50 kB)
[?25l[K     |██████▌                         | 10 kB 23.8 MB/s eta 0:00:01[K     |█████████████                   | 20 kB 28.3 MB/s eta 0:00:01[K     |███████████████████▌            | 30 kB 32.6 MB/s eta 0:00:01[K     |██████████████████████████      | 40 kB 35.8 MB/s eta 0:00:01[K     |████████████████████████████████| 50 kB 6.2 MB/s 
Building wheels for collected packages: python-Levenshtein
  Building wheel for python-Levenshtein (setup.py) ... [?25l[?25hdone
  Created wheel for python-Levenshtein: filename=python_Levenshtein-0.12.2-cp37-cp37m-linux_x86_64.whl size=149858 sha256=61b6ad13238ec2d09e3667582bcedb843c4bf783ac9cfdcab95a8b29bb03096e
  Stored in directory: /root/.cache/pip/wheels/05/5f/ca/7c4367734892581bb5ff896f15027a932c551080b2abd3e00d
Successfully built python-Levenshtein
Installing collected packages: python-Levenshtein
Successfully installed python-Levenshtein-0.12.2


In [None]:
# Mount Google Drive at /content/drive; the dataset is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Import**

In [None]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from fuzzywuzzy import fuzz
from sklearn.preprocessing import OneHotEncoder
#Model

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb


#Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the listings CSV from the mounted Drive folder into the notebook-level frame `df`.
df = pd.read_csv('/content/drive/MyDrive/Project_for_EMSE6574/cars_2021_clean(price<104000).csv')

In [None]:
# The saved index column doubles as a "not yet preprocessed" marker, so re-running
# this cell neither raises KeyError on the drop nor rescales odometer a second time.
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns='Unnamed: 0')
    # Min-max scale odometer, then take the square root, to reduce its scale so
    # that it does not dominate the prediction model.
    df["odometer"] = np.sqrt(preprocessing.minmax_scale(df["odometer"]))
df.head()

Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state
0,33590,2014,gmc,sierra 1500 crew cab slt,good,8 cylinders,gas,0.380536,clean,other,UNKNOWN,UNKNOWN,pickup,white,al
1,22590,2010,chevrolet,silverado 1500,good,8 cylinders,gas,0.421987,clean,other,UNKNOWN,UNKNOWN,pickup,blue,al
2,39590,2020,chevrolet,silverado 1500 crew,good,8 cylinders,gas,0.218861,clean,other,UNKNOWN,UNKNOWN,pickup,red,al
3,30990,2017,toyota,tundra double cab sr,good,8 cylinders,gas,0.32064,clean,other,UNKNOWN,UNKNOWN,pickup,red,al
4,15000,2013,ford,f-150 xlt,excellent,6 cylinders,gas,0.565686,clean,automatic,rwd,full-size,truck,black,al


# OneHot Encode

Most of the features are categorical, so we need to encode them; we use one-hot encoding. The 'model' feature, however, has too many distinct values, which would produce too many dummy variables after encoding. We therefore first group the values of 'model' into a smaller number of classes.

**Fuzzy String Matching**

Fuzzy matching is to approximately match strings and determine how similar they are. We want to use fuzzy matching to classify values in 'model' variable. 

The classification is based on...

We tried three choices of popular model names: the top 30, the top 50, and the top 30 model names plus all manufacturer names.

...

The result: the top 30 model names plus all manufacturer names gave the best prediction accuracy.

In [None]:
# Create a new column that joins manufacturer and model with a space
# (e.g. "ford" + "f-150 xlt" -> "ford f-150 xlt").
df['manu_model'] = df['manufacturer'] + ' ' + df['model']
df.head()

Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state,manu_model
0,33590,2014,gmc,sierra 1500 crew cab slt,good,8 cylinders,gas,0.380536,clean,other,UNKNOWN,UNKNOWN,pickup,white,al,gmc sierra 1500 crew cab slt
1,22590,2010,chevrolet,silverado 1500,good,8 cylinders,gas,0.421987,clean,other,UNKNOWN,UNKNOWN,pickup,blue,al,chevrolet silverado 1500
2,39590,2020,chevrolet,silverado 1500 crew,good,8 cylinders,gas,0.218861,clean,other,UNKNOWN,UNKNOWN,pickup,red,al,chevrolet silverado 1500 crew
3,30990,2017,toyota,tundra double cab sr,good,8 cylinders,gas,0.32064,clean,other,UNKNOWN,UNKNOWN,pickup,red,al,toyota tundra double cab sr
4,15000,2013,ford,f-150 xlt,excellent,6 cylinders,gas,0.565686,clean,automatic,rwd,full-size,truck,black,al,ford f-150 xlt


In [None]:
from collections import Counter

# Count how often each "manufacturer model" label occurs and keep the 30 most frequent.
counter = Counter(df['manu_model'])
popular_model = counter.most_common(30)
temp_popular_model_names = [label for label, _count in popular_model]

# Candidate classes: the 30 popular labels followed by every manufacturer name.
popular_model_names = [*temp_popular_model_names, *df['manufacturer'].unique()]

# Labels that are not themselves one of the candidate classes.
other = df.loc[~df['manu_model'].isin(popular_model_names), 'manu_model']


partial_ratio is better.

Some models match more than one entry in `popular_model_names` (for example, 5 100).

In [None]:
def match_popular(word, name_list, threshold=50, scorer=None):
    '''
    Fuzzy-match one label against a list of candidate names and return the best candidate.

    INPUT:
    word: the label to classify (a value from the 'manu_model' column)
    name_list: candidate class names (values in 'popular_model_names')
    threshold: a best score less than or equal to this counts as "no match";
        defaults to 50
    scorer: callable (word, name) -> numeric similarity score; defaults to
        fuzz.partial_ratio
    OUTPUT:
    (name, score): the best-scoring candidate and its score. The name is 'other'
    when the best score does not exceed threshold. On ties the candidate that
    appears first in name_list is kept. With the default threshold, an empty
    name_list yields ('other', -99999).
    '''
    if scorer is None:
        scorer = fuzz.partial_ratio
    max_score = -99999  # sentinel below any real score, so the first candidate always replaces it
    max_name = word
    for name in name_list:
        temp_score = scorer(word, name)
        # Strict comparison: a later candidate must beat, not equal, the current best.
        if temp_score > max_score:
            max_score = temp_score
            max_name = name
    if max_score <= threshold:
        return 'other', max_score
    return max_name, max_score

In [None]:
# Listings repeat the same manufacturer+model label, so run the fuzzy match once per
# distinct label and map the result back, instead of re-scoring every row.
model_class_by_label = {
    label: match_popular(label, popular_model_names)[0]
    for label in df['manu_model'].unique()
}
df['model_class'] = df['manu_model'].map(model_class_by_label)

In [None]:
# Spot-check the assigned model_class next to the original manu_model label.
df.head()

Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state,manu_model,model_class
0,33590,2014,gmc,sierra 1500 crew cab slt,good,8 cylinders,gas,0.380536,clean,other,UNKNOWN,UNKNOWN,pickup,white,al,gmc sierra 1500 crew cab slt,gmc
1,22590,2010,chevrolet,silverado 1500,good,8 cylinders,gas,0.421987,clean,other,UNKNOWN,UNKNOWN,pickup,blue,al,chevrolet silverado 1500,chevrolet silverado 1500
2,39590,2020,chevrolet,silverado 1500 crew,good,8 cylinders,gas,0.218861,clean,other,UNKNOWN,UNKNOWN,pickup,red,al,chevrolet silverado 1500 crew,chevrolet silverado 1500
3,30990,2017,toyota,tundra double cab sr,good,8 cylinders,gas,0.32064,clean,other,UNKNOWN,UNKNOWN,pickup,red,al,toyota tundra double cab sr,toyota
4,15000,2013,ford,f-150 xlt,excellent,6 cylinders,gas,0.565686,clean,automatic,rwd,full-size,truck,black,al,ford f-150 xlt,ford f-150


'fuzz': temp_model_name=30, number of other is 73347, nunique:31

'fuzz2': temp_model_name=50, number of other is 60288, nunique:51

'fuzz3': popular_model_names = temp_popular_model_names (top 30) + all manufacturer names, number of other is 0, nunique: 70

**OneHot Encode**

In [None]:
# List the available columns before choosing which ones to one-hot encode.
df.columns

Index(['price', 'year', 'manufacturer', 'model', 'condition', 'cylinders',
       'fuel', 'odometer', 'title_status', 'transmission', 'drive', 'size',
       'type', 'paint_color', 'state', 'manu_model', 'model_class'],
      dtype='object')

In [None]:
# Categorical columns to expand into dummy variables; drop_first removes one
# level per column.
categorical_cols = [
    'condition', 'cylinders', 'fuel', 'title_status', 'transmission',
    'drive', 'size', 'type', 'paint_color', 'state', 'model_class',
]
df1 = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

**Train Test Split**

In [None]:
# Inspect the column names produced by the one-hot encoding.
df1.columns

Index(['price', 'year', 'manufacturer', 'model', 'odometer', 'manu_model',
       'condition_excellent', 'condition_fair', 'condition_good',
       'condition_like new',
       ...
       'model_class_subaru outback', 'model_class_tesla', 'model_class_toyota',
       'model_class_toyota camry', 'model_class_toyota corolla',
       'model_class_toyota prius', 'model_class_toyota rav4',
       'model_class_toyota tacoma', 'model_class_volkswagen',
       'model_class_volvo'],
      dtype='object', length=185)

In [None]:
# The target and the free-text identifier columns are not features.
non_feature_cols = ['price', 'manufacturer', 'model', 'manu_model']
# A later cell adds 'price_log' to df1; drop it when present so that re-running
# this cell afterwards cannot leak the (log) target into X.
X = df1.drop(columns=non_feature_cols).drop(columns='price_log', errors='ignore')
y = df1['price']

In [None]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=666
)

# Report the number of rows in each part of the split.
for message, part in (
    ('Our training prediction variable contains :', y_train),
    ('Our training independent variable contains :', X_train),
    ('Our testing prediction variable contains :', y_test),
    ('Our testing independent variable contains :', X_test),
):
    print(message, len(part), 'rows')

Our training prediction variable contains : 174174 rows
Our training independent variable contains : 174174 rows
Our testing prediction variable contains : 85788 rows
Our testing independent variable contains : 85788 rows


**Define a function to output statistics**

In [None]:
def reg_metrics(prediction_model, X_train, X_test, y_train, y_test):
    """Print and return RMSE and R-square of a fitted model on train and test data.

    Parameters
    ----------
    prediction_model : fitted estimator exposing ``predict`` and ``score``
    X_train, X_test : feature sets
    y_train, y_test : matching targets

    Returns
    -------
    tuple
        ``(RMSE_train, RMSE_test, R2_train, R2_test)``; the two R-square values
        are rounded to 4 decimals.
    """
    test_pred = prediction_model.predict(X_test)
    train_pred = prediction_model.predict(X_train)

    results = (
        np.sqrt(metrics.mean_squared_error(y_train, train_pred)),
        np.sqrt(metrics.mean_squared_error(y_test, test_pred)),
        np.round(prediction_model.score(X_train, y_train), 4),
        np.round(prediction_model.score(X_test, y_test), 4),
    )

    labels = (
        'RMSE on train data:',
        'RMSE on test data:',
        'R-square on train data:',
        'R-square on test data:',
    )
    for label, value in zip(labels, results):
        print(label, value)
    return results

In [None]:
def reg_metrics_0(prediction_model, X_train, X_test, y_train, y_test):
    """Print and return RMSE and R-square after flooring predictions at 0.

    Same report as ``reg_metrics``, but any negative prediction is replaced by 0
    before scoring, since a negative price is not realistic. Both RMSE and
    R-square are computed from the floored predictions.

    Parameters
    ----------
    prediction_model : fitted estimator exposing ``predict``
    X_train, X_test : feature sets
    y_train, y_test : matching targets

    Returns
    -------
    tuple
        ``(RMSE_train, RMSE_test, R2_train, R2_test)``; the two R-square values
        are rounded to 4 decimals.
    """
    # If a prediction is below 0, use 0 instead.
    pred_0 = np.maximum(prediction_model.predict(X_test), 0)
    train_pred_0 = np.maximum(prediction_model.predict(X_train), 0)

    RMSE_train = np.sqrt(metrics.mean_squared_error(y_train, train_pred_0))
    RMSE_test = np.sqrt(metrics.mean_squared_error(y_test, pred_0))
    # prediction_model.score() would re-predict and rate the raw, unfloored output,
    # which is inconsistent with the RMSE above; score the floored predictions instead.
    R2_train = np.round(metrics.r2_score(y_train, train_pred_0), 4)
    R2_test = np.round(metrics.r2_score(y_test, pred_0), 4)

    print('RMSE on train data:', RMSE_train)
    print('RMSE on test data:', RMSE_test)
    print('R-square on train data:', R2_train)
    print('R-square on test data:', R2_test)
    return (RMSE_train, RMSE_test, R2_train, R2_test)

In [None]:
# Empty summary table: one row per model, one column per metric.
acc_cols = ['RMSE_train', 'RMSE_test', 'R2_train', 'R2_test']
prediction_model_list = [
    'Linear Regression',
    'Log Data Linear Regression',
    'Decision Tree',
    'Random Forest Regression',
    'XGBoost Regression',
]
accuracy = pd.DataFrame(index=prediction_model_list, columns=acc_cols)


# **Model**

## **Linear Model- Baseline Model**

 **Log Data**

In [None]:
# Natural-log transform of the target, stored as a new column on df1.
# NOTE(review): np.log yields -inf for a price of 0 — confirm the data has no zero prices.
df1['price_log'] = np.log(df1['price'])


In [None]:
# Split the log target using the same X, test_size and random_state as the split on price.
# Note that this also rebinds X_train / X_test.
y_log = df1['price_log']
X_train, X_test, y_log_train, y_log_test = train_test_split(X,y_log,test_size=0.33,random_state=666)


## **Random Forest Regression**



In [None]:
# Seed the forest so the reported metrics and every prediction derived from it are
# reproducible; 666 matches the random_state used for the train/test split.
rf = RandomForestRegressor(random_state=666)
rf.fit(X_train, y_train)
rf_acc = reg_metrics(rf, X_train, X_test, y_train, y_test)
accuracy.loc['Random Forest Regression'] = rf_acc

RMSE on train data: 2146.940833331976
RMSE on test data: 5758.57865323411
R-square on train data: 0.9769
R-square on test data: 0.8343


In [None]:
# Random-forest predictions for the held-out rows, previewed as a one-column frame.
rfprediction = rf.predict(X_test)
rfdf = pd.DataFrame({'predict price': rfprediction})
rfdf.head()

Unnamed: 0,predict price
0,34991.1
1,8190.04
2,53257.08
3,10580.25
4,1456.03




---








---









---



### With the prediction of Random Forest Regression, we want to judge whether the deal is good or bad by comparing the price to the predicted price.


## **Good/Bad Deal**


In [None]:
def Good_Bad_Deal(model_name, min_price=1000):
    """Label held-out listings of one model class as a good or bad deal.

    A listing is a 'Good Deal' when its predicted price is strictly higher than
    its listed price, otherwise a 'Bad Deal'.

    Reads the notebook-level ``rfprediction``, ``X_test`` and ``df``.

    Parameters
    ----------
    model_name : str
        One-hot column of ``X_test`` that selects the class,
        e.g. 'model_class_kia'.
    min_price : number, default 1000
        Listings priced at or below this value are left out of the result.

    Returns
    -------
    pandas.DataFrame
        ``model_name``, 'predict price', the columns of ``df`` and
        'Good/Bad Deal', for the selected ``X_test`` rows.
    """
    # Align the predictions with the test rows so they can be joined by index.
    predicted = pd.Series(rfprediction, index=X_test.index, name='predict price')
    in_class = X_test[model_name] == 1
    deals = pd.concat(
        [X_test.loc[in_class, model_name], predicted[in_class]], axis=1
    )
    # NOTE(review): `df` is the notebook-level frame, whose odometer is rescaled and
    # which gains manu_model / model_class in earlier cells, yet the saved output of
    # this function shows raw odometer values — confirm which `df` state is intended.
    deals = deals.join(df)
    deals['Good/Bad Deal'] = np.where(
        deals['predict price'] > deals['price'], 'Good Deal', 'Bad Deal'
    )
    return deals[deals['price'] > min_price]

In [None]:
# Good/bad-deal labels for the test rows whose model_class_kia dummy is 1.
Good_Bad_Deal('model_class_kia')

Unnamed: 0,model_class_kia,predict price,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state,Good/Bad Deal
66293,1,10986.750000,8999,2014,kia,soul basecrossover 6a,excellent,4 cylinders,gas,41725.0,UNKNOWN,automatic,UNKNOWN,UNKNOWN,wagon,UNKNOWN,fl,Good Deal
78941,1,6308.870000,3200,2008,kia,rio,excellent,UNKNOWN,gas,128900.0,clean,automatic,fwd,UNKNOWN,sedan,UNKNOWN,id,Good Deal
105449,1,17339.360000,15984,2017,kia,sportage,UNKNOWN,4 cylinders,gas,80994.0,clean,automatic,4wd,UNKNOWN,SUV,white,ky,Good Deal
172858,1,8308.340000,6500,2011,kia,optima lx,good,UNKNOWN,gas,102550.0,clean,automatic,fwd,UNKNOWN,UNKNOWN,UNKNOWN,ny,Good Deal
196237,1,9503.340000,9251,2013,kia,soul,UNKNOWN,4 cylinders,gas,117124.0,clean,automatic,fwd,UNKNOWN,hatchback,UNKNOWN,or,Good Deal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22519,1,10037.560000,12988,2015,kia,optima hybrid,good,4 cylinders,electric,110110.0,clean,automatic,fwd,UNKNOWN,sedan,white,ca,Bad Deal
25172,1,5086.034286,3900,2009,kia,spectra ex,excellent,4 cylinders,gas,150996.0,clean,automatic,fwd,UNKNOWN,UNKNOWN,silver,ca,Good Deal
84923,1,35762.050000,36590,2020,kia,stinger gt sedan 4d,good,UNKNOWN,other,3137.0,clean,other,UNKNOWN,UNKNOWN,sedan,red,il,Bad Deal
216165,1,11984.234762,10999,2014,kia,sorento,UNKNOWN,UNKNOWN,gas,83516.0,clean,automatic,fwd,UNKNOWN,wagon,UNKNOWN,tn,Good Deal


In [None]:
# Good/bad-deal labels for the test rows whose "model_class_jeep wrangler" dummy is 1.
Good_Bad_Deal('model_class_jeep wrangler')

Unnamed: 0,model_class_jeep wrangler,predict price,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state,Good/Bad Deal
245634,1,35522.05,36683,2017,jeep,wrangler unlimited sport,UNKNOWN,UNKNOWN,gas,50612.0,clean,automatic,4wd,UNKNOWN,SUV,grey,wa,Bad Deal
215405,1,31712.00,31590,2017,jeep,wrangler unlimited sport,good,6 cylinders,gas,30577.0,clean,other,4wd,UNKNOWN,other,UNKNOWN,tn,Good Deal
22052,1,40517.00,39990,2018,jeep,wrangler unlimited all new,good,6 cylinders,other,15688.0,clean,other,4wd,UNKNOWN,other,silver,ca,Good Deal
44404,1,10061.43,10800,2002,jeep,wrangler,UNKNOWN,6 cylinders,gas,158000.0,clean,manual,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,co,Bad Deal
185489,1,20775.28,9500,1995,jeep,wrangler,like new,UNKNOWN,gas,17000.0,clean,automatic,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,oh,Good Deal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128615,1,7138.37,8500,1997,jeep,wrangler,good,4 cylinders,gas,189065.0,clean,automatic,4wd,UNKNOWN,wagon,red,mi,Bad Deal
233021,1,28287.92,39998,2014,jeep,wrangler unlimited rubicon,excellent,6 cylinders,gas,71200.0,clean,manual,4wd,full-size,SUV,black,tx,Bad Deal
240005,1,9571.15,6300,1995,jeep,wrangler,excellent,4 cylinders,gas,204000.0,clean,manual,4wd,UNKNOWN,UNKNOWN,white,va,Good Deal
18109,1,11844.95,12900,1998,jeep,wrangler,excellent,4 cylinders,gas,79300.0,clean,manual,4wd,UNKNOWN,UNKNOWN,white,ca,Bad Deal


## **See how much cheaper the price is than the predicted price and find a Best_Car**


In [None]:
def Best_Car(model_name, min_price=1000,
             csv_path='/content/drive/MyDrive/Project_for_EMSE6574/cars_2021_clean(price<104000).csv'):
    """Return the held-out listing of one model class that is cheapest relative to its predicted price.

    The relative discount is ``(predicted - price) / predicted``; the listing
    with the largest discount is returned.

    Reads the notebook-level ``rfprediction`` and ``X_test``. The listings are
    read from ``csv_path`` once and joined to the predictions by row index, so
    the CSV rows must line up with the index of ``X_test``.

    Parameters
    ----------
    model_name : str
        One-hot column of ``X_test`` that selects the class,
        e.g. 'model_class_lexus'.
    min_price : number, default 1000
        Listings priced at or below this value are not considered.
    csv_path : str
        Listings CSV; its 'Unnamed: 0' column is dropped after loading.

    Returns
    -------
    pandas.DataFrame
        The single best row taken from the CSV, or an empty frame when no
        listing qualifies.
    """
    listings = pd.read_csv(csv_path).drop('Unnamed: 0', axis=1)

    # Align the predictions with the test rows, then keep only the requested class.
    predicted = pd.Series(rfprediction, index=X_test.index, name='predict price')
    candidates = predicted[X_test[model_name] == 1].to_frame().join(listings[['price']])
    candidates = candidates[candidates['price'] > min_price]

    discount = (candidates['predict price'] - candidates['price']) / candidates['predict price']
    # Ascending sort, so the last entry carries the largest discount.
    best_index = discount.sort_values().iloc[-1:].index
    return listings.loc[best_index]


In [None]:
# Best-value listing among the test rows whose model_class_lexus dummy is 1.
Best_Car('model_class_lexus')

Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state
66877,1200,2013,lexus,es 350,excellent,6 cylinders,gas,92699.0,clean,automatic,fwd,UNKNOWN,sedan,UNKNOWN,fl


In [None]:
# Best-value listing among the test rows whose "model_class_jeep wrangler" dummy is 1.
Best_Car('model_class_jeep wrangler')

Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state
5653,2000,2015,jeep,wrangler jku,good,UNKNOWN,gas,1.0,clean,other,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,az
