### RECOMMENDATION SYSTEM

Now that the data has been cleaned, the recommender system can be engaged. This will be a **contenet-based recommender system**  where recommendations are based on the similarity of the vehicle features and not on explicit (user preference or profile) user metadata.

In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# load the processed dataset 'vehicles2_rec.csv'
df = pd.read_csv('vehicles2_rec.csv')
df.head()

Unnamed: 0,region,price,year,manufacturer,model,fuel,odometer,title_status,transmission,condition,...,state,image_url,posting_date,postmd,Age,avg_mil,mil_rating,Made,type_group,color_group
0,bakersfield,11975,2003,harley-davidson,davidson,gas,207000.0,clean,automatic,excellent,...,ca,https://images.craigslist.org/00F0F_eEBhgvA15M...,2021-05-04T12:26:43-0700,May-04,22,9409.090909,below average,American,non-luxury_small,dark color
1,bakersfield,21950,2003,chevrolet,silverado 3500,diesel,202537.0,clean,automatic,excellent,...,ca,https://images.craigslist.org/00m0m_9owDGrPLWK...,2021-05-04T12:04:51-0700,May-04,22,9206.227273,below average,American,non-luxury_small,dark color
2,bakersfield,15590,2015,mini,hardtop 2 door cooper s,other,60291.0,clean,other,good,...,ca,https://images.craigslist.org/00f0f_dGlIKGTKrJ...,2021-05-04T11:41:15-0700,May-04,10,6029.1,below average,English,luxury_small,dark color
3,bakersfield,3975,2006,pontiac,g6,gas,187000.0,clean,automatic,excellent,...,ca,https://images.craigslist.org/00V0V_5uw5gVZjSk...,2021-05-04T11:35:04-0700,May-04,19,9842.105263,below average,American,luxury_small,light color
4,bakersfield,8000,1972,chevrolet,chevelle,gas,61000.0,clean,automatic,fair,...,ca,https://images.craigslist.org/00U0U_gvATaIvxTn...,2021-05-04T11:30:14-0700,May-04,53,1150.943396,below average,American,luxury_small,light color


In [19]:
def recommend(made, color_group, type_group, price_range, transmission):
    # Matching the type with the dataset and reset the index
    data = df.loc[(df['color_group']==color_group) 
                  & (df['type_group']==type_group) 
                  & ((df['price']>=price_range[0]) & (df['price']<=price_range[1]))
                  & (df['transmission'] == transmission)]  
    data.reset_index(level=0, inplace=True)
  
    # Convert the index into series
    indices = pd.Series(data.index, index=data['Made'])
    
    # Converting the car manufacturer country into vectors and used unigram
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1, stop_words='english')
    tfidf_matrix = tf.fit_transform(data['Made'])
    
    # Calculating the similarity measures based on Cosine Similarity
    sg = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    # Get the index corresponding to original_manufacturer
    idx = indices[made]
    
    # Get the pairwise similarity scores
    sig = list(enumerate(sg[idx]))
    
    # Sort the cars
    sig = sorted(sig, reverse=True)
    
    # Scores of the 6 most similar cars 
    sig = sig[0:6]
    
    # Car indices
    car_indices = [i[0] for i in sig]
   
    # Top 6 car recommendations
    rec = data[['price', 'Made', 'manufacturer', 'model', 'type', 'year', 'Age', 'condition', 'fuel', 
                'title_status', 'transmission', 'paint_color', 'mil_rating', 'state', 'drive']].iloc[car_indices]
    
    return data

In [20]:
data = recommend("American", "light color", "luxury_small", (5000, 10000), "automatic")
data

Unnamed: 0,index,region,price,year,manufacturer,model,fuel,odometer,title_status,transmission,...,state,image_url,posting_date,postmd,Age,avg_mil,mil_rating,Made,type_group,color_group
0,4,bakersfield,8000,1972,chevrolet,chevelle,gas,61000.0,clean,automatic,...,ca,https://images.craigslist.org/00U0U_gvATaIvxTn...,2021-05-04T11:30:14-0700,May-04,53,1150.943396,below average,American,luxury_small,light color
1,7,bakersfield,5995,2011,nissan,leaf,electric,52396.0,clean,automatic,...,ca,https://images.craigslist.org/00n0n_k8tUiku4S4...,2021-05-04T10:15:19-0700,May-04,14,3742.571429,below average,Japanese,luxury_small,light color
2,21,bakersfield,6600,2008,honda,accord,gas,127472.0,clean,automatic,...,ca,https://images.craigslist.org/00s0s_3IhUCab3b7...,2021-05-03T18:37:17-0700,May-04,17,7498.352941,below average,Japanese,luxury_small,light color
3,35,bakersfield,8500,2008,mercedes-benz,benz e350 sport,gas,128000.0,clean,automatic,...,ca,https://images.craigslist.org/00303_en6H0EIJGO...,2021-05-02T15:40:47-0700,May-02,17,7529.411765,below average,German,luxury_small,light color
4,47,bakersfield,9597,2014,kia,optima,gas,145789.0,clean,automatic,...,ca,https://images.craigslist.org/00Y0Y_bOHA3xGiRi...,2021-05-01T20:00:58-0700,May-02,11,13253.545455,below average,Korean,luxury_small,light color
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5870,103748,wausau,9450,2015,chevrolet,impala lt,gas,99611.0,clean,automatic,...,wi,https://images.craigslist.org/00g0g_5ifP39qBZ0...,2021-04-14T11:30:52-0500,April-14,10,9961.100000,below average,American,luxury_small,light color
5871,103766,wausau,7990,2012,chevrolet,cruze,gas,82505.0,clean,automatic,...,wi,https://images.craigslist.org/01010_i2D7BFj14b...,2021-04-14T10:30:45-0500,April-14,13,6346.538462,below average,American,luxury_small,light color
5872,103768,wausau,8990,2015,chevrolet,spark,gas,73133.0,clean,automatic,...,wi,https://images.craigslist.org/01313_34oE5AUahq...,2021-04-14T10:30:28-0500,April-14,10,7313.300000,below average,American,luxury_small,light color
5873,103792,wausau,6500,2010,lincoln,mkx,gas,160000.0,clean,automatic,...,wi,https://images.craigslist.org/00606_hF7SvHITgP...,2021-04-09T15:00:23-0500,April-09,15,10666.666667,below average,American,luxury_small,light color


In [13]:
df.manufacturer.value_counts().to_frame()

Unnamed: 0_level_0,count
manufacturer,Unnamed: 1_level_1
ford,18881
chevrolet,13445
toyota,10123
honda,6117
nissan,4823
jeep,4755
ram,4231
gmc,4104
bmw,3778
dodge,3306


In [12]:
df.manufacturer.value_counts().to_frame().index.tolist()

['ford',
 'chevrolet',
 'toyota',
 'honda',
 'nissan',
 'jeep',
 'ram',
 'gmc',
 'bmw',
 'dodge',
 'mercedes-benz',
 'subaru',
 'hyundai',
 'lexus',
 'volkswagen',
 'kia',
 'chrysler',
 'audi',
 'cadillac',
 'mazda',
 'acura',
 'infiniti',
 'buick',
 'lincoln',
 'volvo',
 'pontiac',
 'mitsubishi',
 'mini',
 'rover',
 'mercury',
 'saturn',
 'porsche',
 'jaguar',
 'fiat',
 'tesla',
 'alfa-romeo',
 'datsun',
 'harley-davidson',
 'land rover',
 'ferrari']

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(data['Made'])

from sklearn.metrics.pairwise import cosine_similarity
sg = cosine_similarity(tfidf_matrix, tfidf_matrix)
sg

array([[1., 0., 0., ..., 1., 1., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 1., 1., 0.],
       [1., 0., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])