In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os
# Show which files the input directory provides.
input_files = os.listdir("../input")
print(input_files)

In [None]:
# Global plotting defaults: white grid style and an A4-landscape figure size
# (in inches), configured in a single call.
sns.set(style='whitegrid', rc={'figure.figsize': (11.7, 8.27)})

In [None]:
# Load the raw reviews table.
reviews_path = '../input/Reviews.csv'
data = pd.read_csv(reviews_path)

In [None]:
# Column names, dtypes and non-null counts of the raw reviews table.
data.info()

In [None]:
# Summary statistics of the raw table (pandas describes numeric columns by default).
data.describe()

In [None]:
# Peek at the first rows of the raw table.
data.head()

In [None]:
# Reviewer display name and review timestamp are not used by the recommender.
data = data.drop(columns=['ProfileName', 'Time'])

In [None]:
# Share of helpfulness votes that were positive. The denominator column is
# taken to be the total number of votes (as in the Amazon Fine Food Reviews
# schema -- TODO confirm if a different dataset is loaded), so it must not be
# added to the numerator again.
total_votes = data['HelpfulnessDenominator']

# Mask rows without any vote so the division yields NaN instead of 0/0 or x/0.
helpful_share = data['HelpfulnessNumerator'] / total_votes.where(total_votes > 0)

# Reviews without votes count as "not helpful" (0). A share cannot exceed 1, so
# rows where the numerator is larger than the denominator are capped.
# The result is assigned back explicitly: fillna(..., inplace=True) on a column
# selection is chained assignment and is not guaranteed to modify `data`.
data['Helpfulness'] = helpful_share.fillna(0).clip(upper=1)

In [None]:
# The raw vote counts are no longer needed once `Helpfulness` exists.
vote_columns = ['HelpfulnessNumerator', 'HelpfulnessDenominator']
data = data.drop(columns=vote_columns)
data.info()

In [None]:
# Item-based filtering works on products only, so the reviewer id is dropped.
item_based = data.drop(columns=['UserId'])
item_based.head()

## Preprocessing

In [None]:
# Merge the review headline and body into a single text field.
# Missing parts are replaced by empty strings first: NaN + str is NaN, so a
# review with only one of the two columns missing would otherwise lose all of
# its text.
summary_text = item_based['Summary'].fillna('')
review_text = item_based['Text'].fillna('')
item_based['Remark'] = summary_text + ' ' + review_text

# The two source columns are now redundant.
item_based = item_based.drop(columns=['Summary', 'Text'])

In [None]:
# Check the frame after the text columns were merged into `Remark`.
item_based.head()

Convert the `Remark` feature into a sentiment polarity score. The column contains a very diverse set of words and contexts, so using the raw text directly would produce features that are too sparse to give good results.

In [None]:
from textblob import TextBlob 
import swifter

In [None]:
def to_sentiment(text):
    """Return the TextBlob sentiment polarity computed for ``text``."""
    return TextBlob(text).sentiment.polarity

In [None]:
# Replace missing remarks with a blank string so that every row can be passed
# to TextBlob. The filled column is assigned back explicitly because
# fillna(..., inplace=True) on a column selection is chained assignment and is
# not guaranteed to modify `item_based`.
item_based['Remark'] = item_based['Remark'].fillna(' ')

In [None]:
# Polarity score for every remark, computed through swifter's apply accessor.
remarks = item_based['Remark']
item_based['Sentiment'] = remarks.swifter.apply(to_sentiment)

In [None]:
# The raw text is represented by `Sentiment` from here on.
item_based.drop(columns=['Remark'], inplace=True)

In [None]:
# Summary statistics after the text was replaced by a sentiment score.
item_based.describe()

In [None]:
# Persist the engineered per-review features; the row index is written as the
# first column (the pandas default, spelled out here).
item_based.to_csv('item_based.csv', index=True)

That should be enough feature selection. Let's explore our features and fine-tune them.

In [None]:
# Density of the mean score per product.
mean_score_per_product = item_based.groupby('ProductId')['Score'].mean()
plt.figure(figsize=(16, 9))
sns.kdeplot(mean_score_per_product, color='grey', shade=True)

In [None]:
# Per-product spread of the review scores, used as an "unreliability" signal.
# ddof=0 gives the population standard deviation, which is defined (0.0) even
# for products with a single review. A negative ddof would divide by N + 1 and
# artificially shrink the spread of rarely reviewed products.
unreliability = item_based.groupby('ProductId')['Score'].std(ddof=0)

In [None]:
# Density of the per-product score spread.
sns.kdeplot(unreliability, color='grey', shade=True)

Final list of features

In [None]:
# Aggregate the review-level features to one value per product, sharing a
# single groupby object across all aggregations.
by_product = item_based.groupby('ProductId')

product_rating = by_product['Score'].mean()
product_rating_count = by_product['Score'].count()
product_helpful = by_product['Helpfulness'].mean()
product_sentiment = by_product['Sentiment'].mean()

# Product ids in order of first appearance; used to order the model rows.
products = item_based['ProductId'].unique()

In [None]:
# One row per product; every feature series is selected in the order given by
# `products`, so all columns share the same index.
feature_columns = {
    'Score': product_rating.loc[products],
    'Count': product_rating_count.loc[products],
    'Unreliability': unreliability.loc[products],
    'Sentiment': product_sentiment.loc[products],
    'Helpfulness': product_helpful.loc[products],
}
data_model = pd.DataFrame(feature_columns)
data_model.head()

In [None]:
# Value ranges of the aggregated per-product features.
data_model.describe()

In [None]:
def normalize(values):
    """Min-max scale ``values`` linearly onto the range [0, 10].

    The minimum of the input maps to 0 and the maximum to 10. For a DataFrame
    the scaling is applied column by column, because ``min``/``max`` are then
    computed per column.

    Parameters
    ----------
    values : pandas.Series, pandas.DataFrame or numpy.ndarray
        Numeric data to rescale. The input is not modified.

    Returns
    -------
    Same type as ``values``
        The rescaled data. A constant input (or a constant DataFrame column)
        has no spread to scale; all of its entries are mapped to 10, which is
        what the formula yields for values equal to the maximum.
    """
    mn = values.min()
    mx = values.max()
    span = mx - mn

    # A zero span would turn the result into NaN (0 * inf). Substituting 1 is
    # safe because (values - mx) is 0 everywhere in that case.
    if isinstance(span, pd.Series):
        span = span.where(span != 0, 1)  # per-column spans of a DataFrame
    elif span == 0:
        span = 1

    return 10.0 / span * (values - mx) + 10

In [None]:
# Rescale every feature column with `normalize` so they share one scale.
data_model = data_model.pipe(normalize)
data_model.head()

## Modelling the recommender

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Only nearest-neighbour lookups are needed, so an unsupervised neighbour index
# is the right estimator. A classifier fitted with one distinct label per
# product cannot learn anything from those labels.
from sklearn.neighbors import NearestNeighbors

engine = NearestNeighbors(n_neighbors=20)

data_points = data_model.values
labels = data_model.index.values  # product ids, row-aligned with data_points

print('Data: ', data_points)
print('Labels: ', labels)

# Row i of `data_points` belongs to product `labels[i]`; neighbour queries
# return row positions that are mapped back through the `data_model` index.
engine.fit(data_points)

Let's take the bad boy for a spin

In [None]:
product_id = 'B0098WV8F2'
n_recommendations = 20
product_data = data_model.loc[product_id].values

# The query product is part of the fitted data, so it is normally returned as
# its own nearest neighbour. Ask for one extra neighbour and drop the query
# row so a product is never recommended to itself.
neighbour_positions = engine.kneighbors(
    X=[product_data],
    n_neighbors=n_recommendations + 1,
    return_distance=False,
)[0]
query_position = data_model.index.get_loc(product_id)
kept_positions = [
    position for position in neighbour_positions if position != query_position
][:n_recommendations]

# Keep the (1, n) shape that kneighbors returns for a single query, so code
# indexing `recommended_products[0]` keeps working.
recommended_products = np.array([kept_positions])

# Plain list of product ids, nearest first.
products_list = data_model.index[kept_positions].tolist()

print('Recommended Products: ')
print(products_list)

In [None]:
def plot_recommended(feature):
    """Scatter ``feature`` against ``Score`` and highlight the recommendations.

    All products in ``data_model`` are drawn in grey; the rows selected by
    ``recommended_products[0]`` are drawn on top in orange.

    Parameters
    ----------
    feature : str
        Name of the ``data_model`` column to put on the y axis.

    Returns
    -------
    The matplotlib axes holding both scatter layers.
    """
    axes = data_model.plot(kind='scatter', x='Score', y=feature, color='grey', alpha=0.20)
    data_model.iloc[recommended_products[0]].plot(
        kind='scatter', x='Score', y=feature, color='orange', alpha=0.50, ax=axes
    )
    return axes


# One figure per feature; `ax` / `ax2` are kept as the names of the first and
# the last axes produced.
ax = plot_recommended('Count')
for feature in ('Unreliability', 'Sentiment', 'Helpfulness'):
    ax2 = plot_recommended(feature)