# 1. Import and reading dataset

In [250]:
import warnings
warnings.filterwarnings('ignore')

In [251]:
import pandas as pd
import re
import string
from bs4 import BeautifulSoup

In [252]:
# from google.colab import drive
# drive.mount('/content/gdrive')

In [336]:
file_path = "https://raw.githubusercontent.com/Roger-Arnold/Sentiment_Classifier/refs/heads/main/sample30.csv"

In [254]:
df = pd.read_csv(file_path)

In [255]:
df['reviews_text'].head()

0    i love this album. it's very good. more to the...
1    Good flavor. This review was collected as part...
2                                         Good flavor.
3    I read through the reviews on here before look...
4    My husband bought this gel for us. The gel cau...
Name: reviews_text, dtype: object

# 2.Handling missing values

In [256]:
df.isna().sum().to_frame(name='Null_Values').sort_values(by='Null_Values', ascending=False)

Unnamed: 0,Null_Values
reviews_userProvince,29830
reviews_userCity,28071
reviews_didPurchase,14068
reviews_doRecommend,2570
reviews_title,190
manufacturer,141
reviews_username,63
reviews_date,46
user_sentiment,1
id,0


In [257]:
df[['id','reviews_text']][ df['user_sentiment'].isna() == True]

Unnamed: 0,id,reviews_text
28354,AVpfRTh1ilAPnD_xYic2,my kids absolutely loved this film so much tha...


In [258]:
df['user_sentiment'].fillna(value = 'Positive', inplace = True)

In [259]:
df['user_sentiment']=df.user_sentiment.map({'Positive':1 , 'Negative':0})

# 3. Text Processing

In [260]:
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML tags
    return text

In [261]:
df['reviews_text'] = df['reviews_text'].apply(clean_text)

In [262]:
df['reviews_text']

0        i love this album its very good more to the hi...
1        good flavor this review was collected as part ...
2                                              good flavor
3        i read through the reviews on here before look...
4        my husband bought this gel for us the gel caus...
                               ...                        
29995    i got this conditioner with influenster to try...
29996    i love it  i received this for review purposes...
29997    first of all i love the smell of this product ...
29998    i received this through influenster and will n...
29999    i received this product complimentary from inf...
Name: reviews_text, Length: 30000, dtype: object

# 4.Word Tokenization

In [263]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Roger
[nltk_data]     Arnold\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [264]:
df['reviews_text'] = df['reviews_text'].apply(word_tokenize)

In [265]:
df['reviews_text'].head()

0    [i, love, this, album, its, very, good, more, ...
1    [good, flavor, this, review, was, collected, a...
2                                       [good, flavor]
3    [i, read, through, the, reviews, on, here, bef...
4    [my, husband, bought, this, gel, for, us, the,...
Name: reviews_text, dtype: object

# 5. Removal of Stop Words

In [266]:
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to C:\Users\Roger
[nltk_data]     Arnold\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [267]:
def remove_stopwords(text):
    return [word for word in text if word not in stop_words]

In [268]:
df['reviews_text'] = df['reviews_text'].apply(remove_stopwords)

In [269]:
df['reviews_text']

0        [love, album, good, hip, hop, side, current, p...
1        [good, flavor, review, collected, part, promot...
2                                           [good, flavor]
3        [read, reviews, looking, buying, one, couples,...
4        [husband, bought, gel, us, gel, caused, irrita...
                               ...                        
29995    [got, conditioner, influenster, try, im, lovin...
29996    [love, received, review, purposes, influenster...
29997    [first, love, smell, product, wash, hair, smoo...
29998    [received, influenster, never, go, back, anyth...
29999    [received, product, complimentary, influenster...
Name: reviews_text, Length: 30000, dtype: object

In [270]:
pip install emoji --upgrade

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# 6. Removing emoji characters if any

In [271]:
import emoji
df['reviews_text'] = df['reviews_text'].apply(lambda x: [emoji.demojize(word) for word in x])

In [272]:
df['reviews_text'].head()

0    [love, album, good, hip, hop, side, current, p...
1    [good, flavor, review, collected, part, promot...
2                                       [good, flavor]
3    [read, reviews, looking, buying, one, couples,...
4    [husband, bought, gel, us, gel, caused, irrita...
Name: reviews_text, dtype: object

# 7. Lemmatizing the text data

In [273]:
# import nltk
# nltk.download()

In [274]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to C:\Users\Roger
[nltk_data]     Arnold\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [275]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Roger
[nltk_data]     Arnold\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [276]:
def word_lemmatizer(text):
    return [WordNetLemmatizer().lemmatize(word) for word in text]

In [277]:
df['reviews_text'] = df['reviews_text'].apply(word_lemmatizer)

In [278]:
df['reviews_text'].head()

0    [love, album, good, hip, hop, side, current, p...
1    [good, flavor, review, collected, part, promot...
2                                       [good, flavor]
3    [read, review, looking, buying, one, couple, l...
4    [husband, bought, gel, u, gel, caused, irritat...
Name: reviews_text, dtype: object

# 8. Joining all the tokens as a single text

In [279]:
def join_text(text):
    return " ".join(text)

df['reviews_text'] = df['reviews_text'].apply(join_text)

# 9. Train test split

In [280]:
from sklearn.model_selection import train_test_split

In [281]:
df['user_sentiment']

0        1
1        1
2        1
3        0
4        0
        ..
29995    1
29996    1
29997    1
29998    1
29999    1
Name: user_sentiment, Length: 30000, dtype: int64

In [282]:
X = df.reviews_text
y = df.user_sentiment

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.20, random_state=1, stratify=y )

# 10. Vectorization of words

In [283]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [284]:
bow_vectorizer = CountVectorizer(max_features=10000)
bow_vectorizer.fit(X_train)

# transform
bow_X_train = bow_vectorizer.transform(X_train)
bow_X_test = bow_vectorizer.transform(X_test)

In [285]:
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
tfidf_vectorizer.fit(X_train)

# transform
tfidf_X_train = tfidf_vectorizer.transform(X_train)
tfidf_X_test = tfidf_vectorizer.transform(X_test)

# 11. Trying out with different models

In [286]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score

### i. Generic function for training and evaluation

In [287]:
data = pd.DataFrame()

In [289]:
def train_and_eval(model, trainX, trainY, testX, testY):

    # training
    _ = model.fit(trainX, trainY)

    # predictions
    y_preds_train = model.predict(trainX)
    y_preds_test = model.predict(testX)
 
    Train_accuracy_score = round(accuracy_score(y_train,y_preds_train)*100,3)
    Test_accuracy_score =  round(accuracy_score(y_test,y_preds_test)*100,3)

    stats = {  "Model" :  model ,
               "Train_accuracy_score" : Train_accuracy_score ,
               "Test_accuracy_score": Test_accuracy_score }
    return stats

###  ii . Logistic Regression Model ( BOW, TF-IDF )

In [290]:
C = [0.001, 0.01, 0.1, 1, 10]

# print("Logistic Regression with bag of words vectors")
for c in C:
    # Define model
    log_model = LogisticRegression(C=c, max_iter=500, random_state=1)

    # Train and evaluate model
    new_data = train_and_eval(model=log_model,
                   trainX=bow_X_train,
                   trainY=y_train,
                   testX=bow_X_test,
                   testY=y_test)
    new_data["Vectorization"] = "BOW"
    new_data["Algorithm"] = "Logistic_Regression"
    data = data._append(new_data, ignore_index=True)

# print("\nLogistic Regression with TF-IDF vectors")

for c in C:
    # Define model
    log_model = LogisticRegression(C=c, max_iter=500, random_state=1)

    # Train and evaluate model
    new_data = train_and_eval(model=log_model,
                   trainX=tfidf_X_train,
                   trainY=y_train,
                   testX=tfidf_X_test,
                   testY=y_test)
    new_data["Vectorization"] = "TF-IDF"
    new_data["Algorithm"] = "Logistic_Regression"
    data = data._append(new_data, ignore_index=True)


###  iii . Multinomial Naive Bayes Model ( BOW, TF-IDF )

In [291]:
alphas = [0, 0.2, 0.6, 0.8, 1]

for a  in alphas:
    # Define model
    nb_model = MultinomialNB(alpha=a)

    # Train and evaluate model
    new_data = train_and_eval(model=nb_model,
                   trainX=bow_X_train,
                   trainY=y_train,
                   testX=bow_X_test,
                   testY=y_test)
    new_data["Vectorization"] = "BOW"
    new_data["Algorithm"] = "MultinomialNB"
    data = data._append(new_data, ignore_index=True)

for a  in alphas:
    # Define model
    nb_model = MultinomialNB(alpha=a)

    # Train and evaluate model
    new_data = train_and_eval(model=nb_model,
                   trainX=tfidf_X_train,
                   trainY=y_train,
                   testX=tfidf_X_test,
                   testY=y_test)
    new_data["Vectorization"] = "TF-IDF"
    new_data["Algorithm"] = "MultinomialNB"
    data = data._append(new_data, ignore_index=True)


### iv . K - Nearest Neighbours Model ( BOW, TF-IDF )

In [292]:
neighbours = [5,10,15,20,25,30]

for n  in neighbours:
    # Define model
    knn_model = KNeighborsClassifier(n_neighbors=n)

    # Train and evaluate model
    new_data = train_and_eval(model=knn_model,
                   trainX=tfidf_X_train,
                   trainY=y_train,
                   testX=tfidf_X_test,
                   testY=y_test)
    new_data["Vectorization"] = "TF-IDF"
    new_data["Algorithm"] = "KNeighborsClassifier"
    data = data._append(new_data, ignore_index=True)

for n  in neighbours:
    # Define model
    knn_model = KNeighborsClassifier(n_neighbors=n)

    # Train and evaluate model
    new_data = train_and_eval(model=knn_model,
                   trainX=bow_X_train,
                   trainY=y_train,
                   testX=bow_X_test,
                   testY=y_test)
    new_data["Vectorization"] = "BOW" 
    new_data["Algorithm"] = "KNeighborsClassifier"
    data = data._append(new_data, ignore_index=True)

In [293]:
data.shape

(32, 5)

In [294]:
data["Score_difference"] = abs(data["Test_accuracy_score"] - data["Train_accuracy_score"])

In [295]:
data.sort_values(by="Test_accuracy_score", ascending=False,inplace=True)

# "Logistic Regression has the highest average test accuracy with very less overfitting"

In [296]:
data.head(5)

Unnamed: 0,Model,Train_accuracy_score,Test_accuracy_score,Vectorization,Algorithm,Score_difference
3,"LogisticRegression(C=1, max_iter=500, random_s...",98.121,95.3,BOW,Logistic_Regression,2.821
4,"LogisticRegression(C=10, max_iter=500, random_...",99.329,95.117,BOW,Logistic_Regression,4.212
9,"LogisticRegression(C=10, max_iter=500, random_...",98.4,94.767,TF-IDF,Logistic_Regression,3.633
2,"LogisticRegression(C=0.1, max_iter=500, random...",94.133,92.633,BOW,Logistic_Regression,1.5
8,"LogisticRegression(C=1, max_iter=500, random_s...",93.325,92.033,TF-IDF,Logistic_Regression,1.292


In [297]:
data.groupby(["Algorithm"])["Test_accuracy_score"].mean()

Algorithm
KNeighborsClassifier    78.9860
Logistic_Regression     91.4365
MultinomialNB           89.1449
Name: Test_accuracy_score, dtype: float64

In [393]:
best_model = data.head(1)

###  Best Model

In [394]:
best_model

Unnamed: 0,Model,Train_accuracy_score,Test_accuracy_score,Vectorization,Algorithm,Score_difference
3,"LogisticRegression(C=1, max_iter=500, random_s...",98.121,95.3,BOW,Logistic_Regression,2.821


# 12. Building Recommendation system

### Constructing Pivot table

In [298]:
# Copy the train dataset into dummy_train
dummy_train = df.copy()

In [299]:
# The products not rated by user is marked as 1 for prediction.
dummy_train['reviews_rating'] = dummy_train['reviews_rating'].apply(lambda x: 0 if x>=1 else 1)

In [300]:
# Convert the dummy train dataset into matrix format.
dummy_train = dummy_train.pivot_table(index=['reviews_username'], columns=['id'], values='reviews_rating')
dummy_train.fillna(1, inplace=True)

### Train and test split for reviews_rating

In [301]:
# Test and Train split of the dataset.
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.30, random_state=31)

In [302]:
print(train.shape)
print(test.shape)

(21000, 15)
(9000, 15)


In [303]:
df_pivot = train.pivot_table(index=['reviews_username'], columns=['id'], values='reviews_rating')
df_pivot.fillna(0, inplace=True)

In [304]:
df_pivot.shape

(18252, 252)

###  Creating dummy train & dummy test dataset

In [305]:
# Copy the train dataset into dummy_train
dummy_train = train.copy()

In [306]:
# The movies not rated by user is marked as 1 for prediction.
dummy_train['reviews_rating'] = dummy_train['reviews_rating'].apply(lambda x: 0 if x>=1 else 1)

In [307]:
# Convert the dummy train dataset into matrix format.
# dummy_train = dummy_train.pivot(
#     index='reviews_username',
#     columns='movieId',
#     values='rating'
# ).fillna(1)
dummy_train = dummy_train.pivot_table(index=['reviews_username'], columns=['id'], values='reviews_rating')
dummy_train.fillna(1, inplace=True)

In [308]:
dummy_train.head()

id,AV13O1A8GV-KLJ3akUyj,AV14LG0R-jtxr-f38QfS,AV16khLE-jtxr-f38VFn,AV1YGDqsGV-KLJ3adc-O,AV1YIch7GV-KLJ3addeG,AV1YlENIglJLPUi8IHsX,AV1YmBrdGV-KLJ3adewb,AV1YmDL9vKc47QAVgr7_,AV1Ymf_rglJLPUi8II2v,AV1Yn94nvKc47QAVgtst,...,AVpfr5cb1cnluZ0-pZFp,AVpfrFDZLJeJML43Bmv0,AVpfrTyiLJeJML43BrSI,AVpfrfHF1cnluZ0-pRai,AVpfrgjFLJeJML43BvCc,AVpfs0tUilAPnD_xgqN2,AVpfthSailAPnD_xg3ON,AVpftikC1cnluZ0-p31V,AVpfv4TlilAPnD_xhjNS,AVpfvieo1cnluZ0-qdnu
reviews_username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00dog3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
00sab00,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
01impala,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0325home,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
06stidriver,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Using Adjusted Cosine similarity

In [309]:
# Create a user-movie matrix.
df_pivot = train.pivot_table(index=['reviews_username'], columns=['id'], values='reviews_rating')

In [310]:
df_pivot.head()

id,AV13O1A8GV-KLJ3akUyj,AV14LG0R-jtxr-f38QfS,AV16khLE-jtxr-f38VFn,AV1YGDqsGV-KLJ3adc-O,AV1YIch7GV-KLJ3addeG,AV1YlENIglJLPUi8IHsX,AV1YmBrdGV-KLJ3adewb,AV1YmDL9vKc47QAVgr7_,AV1Ymf_rglJLPUi8II2v,AV1Yn94nvKc47QAVgtst,...,AVpfr5cb1cnluZ0-pZFp,AVpfrFDZLJeJML43Bmv0,AVpfrTyiLJeJML43BrSI,AVpfrfHF1cnluZ0-pRai,AVpfrgjFLJeJML43BvCc,AVpfs0tUilAPnD_xgqN2,AVpfthSailAPnD_xg3ON,AVpftikC1cnluZ0-p31V,AVpfv4TlilAPnD_xhjNS,AVpfvieo1cnluZ0-qdnu
reviews_username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00dog3,,,,,,,,,,,...,,,,,,,,,,
00sab00,,,,,,,,,,,...,,,,,,,,,,
01impala,,,,,,,,,,,...,,,,,,,,,,
0325home,,,,,,,,,,,...,,,,,,,,,,
06stidriver,,,,,,,,,,,...,,,,,,,,,,


### Normalising the rating of the movie for each user around 0 mean

In [311]:
mean = np.nanmean(df_pivot, axis=1)
df_subtracted = (df_pivot.T-mean).T

In [312]:
df_subtracted.head()

id,AV13O1A8GV-KLJ3akUyj,AV14LG0R-jtxr-f38QfS,AV16khLE-jtxr-f38VFn,AV1YGDqsGV-KLJ3adc-O,AV1YIch7GV-KLJ3addeG,AV1YlENIglJLPUi8IHsX,AV1YmBrdGV-KLJ3adewb,AV1YmDL9vKc47QAVgr7_,AV1Ymf_rglJLPUi8II2v,AV1Yn94nvKc47QAVgtst,...,AVpfr5cb1cnluZ0-pZFp,AVpfrFDZLJeJML43Bmv0,AVpfrTyiLJeJML43BrSI,AVpfrfHF1cnluZ0-pRai,AVpfrgjFLJeJML43BvCc,AVpfs0tUilAPnD_xgqN2,AVpfthSailAPnD_xg3ON,AVpftikC1cnluZ0-p31V,AVpfv4TlilAPnD_xhjNS,AVpfvieo1cnluZ0-qdnu
reviews_username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00dog3,,,,,,,,,,,...,,,,,,,,,,
00sab00,,,,,,,,,,,...,,,,,,,,,,
01impala,,,,,,,,,,,...,,,,,,,,,,
0325home,,,,,,,,,,,...,,,,,,,,,,
06stidriver,,,,,,,,,,,...,,,,,,,,,,


###  Finding Adjusted Cosine similarity

In [313]:
from sklearn.metrics.pairwise import pairwise_distances

In [314]:
# Creating the User Similarity Matrix using pairwise_distance function.
user_correlation = 1 - pairwise_distances(df_subtracted.fillna(0), metric='cosine')
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


###  Prediction User-User

In [315]:
print(user_correlation)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [316]:
user_correlation[user_correlation<0]=0
user_correlation

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [317]:
user_predicted_ratings = np.dot(user_correlation, df_pivot.fillna(0))
user_predicted_ratings

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [318]:
user_predicted_ratings.shape

(18252, 252)

In [319]:
user_final_rating = np.multiply(user_predicted_ratings,dummy_train)
user_final_rating.head()

id,AV13O1A8GV-KLJ3akUyj,AV14LG0R-jtxr-f38QfS,AV16khLE-jtxr-f38VFn,AV1YGDqsGV-KLJ3adc-O,AV1YIch7GV-KLJ3addeG,AV1YlENIglJLPUi8IHsX,AV1YmBrdGV-KLJ3adewb,AV1YmDL9vKc47QAVgr7_,AV1Ymf_rglJLPUi8II2v,AV1Yn94nvKc47QAVgtst,...,AVpfr5cb1cnluZ0-pZFp,AVpfrFDZLJeJML43Bmv0,AVpfrTyiLJeJML43BrSI,AVpfrfHF1cnluZ0-pRai,AVpfrgjFLJeJML43BvCc,AVpfs0tUilAPnD_xgqN2,AVpfthSailAPnD_xg3ON,AVpftikC1cnluZ0-p31V,AVpfv4TlilAPnD_xhjNS,AVpfvieo1cnluZ0-qdnu
reviews_username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00dog3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00sab00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01impala,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0325home,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
06stidriver,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 13. Finding the top 20 recommendation for the *user*

In [320]:
# Take the user ID as input.
user_input = (input("Enter your user name\n"))

Enter your user name
joshua


In [321]:
d = user_final_rating.loc[user_input].sort_values(ascending=False)[0:20]

In [322]:
d = pd.DataFrame(d)

In [323]:
product_name_id = df.groupby(["id"])["name"].sum()

In [324]:
product_name_id = pd.DataFrame(product_name_id)

In [325]:
recommendation = d.merge(product_name_id, left_on='id', right_on='id')

In [326]:
d.shape

(20, 1)

In [327]:
recommendation

Unnamed: 0_level_0,joshua,name
id,Unnamed: 1_level_1,Unnamed: 2_level_1
AVpfPaoqLJeJML435Xk9,16.323714,Godzilla 3d Includes Digital Copy Ultraviolet ...
AVpfM_ytilAPnD_xXIJb,7.939935,Tostitos Bite Size Tortilla ChipsTostitos Bite...
AVpfPnrU1cnluZ0-g9rL,6.16002,Stargate (ws) (ultimate Edition) (director's C...
AVpfOIrkilAPnD_xXgDG,4.776709,Alex Cross (dvdvideo)Alex Cross (dvdvideo)Alex...
AVpe41TqilAPnD_xQH3d,4.478607,Mike Dave Need Wedding Dates (dvd + Digital)Mi...
AVpe59io1cnluZ0-ZgDU,3.752777,My Big Fat Greek Wedding 2 (blu-Ray + Dvd + Di...
AV1YGDqsGV-KLJ3adc-O,3.333333,Windex Original Glass Cleaner Refill 67.6oz (2...
AVpfrFDZLJeJML43Bmv0,3.273268,Meguiar's Ultimate Quik Detailer 22-Oz.Meguiar...
AVpe9W4D1cnluZ0-avf0,3.273268,Hoover174 Platinum Collection153 Lightweight B...
AVpf5olc1cnluZ0-tPrO,2.886751,Chester's Cheese Flavored Puffcorn SnacksChest...


# 14. Top 5 products with high percentage of positive reviews

In [497]:
id_values = [i for i in recommendation.index]

In [498]:
total_reviews={"id": [],"Count" :[]}

In [499]:
for i in id_values:
    total_reviews["id"].append(i)
    total_reviews["Count"].append(int(df["id"][ df["id"] == i].count()))

In [500]:
total_reviews = pd.DataFrame(total_reviews)

In [501]:
total_reviews = total_reviews.merge(recommendation,left_on='id',right_on='id')

In [503]:
positive_reviews = df[["id","user_sentiment"]]

In [504]:
positive_reviews = positive_reviews.groupby("id").sum()

In [506]:
positive_reviews = positive_reviews.merge(total_reviews,left_on='id',right_on='id')

In [508]:
positive_reviews["Positive_Percentage"] = round((positive_reviews["user_sentiment"]/positive_reviews["Count"])*100,3)

In [510]:
positive_reviews.sort_values(["Positive_Percentage"],ascending=False,inplace=True)

###  These are the top 5 recommended products which has the highest positive reviews

In [514]:
positive_reviews[["name","Positive_Percentage"]].head(5)

Unnamed: 0,name,Positive_Percentage
13,Stargate (ws) (ultimate Edition) (director's C...,96.237
4,My Big Fat Greek Wedding 2 (blu-Ray + Dvd + Di...,95.958
19,Meguiar's Ultimate Quik Detailer 22-Oz.Meguiar...,93.103
15,Planes: Fire Rescue (2 Discs) (includes Digita...,93.001
6,Hoover174 Platinum Collection153 Lightweight B...,91.436
