In [1]:
###################################################
# PROJECT: Rating Product & Sorting Reviews in Amazon
###################################################

###################################################
# Business Problem
###################################################

# One of the most important problems in e-commerce is to find the correct scores given to the products after sales.
# The solution to this problem is to provide more customer satisfaction for the e-commerce site, to stand out and buy products for the sellers
# means a complex shopping experience for those Another problem is the correct ordering of the comments given to the products.
# being kept under surveillance. Both financial loss as it will directly affect the sales of the product in which misleading comments stand out.
# as well as customer information will result. In the solution of these 2 basic problems, the e-commerce site and the sellers are browsing their sales.
# will complete the purchasing journey in a complex way.




In [2]:
###################################################
# Story of Dataset
###################################################


# This dataset, which includes Amazon product data, includes product categories and various metadata.
# The product with the most reviews in the electronics category has user ratings and reviews.

# Variables:
# reviewerID: User ID
# asin: Product ID
# reviewerName: User ID
# helpful: Degree of Beneficial Comment
# reviewText: Comment
# overall: Product Rating
# summary: Summary of Comment
# unixReviewTime: Comment Time
# reviewTime: Comment Time Raw
# day_diff: # of days passed from comment time
# helpful_yes: # of Beneficial Vote
# total_vote: Total # of vote

import pandas as pd
import math
import scipy.stats as st
from sklearn.preprocessing import MinMaxScaler

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 500)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [3]:
###################################################
# MISSION 1: Calculate Average Rating Based on Current Comments and Compare with Existing Average Rating.
###################################################


# In the shared data set, users gave points and comments to a product. Our aim in this task is to evaluate 
# the scores given by weighting them by date. It is necessary to compare the first average score with the weighted 
# score according to the date to be obtained.



###################################################
# Step 1: .Read Dataset ve Calculate Avarage Rating of Product
###################################################
df = pd.read_csv("/kaggle/input/d/dilekbarutu/amazon-review/amazon_review.csv")
df.head(20)
df.shape

df["overall"].mean()


###################################################
# Step 2: Calculate Time Based Weighted Avarage Rating of Product
###################################################

df["reviewTime"] = pd.to_datetime(df["reviewTime"])
current_date = df["reviewTime"].max()
df.head()
df["day_diff"] = (current_date - df["reviewTime"]).dt.days
df["day_diff"].describe().T

print(df.loc[(df["day_diff"] <= 280), "overall"].mean())
print(df.loc[(df["day_diff"] > 280) & (df["day_diff"] <= 430), "overall"].mean())
print(df.loc[(df["day_diff"] > 430) & (df["day_diff"] <= 600), "overall"].mean())
print(df.loc[(df["day_diff"] > 600), "overall"].mean())

# Function of Time Based Weighted Avarage Rating
def time_based_weighted_average(dataframe, w1=28, w2=26, w3=24, w4=22):
    return dataframe.loc[df["day_diff"] <= 280, "overall"].mean() * 28 / 100 + \
            dataframe.loc[(df["day_diff"] > 280) & (df["day_diff"] <= 430), "overall"].mean() * 26 / 100 + \
            dataframe.loc[(df["day_diff"] > 430) & (df["day_diff"] <= 600), "overall"].mean() * 24 / 100 + \
            dataframe.loc[(df["day_diff"] > 600), "overall"].mean() * 22 / 100
time_based_weighted_average(df)

time_based_weighted_average(df, 30, 26, 22, 22)

4.6957928802588995
4.636140637775961
4.571661237785016
4.4462540716612375


4.595593165128118

In [4]:
###################################################
# Mission 2: Specify 20 Reviews for the Product to be Displayed on the Product Detail Page
###################################################


###################################################
# Step 1. helpful_no Variable
###################################################

# Note:
# total_vote is the total number of up-downs given to a comment.
# up means helpful.
# There is no helpful_no variable in the data set, it must be generated over existing variables.

df["helpful_no"] = df["total_vote"] - df["helpful_yes"]
df.head(30)


###################################################
# Step 2. Calculate score_pos_neg_diff, score_average_rating and wilson_lower_bound Scores & Add These to Dataset
###################################################

# score_pos_neg_diff
def score_up_down_diff(up, down):
    return up - down
df["score_pos_neg_diff"] = df.apply(lambda x: score_up_down_diff(x["helpful_yes"],
                                                                             x["helpful_no"]), axis=1)

# score_average_rating
def score_average_rating(up, down):
    if up + down == 0:
        return 0
    return up / (up + down)

df["score_average_rating"] = df.apply(lambda x: score_average_rating(x["helpful_yes"], x["helpful_no"]), axis=1)

# wilson_lower_bound
def wilson_lower_bound(up, down, confidence=0.95):

    n = up + down
    if n == 0:
        return 0
    z = st.norm.ppf(1 - (1 - confidence) / 2)
    phat = 1.0 * up / n
    return (phat + z * z / (2 * n) - z * math.sqrt((phat * (1 - phat) + z * z / (4 * n)) / n)) / (1 + z * z / n)

df["wilson_lower_bound"] = df.apply(lambda x: wilson_lower_bound(x["helpful_yes"], x["helpful_no"]), axis=1)

##################################################
# Step 3. Specify & Sort 20 Comments
###################################################

df.sort_values("wilson_lower_bound", ascending=False).head(20)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,day_diff,helpful_yes,total_vote,helpful_no,score_pos_neg_diff,score_average_rating,wilson_lower_bound
2031,A12B7ZMXFI6IXY,B007WTAJTO,"Hyoun Kim ""Faluzure""","[1952, 2020]",[[ UPDATE - 6/19/2014 ]]So my lovely wife boug...,5.0,UPDATED - Great w/ Galaxy S4 & Galaxy Tab 4 10...,1367366400,2013-01-05,701,1952,2020,68,1884,0.96634,0.95754
3449,AOEAD7DPLZE53,B007WTAJTO,NLee the Engineer,"[1428, 1505]",I have tested dozens of SDHC and micro-SDHC ca...,5.0,Top of the class among all (budget-priced) mic...,1348617600,2012-09-26,802,1428,1505,77,1351,0.94884,0.93652
4212,AVBMZZAFEKO58,B007WTAJTO,SkincareCEO,"[1568, 1694]",NOTE: please read the last update (scroll to ...,1.0,1 Star reviews - Micro SDXC card unmounts itse...,1375660800,2013-05-08,578,1568,1694,126,1442,0.92562,0.91214
317,A1ZQAQFYSXL5MQ,B007WTAJTO,"Amazon Customer ""Kelly""","[422, 495]","If your card gets hot enough to be painful, it...",1.0,"Warning, read this!",1346544000,2012-02-09,1032,422,495,73,349,0.85253,0.81858
4672,A2DKQQIZ793AV5,B007WTAJTO,Twister,"[45, 49]",Sandisk announcement of the first 128GB micro ...,5.0,Super high capacity!!! Excellent price (on Am...,1394150400,2014-07-03,157,45,49,4,41,0.91837,0.80811
1835,A1J6VSUM80UAF8,B007WTAJTO,goconfigure,"[60, 68]",Bought from BestBuy online the day it was anno...,5.0,I own it,1393545600,2014-02-28,282,60,68,8,52,0.88235,0.78465
3981,A1K91XXQ6ZEBQR,B007WTAJTO,"R. Sutton, Jr. ""RWSynergy""","[112, 139]",The last few days I have been diligently shopp...,5.0,"Resolving confusion between ""Mobile Ultra"" and...",1350864000,2012-10-22,776,112,139,27,85,0.80576,0.73214
3807,AFGRMORWY2QNX,B007WTAJTO,R. Heisler,"[22, 25]",I bought this card to replace a lost 16 gig in...,3.0,"Good buy for the money but wait, I had an issue!",1361923200,2013-02-27,648,22,25,3,19,0.88,0.70044
4306,AOHXKM5URSKAB,B007WTAJTO,Stellar Eller,"[51, 65]","While I got this card as a ""deal of the day"" o...",5.0,Awesome Card!,1339200000,2012-09-06,822,51,65,14,37,0.78462,0.67033
4596,A1WTQUOQ4WG9AI,B007WTAJTO,"Tom Henriksen ""Doggy Diner""","[82, 109]",Hi:I ordered two card and they arrived the nex...,1.0,Designed incompatibility/Don't support SanDisk,1348272000,2012-09-22,806,82,109,27,55,0.75229,0.66359
