# Sorting Reviews

In [34]:
import numpy as np
import pandas as pd
import math
import scipy.stats as st

## Type 1: Up Down Difference Score

Up-Down Diff Score = (Up Ratings) - (Down Ratings)

In [5]:
def score_up_down_diff(up, down):
    """Return the net vote count: up-votes minus down-votes."""
    net_votes = up - down
    return net_votes

**Review 1 Score**

In [7]:
# Review 1: 600 up-votes, 400 down-votes.
score_up_down_diff(600, 400)

200

**Review 2 Score**

In [8]:
# Review 2: 5500 up-votes, 4500 down-votes.
score_up_down_diff(5500, 4500)

1000

## Type 2: Average Rating (Up Ratio)

Average Rating = Up / Total

In [13]:
def score_average_rating(up, down):
    """Return the share of up-votes among all votes.

    Yields 0 when there are no votes at all, which avoids dividing by zero.
    """
    total = up + down
    return up / total if total != 0 else 0

In [14]:
# Up-vote ratio for 600 up-votes and 400 down-votes.
score_average_rating(600, 400)

0.6

In [16]:
# Up-vote ratio for 5500 up-votes and 4500 down-votes.
score_average_rating(5500, 4500)

0.55

In [29]:
score_average_rating(2, 0) # not realistic: just two up-votes already yield a perfect 1.0

1.0

## Type 3: Wilson Lower Bound Score 

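**This is the best of the three ways to sort reviews: Type 1 ignores the share of positive votes and Type 2 ignores how many votes were cast, while the Wilson lower bound takes both into account.**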

In [22]:
def wilson_lower_bound(up, down, confidence=0.95):
    """Return the lower bound of the Wilson score interval for the up-vote ratio.

    Parameters
    ----------
    up : number of up-votes.
    down : number of down-votes.
    confidence : two-sided confidence level used to pick the normal quantile.

    Returns
    -------
    The lower bound of the interval, or 0 when there are no votes at all.
    """
    total = up + down
    if total == 0:
        return 0

    # Two-sided critical value of the standard normal for this confidence.
    z_crit = st.norm.ppf(1 - (1 - confidence) / 2)
    z_sq = z_crit * z_crit
    p_hat = 1.0 * up / total

    # Interval centre minus its half-width, before the common normalisation.
    centre = p_hat + z_sq / (2 * total)
    spread = z_crit * math.sqrt((p_hat * (1 - p_hat) + z_sq / (4 * total)) / total)
    return (centre - spread) / (1 + z_sq / total)

In [23]:
# Wilson lower bound for 600 up-votes and 400 down-votes.
wilson_lower_bound(600, 400)

0.5693094295142663

In [25]:
# Wilson lower bound for 5500 up-votes and 4500 down-votes.
wilson_lower_bound(5500, 4500)

0.5402319557715324

In [26]:
# Only two votes, both up: the lower bound stays far below the raw ratio of 1.0.
wilson_lower_bound(2, 0)

0.3423802275066531

In [30]:
# Wilson lower bound for 95 up-votes and 5 down-votes.
wilson_lower_bound(95, 5)

0.8882495307680808

In [33]:
# A single down-vote and no up-votes.
wilson_lower_bound(0, 1)

0.0

# Sample Case

In [43]:
# Seed NumPy's global generator so the sample data, and every table derived
# from it, is the same on each Restart & Run All.
np.random.seed(42)
up = np.random.randint(0, 150, 50)    # 50 up-vote counts drawn from [0, 150)
down = np.random.randint(0, 50, 50)   # 50 down-vote counts drawn from [0, 50)

In [45]:
# Combine the two vote-count arrays into a frame with columns 'up' and 'down'.
comments = pd.DataFrame(
    data={
        'up': up,
        'down': down,
    },
)

In [47]:
# Preview the first rows of the vote counts.
comments.head()

Unnamed: 0,up,down
0,100,48
1,138,25
2,60,47
3,89,28
4,95,27


In [52]:
# Row by row: net votes (up minus down) for each comment.
comments['score_up_down_diff'] = comments.apply(
    lambda row: score_up_down_diff(row['up'], row['down']),
    axis=1,
)

In [53]:
# Row by row: share of up-votes among all votes for each comment.
comments['score_average_rating'] = comments.apply(
    lambda row: score_average_rating(row['up'], row['down']),
    axis=1,
)

In [54]:
# Row by row: Wilson lower bound at the function's default confidence level.
comments['wilson_lower_bound'] = comments.apply(
    lambda row: wilson_lower_bound(row['up'], row['down']),
    axis=1,
)

In [55]:
# Preview the first rows again, now including the score columns.
comments.head()

Unnamed: 0,up,down,score_up_down_diff,score_average_rating,wilson_lower_bound
0,100,48,52,0.675676,0.596641
1,138,25,113,0.846626,0.783387
2,60,47,13,0.560748,0.466226
3,89,28,61,0.760684,0.675874
4,95,27,68,0.778689,0.697154


**Sort values by up down difference**

In [57]:
# Ten rows with the largest up-minus-down difference, highest first.
comments.sort_values('score_up_down_diff', ascending=False).head(10)

Unnamed: 0,up,down,score_up_down_diff,score_average_rating,wilson_lower_bound
45,147,6,141,0.960784,0.917092
8,146,13,133,0.918239,0.865148
30,137,7,130,0.951389,0.903064
5,138,12,126,0.92,0.865379
42,141,16,125,0.898089,0.84087
15,127,3,124,0.976923,0.934348
25,124,1,123,0.992,0.956075
19,140,24,116,0.853659,0.791483
1,138,25,113,0.846626,0.783387
32,112,2,110,0.982456,0.938282


**Sort values by average rating**

In [58]:
# Ten rows with the highest up-vote ratio, highest first.
comments.sort_values('score_average_rating', ascending=False).head(10)

Unnamed: 0,up,down,score_up_down_diff,score_average_rating,wilson_lower_bound
6,88,0,88,1.0,0.958173
25,124,1,123,0.992,0.956075
46,84,1,83,0.988235,0.936328
23,60,1,59,0.983607,0.912811
32,112,2,110,0.982456,0.938282
15,127,3,124,0.976923,0.934348
45,147,6,141,0.960784,0.917092
28,46,2,44,0.958333,0.860243
30,137,7,130,0.951389,0.903064
5,138,12,126,0.92,0.865379


**Sort values by wilson lower bound score**

In [61]:
# Ten rows with the highest Wilson lower bound, highest first.
comments.sort_values('wilson_lower_bound', ascending=False).head(10)

Unnamed: 0,up,down,score_up_down_diff,score_average_rating,wilson_lower_bound
6,88,0,88,1.0,0.958173
25,124,1,123,0.992,0.956075
32,112,2,110,0.982456,0.938282
46,84,1,83,0.988235,0.936328
15,127,3,124,0.976923,0.934348
45,147,6,141,0.960784,0.917092
23,60,1,59,0.983607,0.912811
30,137,7,130,0.951389,0.903064
5,138,12,126,0.92,0.865379
8,146,13,133,0.918239,0.865148
