In [87]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [88]:
# data transformation/preparation
df = pd.read_csv('block_9_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151254 entries, 0 to 151253
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   timestamp       151254 non-null  object 
 1   asin            151254 non-null  object 
 2   helpful         151254 non-null  object 
 3   overall         151254 non-null  float64
 4   reviewText      151232 non-null  object 
 5   reviewTime      151254 non-null  object 
 6   reviewerID      151254 non-null  object 
 7   reviewerName    149761 non-null  object 
 8   summary         151254 non-null  object 
 9   unixReviewTime  151254 non-null  int64  
 10  review_year     151254 non-null  int64  
 11  review_month    151254 non-null  int64  
 12  review_day      151254 non-null  int64  
dtypes: float64(1), int64(4), object(8)
memory usage: 15.0+ MB


In [89]:
df.head()

Unnamed: 0,timestamp,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,review_year,review_month,review_day
0,2013-06-01,616719923X,"[0, 0]",4.0,Just another flavor of Kit Kat but the taste i...,"06 1, 2013",A1VEELTKS8NLZB,Amazon Customer,Good Taste,1370044800,2013,6,1
1,2014-05-19,616719923X,"[0, 1]",3.0,I bought this on impulse and it comes from Jap...,"05 19, 2014",A14R9XMZVJ6INB,amf0001,"3.5 stars, sadly not as wonderful as I had hoped",1400457600,2014,5,19
2,2013-10-08,616719923X,"[3, 4]",4.0,Really good. Great gift for any fan of green t...,"10 8, 2013",A27IQHDZFQFNGG,Caitlin,Yum!,1381190400,2013,10,8
3,2013-05-20,616719923X,"[0, 0]",5.0,"I had never had it before, was curious to see ...","05 20, 2013",A31QY5TASILE89,DebraDownSth,Unexpected flavor meld,1369008000,2013,5,20
4,2013-05-26,616719923X,"[1, 2]",4.0,I've been looking forward to trying these afte...,"05 26, 2013",A2LWK003FFMCI5,Diana X.,"Not a very strong tea flavor, but still yummy ...",1369526400,2013,5,26


$ rating = \alpha + \beta_{1} * reviews + \beta_{2} * avaerage + u $

In [90]:
# weight 
df.loc[:, 'weight'] = 1

In [91]:
# set index
# 重新设置以 asin 开头的 index 因为很多数据都包含着相同的 asin
df.set_index(['asin', 'review_year', 'review_month', 'review_day'], inplace=True)

In [92]:
df.head().T

asin,616719923X,616719923X,616719923X,616719923X,616719923X
review_year,2013,2014,2013,2013,2013
review_month,6,5,10,5,5
review_day,1,19,8,20,26
timestamp,2013-06-01,2014-05-19,2013-10-08,2013-05-20,2013-05-26
helpful,"[0, 0]","[0, 1]","[3, 4]","[0, 0]","[1, 2]"
overall,4.0,3.0,4.0,5.0,4.0
reviewText,Just another flavor of Kit Kat but the taste i...,I bought this on impulse and it comes from Jap...,Really good. Great gift for any fan of green t...,"I had never had it before, was curious to see ...",I've been looking forward to trying these afte...
reviewTime,"06 1, 2013","05 19, 2014","10 8, 2013","05 20, 2013","05 26, 2013"
reviewerID,A1VEELTKS8NLZB,A14R9XMZVJ6INB,A27IQHDZFQFNGG,A31QY5TASILE89,A2LWK003FFMCI5
reviewerName,Amazon Customer,amf0001,Caitlin,DebraDownSth,Diana X.
summary,Good Taste,"3.5 stars, sadly not as wonderful as I had hoped",Yum!,Unexpected flavor meld,"Not a very strong tea flavor, but still yummy ..."
unixReviewTime,1370044800,1400457600,1381190400,1369008000,1369526400
weight,1,1,1,1,1


In [93]:
df.loc[:,'reviews'] = df.groupby(level=['asin'])['weight'].transform(np.cumsum)

In [94]:
df.T

asin,616719923X,616719923X,616719923X,616719923X,616719923X,616719923X,616719923X,616719923X,616719923X,616719923X,...,B00KCJRVO2,B00KCJRVO2,B00KCJRVO2,B00KCJRVO2,B00KCJRVO2,B00KCJRVO2,B00KCJRVO2,B00KCJRVO2,B00KCJRVO2,B00KCJRVO2
review_year,2013,2014,2013,2013,2013,2013,2013,2013,2013,2012,...,2014,2014,2014,2014,2014,2014,2014,2014,2014,2014
review_month,6,5,10,5,5,9,10,7,6,9,...,7,6,7,7,7,7,7,7,7,7
review_day,1,19,8,20,26,5,18,5,14,19,...,6,30,21,2,6.1,12,6.2,1,4,11
timestamp,2013-06-01,2014-05-19,2013-10-08,2013-05-20,2013-05-26,2013-09-05,2013-10-18,2013-07-05,2013-06-14,2012-09-19,...,2014-07-06,2014-06-30,2014-07-21,2014-07-02,2014-07-06,2014-07-12,2014-07-06,2014-07-01,2014-07-04,2014-07-11
helpful,"[0, 0]","[0, 1]","[3, 4]","[0, 0]","[1, 2]","[0, 1]","[1, 2]","[2, 3]","[0, 0]","[0, 10]",...,"[1, 1]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[1, 1]","[0, 1]","[0, 0]"
overall,4.0,3.0,4.0,5.0,4.0,4.0,3.0,5.0,5.0,1.0,...,5.0,3.0,5.0,4.0,5.0,4.0,4.0,5.0,4.0,4.0
reviewText,Just another flavor of Kit Kat but the taste i...,I bought this on impulse and it comes from Jap...,Really good. Great gift for any fan of green t...,"I had never had it before, was curious to see ...",I've been looking forward to trying these afte...,"These Kit-kats are very good, but if you're lo...",I found these in a Mitsuwa Marketplace in Illi...,Creamy white chocolate infused with Matcha gre...,After hearing mixed opinions about these Kit K...,"I love green tea, I love Kit Kats, but the two...",...,A lot of these quick oatmeal products don't ma...,"When it has came down to General Mills, they h...",My daughter thinks it tastes just like the reg...,It may seem surprising to see an oatmeal produ...,I like oatmeal and will often make a pot of Mc...,Delicious gluten-free oatmeal: we tried both t...,With the many selections of instant oatmeal ce...,"While I usually review CDs and DVDs, as well a...",My son and I enjoyed these oatmeal packets. H...,I like to eat oatmeal i the mornings. I usuall...
reviewTime,"06 1, 2013","05 19, 2014","10 8, 2013","05 20, 2013","05 26, 2013","09 5, 2013","10 18, 2013","07 5, 2013","06 14, 2013","09 19, 2012",...,"07 6, 2014","06 30, 2014","07 21, 2014","07 2, 2014","07 6, 2014","07 12, 2014","07 6, 2014","07 1, 2014","07 4, 2014","07 11, 2014"
reviewerID,A1VEELTKS8NLZB,A14R9XMZVJ6INB,A27IQHDZFQFNGG,A31QY5TASILE89,A2LWK003FFMCI5,A1NZJTY0BAA2SK,AA95FYFIP38RM,A3FIVHUOGMUMPK,A27FSPAMTQF1J8,A33NXNZ79H5K51,...,A11T807LX2EF00,A3W4D8XOGLWUN5,A3H0ZQ74ITU83J,A3RJR9UL7HEROC,A3VYKXHQDICC6,A2L6QS8SVHT9RG,AFJFXN42RZ3G2,ASEBX8TBYWQWA,ANKQGTXHREOI5,A2CF66KIQ3RKX3
reviewerName,Amazon Customer,amf0001,Caitlin,DebraDownSth,Diana X.,Elizabeth,"Emily Veinglory ""Book Reviewer""",greenlife,Japhyl,"Jean M ""JM""",...,Michael,"Michael Kerner ""Michael Kerner""",Mom and Teacher,Phelps Gates,philo_vance,"randomartco ""period film aficionado""","R. DelParto ""Rose2""","Steven I. Ramm ""Steve Ramm &#34;Anything Phon...",Titanium Lili,Vivian Deliz
summary,Good Taste,"3.5 stars, sadly not as wonderful as I had hoped",Yum!,Unexpected flavor meld,"Not a very strong tea flavor, but still yummy ...",Subtle,Available in some US stores,So Delicious!!,These are my favorite candies ever!,Not a fan,...,Easy to make and the taste is quite good. Exce...,"Oatmeal, But Not More",Thank you Chex!,Would have liked more information on gluten pr...,Excellent,Delicious gluten-free oatmeal 'quick' packs!,Convenient and Instant,Compares favorably in taste and texture with o...,Pretty good!,I like to eat oatmeal i the mornings
unixReviewTime,1370044800,1400457600,1381190400,1369008000,1369526400,1378339200,1382054400,1372982400,1371168000,1348012800,...,1404604800,1404086400,1405900800,1404259200,1404604800,1405123200,1404604800,1404172800,1404432000,1405036800
weight,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [95]:
# 算出每个对于商品review的平均值
df.loc[:, 'average'] = df.groupby(level=['asin'])['overall'].transform(np.cumsum)
df.loc[:, 'average'] = df['average'] / df['reviews']
df.T

asin,616719923X,616719923X,616719923X,616719923X,616719923X,616719923X,616719923X,616719923X,616719923X,616719923X,...,B00KCJRVO2,B00KCJRVO2,B00KCJRVO2,B00KCJRVO2,B00KCJRVO2,B00KCJRVO2,B00KCJRVO2,B00KCJRVO2,B00KCJRVO2,B00KCJRVO2
review_year,2013,2014,2013,2013,2013,2013,2013,2013,2013,2012,...,2014,2014,2014,2014,2014,2014,2014,2014,2014,2014
review_month,6,5,10,5,5,9,10,7,6,9,...,7,6,7,7,7,7,7,7,7,7
review_day,1,19,8,20,26,5,18,5,14,19,...,6,30,21,2,6.1,12,6.2,1,4,11
timestamp,2013-06-01,2014-05-19,2013-10-08,2013-05-20,2013-05-26,2013-09-05,2013-10-18,2013-07-05,2013-06-14,2012-09-19,...,2014-07-06,2014-06-30,2014-07-21,2014-07-02,2014-07-06,2014-07-12,2014-07-06,2014-07-01,2014-07-04,2014-07-11
helpful,"[0, 0]","[0, 1]","[3, 4]","[0, 0]","[1, 2]","[0, 1]","[1, 2]","[2, 3]","[0, 0]","[0, 10]",...,"[1, 1]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[1, 1]","[0, 1]","[0, 0]"
overall,4.0,3.0,4.0,5.0,4.0,4.0,3.0,5.0,5.0,1.0,...,5.0,3.0,5.0,4.0,5.0,4.0,4.0,5.0,4.0,4.0
reviewText,Just another flavor of Kit Kat but the taste i...,I bought this on impulse and it comes from Jap...,Really good. Great gift for any fan of green t...,"I had never had it before, was curious to see ...",I've been looking forward to trying these afte...,"These Kit-kats are very good, but if you're lo...",I found these in a Mitsuwa Marketplace in Illi...,Creamy white chocolate infused with Matcha gre...,After hearing mixed opinions about these Kit K...,"I love green tea, I love Kit Kats, but the two...",...,A lot of these quick oatmeal products don't ma...,"When it has came down to General Mills, they h...",My daughter thinks it tastes just like the reg...,It may seem surprising to see an oatmeal produ...,I like oatmeal and will often make a pot of Mc...,Delicious gluten-free oatmeal: we tried both t...,With the many selections of instant oatmeal ce...,"While I usually review CDs and DVDs, as well a...",My son and I enjoyed these oatmeal packets. H...,I like to eat oatmeal i the mornings. I usuall...
reviewTime,"06 1, 2013","05 19, 2014","10 8, 2013","05 20, 2013","05 26, 2013","09 5, 2013","10 18, 2013","07 5, 2013","06 14, 2013","09 19, 2012",...,"07 6, 2014","06 30, 2014","07 21, 2014","07 2, 2014","07 6, 2014","07 12, 2014","07 6, 2014","07 1, 2014","07 4, 2014","07 11, 2014"
reviewerID,A1VEELTKS8NLZB,A14R9XMZVJ6INB,A27IQHDZFQFNGG,A31QY5TASILE89,A2LWK003FFMCI5,A1NZJTY0BAA2SK,AA95FYFIP38RM,A3FIVHUOGMUMPK,A27FSPAMTQF1J8,A33NXNZ79H5K51,...,A11T807LX2EF00,A3W4D8XOGLWUN5,A3H0ZQ74ITU83J,A3RJR9UL7HEROC,A3VYKXHQDICC6,A2L6QS8SVHT9RG,AFJFXN42RZ3G2,ASEBX8TBYWQWA,ANKQGTXHREOI5,A2CF66KIQ3RKX3
reviewerName,Amazon Customer,amf0001,Caitlin,DebraDownSth,Diana X.,Elizabeth,"Emily Veinglory ""Book Reviewer""",greenlife,Japhyl,"Jean M ""JM""",...,Michael,"Michael Kerner ""Michael Kerner""",Mom and Teacher,Phelps Gates,philo_vance,"randomartco ""period film aficionado""","R. DelParto ""Rose2""","Steven I. Ramm ""Steve Ramm &#34;Anything Phon...",Titanium Lili,Vivian Deliz
summary,Good Taste,"3.5 stars, sadly not as wonderful as I had hoped",Yum!,Unexpected flavor meld,"Not a very strong tea flavor, but still yummy ...",Subtle,Available in some US stores,So Delicious!!,These are my favorite candies ever!,Not a fan,...,Easy to make and the taste is quite good. Exce...,"Oatmeal, But Not More",Thank you Chex!,Would have liked more information on gluten pr...,Excellent,Delicious gluten-free oatmeal 'quick' packs!,Convenient and Instant,Compares favorably in taste and texture with o...,Pretty good!,I like to eat oatmeal i the mornings
unixReviewTime,1370044800,1400457600,1381190400,1369008000,1369526400,1378339200,1382054400,1372982400,1371168000,1348012800,...,1404604800,1404086400,1405900800,1404259200,1404604800,1405123200,1404604800,1404172800,1404432000,1405036800
weight,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


## OLS

In [96]:
fml = 'overall ~ reviews + average'
ols = smf.ols(fml, data=df).fit()

In [97]:
print(ols.summary())

                            OLS Regression Results                            
Dep. Variable:                overall   R-squared:                       0.287
Model:                            OLS   Adj. R-squared:                  0.287
Method:                 Least Squares   F-statistic:                 3.040e+04
Date:                Sun, 09 Oct 2022   Prob (F-statistic):               0.00
Time:                        22:24:00   Log-Likelihood:            -2.0210e+05
No. Observations:              151254   AIC:                         4.042e+05
Df Residuals:                  151251   BIC:                         4.042e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0141      0.018     -0.773      0.4

In [99]:
df.head().T

asin,616719923X,616719923X,616719923X,616719923X,616719923X
review_year,2013,2014,2013,2013,2013
review_month,6,5,10,5,5
review_day,1,19,8,20,26
timestamp,2013-06-01,2014-05-19,2013-10-08,2013-05-20,2013-05-26
helpful,"[0, 0]","[0, 1]","[3, 4]","[0, 0]","[1, 2]"
overall,4.0,3.0,4.0,5.0,4.0
reviewText,Just another flavor of Kit Kat but the taste i...,I bought this on impulse and it comes from Jap...,Really good. Great gift for any fan of green t...,"I had never had it before, was curious to see ...",I've been looking forward to trying these afte...
reviewTime,"06 1, 2013","05 19, 2014","10 8, 2013","05 20, 2013","05 26, 2013"
reviewerID,A1VEELTKS8NLZB,A14R9XMZVJ6INB,A27IQHDZFQFNGG,A31QY5TASILE89,A2LWK003FFMCI5
reviewerName,Amazon Customer,amf0001,Caitlin,DebraDownSth,Diana X.
summary,Good Taste,"3.5 stars, sadly not as wonderful as I had hoped",Yum!,Unexpected flavor meld,"Not a very strong tea flavor, but still yummy ..."
unixReviewTime,1370044800,1400457600,1381190400,1369008000,1369526400
weight,1,1,1,1,1


## robust regression

In [100]:
df.reset_index(inplace=True)
df.head().T

Unnamed: 0,0,1,2,3,4
asin,616719923X,616719923X,616719923X,616719923X,616719923X
review_year,2013,2014,2013,2013,2013
review_month,6,5,10,5,5
review_day,1,19,8,20,26
timestamp,2013-06-01,2014-05-19,2013-10-08,2013-05-20,2013-05-26
helpful,"[0, 0]","[0, 1]","[3, 4]","[0, 0]","[1, 2]"
overall,4.0,3.0,4.0,5.0,4.0
reviewText,Just another flavor of Kit Kat but the taste i...,I bought this on impulse and it comes from Jap...,Really good. Great gift for any fan of green t...,"I had never had it before, was curious to see ...",I've been looking forward to trying these afte...
reviewTime,"06 1, 2013","05 19, 2014","10 8, 2013","05 20, 2013","05 26, 2013"
reviewerID,A1VEELTKS8NLZB,A14R9XMZVJ6INB,A27IQHDZFQFNGG,A31QY5TASILE89,A2LWK003FFMCI5


In [101]:
fml = 'overall ~ reviews + average'
robust = smf.ols(
    fml, data=df).fit(
        cov_type='cluster', 
        cov_kwds={'groups': np.array(df['asin'])})

In [102]:
print(robust.summary())

                            OLS Regression Results                            
Dep. Variable:                overall   R-squared:                       0.287
Model:                            OLS   Adj. R-squared:                  0.287
Method:                 Least Squares   F-statistic:                 3.159e+04
Date:                Sun, 09 Oct 2022   Prob (F-statistic):               0.00
Time:                        22:26:17   Log-Likelihood:            -2.0210e+05
No. Observations:              151254   AIC:                         4.042e+05
Df Residuals:                  151251   BIC:                         4.042e+05
Df Model:                           2                                         
Covariance Type:              cluster                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0141      0.019     -0.731      0.4

## discrete choice models

In [103]:
# low rating 
# 1. create a new variable called very_low and initially assign it to zero 
df.loc[:, "very_low"] = 0

# 2. when overall is equal to 1 then it is actually ver low
df.loc[df["overall"] == 1, "very_low"] = 1


In [106]:
fml = "very_low ~ reviews + average"
from statsmodels.formula.api import logit
robust = logit(fml, data=df).fit()
print(robust.summary())

Optimization terminated successfully.
         Current function value: 0.134610
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:               very_low   No. Observations:               151254
Model:                          Logit   Df Residuals:                   151251
Method:                           MLE   Df Model:                            2
Date:                Sun, 09 Oct 2022   Pseudo R-squ.:                  0.1701
Time:                        22:27:57   Log-Likelihood:                -20360.
converged:                       True   LL-Null:                       -24534.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      3.3637      0.072     46.619      0.000       3.222       3.505
reviews       -0.0011      0.

## count models

$ reviews = \alpha + \beta_{1} * average + \mu $

In [None]:
# formal 
fml = 'reviews ~ average' 
from statsmodels.formula.api import poisson
poisson = smf.poisson(fml, data=df).fit()
print(poisson.summary())