In [7]:
%pylab inline
import numpy as np
import matplotlib.pyplot as plt
import re
from collections import Counter
import gzip
from collections import defaultdict
import copy
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


### Reading in the Data

In [8]:
def readGz(f):
  """Yield one evaluated record per line of the gzip-compressed file `f`.

  Each line is passed to eval(), so it must be a Python literal expression
  (the callers in this notebook index the results like dicts).
  """
  # The with-block closes the gzip handle when iteration ends or the generator is
  # closed; previously the handle was left open for the life of the kernel.
  with gzip.open(f) as lines:
    for l in lines:
      # NOTE(review): eval() executes whatever the file contains -- only use on trusted data.
      yield eval(l)

allHelpful = []                  # the `helpful` dict of every training review, in file order
userHelpful = defaultdict(list)  # reviewerID -> list of that reviewer's `helpful` dicts

for review in readGz("train.json.gz"):
  user, item = review['reviewerID'], review['itemID']
  votes = review['helpful']
  allHelpful.append(votes)
  userHelpful[user].append(votes)

# Global rate: all helpful votes divided by all votes cast.
helpfulTotal = sum(v['nHelpful'] for v in allHelpful)
votesTotal = sum(v['outOf'] for v in allHelpful)
averageRate = helpfulTotal * 1.0 / votesTotal
## average helpfulness across all users: 0.8519720886532813

# userRate maps each reviewer to their own helpful/outOf ratio; reviewers whose
# reviews received no votes at all fall back to the global averageRate.
userRate = {}
for u, history in userHelpful.items():
  totalU = sum(v['outOf'] for v in history)
  userRate[u] = (sum(v['nHelpful'] for v in history) * 1.0 / totalU) if totalU > 0 else averageRate

# predictions = open("predictions_Helpful.txt", 'w')
# for l in open("pairs_Helpful.txt"):
#   if l.startswith("userID"):
#     #header
#     predictions.write(l)
#     continue
#   u,i,outOf = l.strip().split('-')
#   outOf = int(outOf)
# #   if outOf == 0:
# #     predictions.write(u + '-' + i + '-' + str(outOf) + ',' + str(0) + '\n')
#   if u in userRate:
#     predictions.write(u + '-' + i + '-' + str(outOf) + ',' + str(outOf*userRate[u]) + '\n')
#   else:
#     predictions.write(u + '-' + i + '-' + str(outOf) + ',' + str(outOf*averageRate) + '\n')

# predictions.close()

### Creating syllable counter

In [9]:
from nltk.corpus import cmudict
# Pronunciation lookup table used by nsyl() below: keys are lowercase words, values are
# lists of pronunciations (each pronunciation is a sequence of phoneme strings).
d = cmudict.dict() 
def nsyl(word):
    """Return the largest syllable count among the pronunciations of `word` in `d`.

    The lookup is case-insensitive. A phoneme is counted as a syllable when its
    last character is a digit. Words missing from `d` yield 0.
    """
    key = word.lower()
    if key not in d:
        return 0
    per_pronunciation = [
        sum(1 for phoneme in pronunciation if str(phoneme[-1]).isdigit())
        for pronunciation in d[key]
    ]
    return max(per_pronunciation, default=0)

In [10]:
import pandas as pd
from collections import defaultdict

def readGz(f):
  """Yield one evaluated record per line of the gzip-compressed file `f`.

  NOTE(review): this redefines the readGz from the earlier data-loading cell and
  shadows it from here on.
  """
  # The with-block closes the gzip handle when iteration ends or the generator is
  # closed; previously the handle was left open for the life of the kernel.
  with gzip.open(f) as lines:
    for l in lines:
      # NOTE(review): eval() executes whatever the file contains -- only use on trusted data.
      yield eval(l)

def parse(path):
  """Yield one evaluated record per line of the gzip file at `path` (opened in binary mode)."""
  # Context manager so the handle is released once the generator is exhausted or closed;
  # the original never closed `g`.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # NOTE(review): eval() executes whatever the file contains -- only use on trusted data.
      yield eval(l)

def getDF(path):
  """Build a DataFrame from the records yielded by parse(path).

  Records are numbered 0, 1, 2, ... in the order they are read and those numbers
  become the row index.
  """
  numbered_records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(numbered_records, orient='index')

# Load the raw training and test review records into DataFrames.
# NOTE(review): both files are read from disk again in later cells, right before
# training_dataframe_prepper / test_dataframe_prepper are applied, so these two loads
# are redundant when the notebook is run top to bottom.
train_df = getDF('train.json.gz')
test_df = getDF('test_Helpful.json.gz')

### Splitting text into sentences

In [11]:
# Regex fragments used by split_into_sentences(). They are raw strings so that `\s`
# is read as a regex escape rather than as an (invalid) Python string escape.
caps = r"([A-Z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"
digits = r"([0-9])"

def split_into_sentences(text):
    """Return the NUMBER of sentences found in `text` (an int, not the sentences).

    Periods that do not end a sentence (titles, web domains, initials, acronyms,
    company suffixes, decimals) are first masked as "<prd>"; every remaining
    '.', '?' or '!' is then treated as a sentence end. Text after the final
    terminator is not counted, so input with no terminator at all yields 0.
    """
    text = " " + text + "  "
    text = text.replace("\n"," ")
    # Mask periods that belong to abbreviations, domains, initials and numbers.
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub(r"\s" + caps + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(caps + "[.]" + caps + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + caps + "[.]"," \\1<prd>",text)
    text = re.sub(digits + "[.]" + digits,"\\1<prd>\\2",text)
    # Move terminators outside closing quotes so the quote stays with its sentence.
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    # Mark every remaining terminator as a sentence boundary, then unmask.
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    # split() yields one more piece than there are boundaries; that final piece (padding
    # or an unterminated fragment) is the one the count leaves out. Only the count is
    # returned, so the pieces are not stripped or kept.
    return len(text.split("<stop>")) - 1

# len(split_into_sentences(filtered_df.loc[14]['reviewText']))

In [12]:
def categorybinner (df):
    """Return a copy of `df` with 0/1 indicator columns cat0..cat4 for `categoryID`.

    `catK` is 1 where `categoryID == K` and 0 elsewhere. The input frame is left
    unmodified: the original aliased it (`newdf = df`), which wrote the columns
    into the caller's frame and triggered SettingWithCopyWarning on slices.
    """
    newdf = df.copy()
    for cat_id in range(5):
        newdf['cat%d' % cat_id] = np.where(newdf['categoryID'] == cat_id, 1, 0)
    return newdf

### Split our training data into multiple dataframes based on how many helpfulness votes (`outof`) each review received

In [13]:
def training_dataframe_prepper(df):
    """Engineer features on the training reviews and split them into `outof` buckets.

    Added columns: num_helpful, outof, helpfulratio (the target), num_reviews_written,
    num_item_reviews, num_sentences, Automated_Readability_Index, conciseness and the
    cat0..cat4 indicators from categorybinner(). Rows whose helpfulratio is NaN
    (0 helpful out of 0 votes) are dropped before the per-reviewer / per-item counts
    are taken.

    Returns a 4-tuple of frames, in this order:
        More_Outof : 57 <= outof <= 500
        Less_Outof : 10 <  outof <  57
        Low_Outof  : 1  <= outof <= 10   (includes the outof == 1 rows)
        Ones_df    : outof == 1
    Reviews with outof > 500 are in none of the buckets.

    The caller's frame is not modified.
    """
    # Work on a private copy: the original wrote num_helpful/outof/helpfulratio straight
    # into the caller's DataFrame.
    df = df.copy()
    df['num_helpful'] = df['helpful'].apply(lambda x: x['nHelpful'])
    df['outof'] = df['helpful'].apply(lambda x: x['outOf'])
    df['helpfulratio'] = df['num_helpful']/df['outof']
    # .copy() makes the filtered rows an independent frame, so the column assignments
    # below no longer raise SettingWithCopyWarning.
    df = df[df['helpfulratio'].notnull()].copy()

    df['num_reviews_written'] = df['reviewerID'].map(df['reviewerID'].value_counts().to_dict())
    df['num_item_reviews'] = df['itemID'].map(df['itemID'].value_counts().to_dict())

    # Tokenise each review once; word and character counts both come from these tokens.
    # They are intermediates only and never become columns. (The original also computed
    # a per-review syllable total and dropped it unused; that work is omitted.)
    words = df['reviewText'].apply(lambda x: re.findall(r'\w+',x))
    num_words = words.apply(len)
    num_chars = words.apply(lambda ws: sum([len(b) for b in ws]))
    # A review with no detected sentence terminator counts as one sentence.
    df['num_sentences'] = df['reviewText'].apply(lambda x: max(split_into_sentences(x), 1))
    # NOTE(review): a review with no \w+ tokens divides by zero here and in `conciseness`
    # (NaN/inf features); this is not guarded, same as before.
    df['Automated_Readability_Index'] = 4.71*num_chars/num_words + 0.5*num_words/df['num_sentences'] - 21.43
    summ_num_words = df['summary'].apply(lambda x: len(re.findall(r'\w+',x)))
    df['conciseness'] = summ_num_words/num_words

    df = categorybinner(df)
    df = df[(df['outof']>0)]
    Ones_df = df[(df['outof'] ==1)]
    Low_Outof = df[(df['outof'] <=10) & (df['outof']>=1)]
    More_Outof = df[(df['outof']<=500)&(df['outof']>=57)]
    Less_Outof = df[(df['outof']<57) & (df['outof']>10)]

    return More_Outof, Less_Outof, Low_Outof, Ones_df

In [14]:
train_df = getDF('train.json.gz')
High_oo_df, Less_oo_df,Low_oo_df, Ones_oo_df = training_dataframe_prepper(train_df)

# Regression / classification targets, taken before the target column is dropped.
Highlabels = High_oo_df['helpfulratio'].values
Lesslabels = Less_oo_df['helpfulratio'].values
Lowlabels = Low_oo_df['helpfulratio'].values
Oneslabels = Ones_oo_df['helpfulratio'].values

# Columns removed from every bucket before fitting (this includes the target itself).
# One shared list replaces four pasted copies, and `axis` is passed by keyword because
# pandas 2.x no longer accepts it positionally in drop().
NON_FEATURE_COLUMNS = ['num_helpful','categoryID','categories','itemID','helpful','reviewerID',
                       'helpfulratio','reviewText','reviewHash','reviewTime','summary',
                       'unixReviewTime','price']
High_oo_df = High_oo_df.drop(NON_FEATURE_COLUMNS, axis=1)
Less_oo_df = Less_oo_df.drop(NON_FEATURE_COLUMNS, axis=1)
Low_oo_df = Low_oo_df.drop(NON_FEATURE_COLUMNS, axis=1)
Ones_oo_df = Ones_oo_df.drop(NON_FEATURE_COLUMNS, axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

### Creating test dataframe

In [15]:
def test_dataframe_prepper(df):
    """Engineer the model features on the test reviews.

    Adds `outof` (from the `helpful` dict, which is then dropped), removes rows with
    outof == 0, and adds num_reviews_written, num_item_reviews, num_sentences,
    Automated_Readability_Index, conciseness and the cat0..cat4 indicators from
    categorybinner(). The per-reviewer / per-item counts are taken over the rows of
    this frame that remain after the outof filter.

    Returns the new frame; the caller's frame is not modified.
    """
    # assign() returns a new frame, so `outof` is no longer written into the caller's
    # DataFrame; `axis` is passed by keyword because pandas 2.x rejects it positionally.
    df = df.assign(outof=df['helpful'].apply(lambda x: x['outOf'])).drop('helpful', axis=1)
    # .copy() makes the filtered rows an independent frame, so the column assignments
    # below no longer raise SettingWithCopyWarning.
    df = df[df['outof'] !=  0].copy()

    df['num_reviews_written'] = df['reviewerID'].map(df['reviewerID'].value_counts().to_dict())
    df['num_item_reviews'] = df['itemID'].map(df['itemID'].value_counts().to_dict())

    # Tokenise each review once; word and character counts both come from these tokens.
    # They are intermediates only and never become columns. (The original also computed
    # a per-review syllable total and dropped it unused; that work is omitted.)
    words = df['reviewText'].apply(lambda x: re.findall(r'\w+',x))
    num_words = words.apply(len)
    num_chars = words.apply(lambda ws: sum([len(b) for b in ws]))
    # A review with no detected sentence terminator counts as one sentence.
    df['num_sentences'] = df['reviewText'].apply(lambda x: max(split_into_sentences(x), 1))
    # NOTE(review): a review with no \w+ tokens divides by zero here and in `conciseness`
    # (NaN/inf features); this is not guarded, same as before.
    df['Automated_Readability_Index'] = 4.71*num_chars/num_words + 0.5*num_words/df['num_sentences'] - 21.43
    summ_num_words = df['summary'].apply(lambda x: len(re.findall(r'\w+',x)))
    df['conciseness'] = summ_num_words/num_words
    df = categorybinner(df)

    return df 

In [16]:
test_df = getDF('test_Helpful.json.gz')
test_df = test_dataframe_prepper(test_df)
# Number of test rows with a missing price (price is dropped just below).
print(test_df.price.isnull().sum())

# itemID and reviewerID are kept so that rows can still be matched to prediction pairs.
# `axis` is passed by keyword because pandas 2.x no longer accepts it positionally.
test_df = test_df.drop(['categoryID','categories',
                            'reviewText','reviewHash','reviewTime',
                        'summary','unixReviewTime','price'], axis=1)

2695


In [17]:
# Preview the prepared test frame; itemID and reviewerID are still present at this point.
test_df.head()

Unnamed: 0,itemID,reviewerID,rating,outof,num_reviews_written,num_item_reviews,num_sentences,Automated_Readability_Index,conciseness,cat0,cat1,cat2,cat3,cat4
0,I520932398,U816789534,3.0,2,1,1,2,4.66,0.068966,1,0,0,0,0
2,I149943341,U628436634,5.0,1,1,1,2,1.931304,0.173913,1,0,0,0,0
3,I909025835,U924107228,5.0,1,1,1,13,0.781891,0.028169,1,0,0,0,0
5,I408726477,U545844741,5.0,2,1,3,3,0.461429,0.142857,1,0,0,0,0
7,I353318513,U264684350,5.0,2,1,1,11,7.321532,0.004566,0,1,0,0,0


In [18]:
# Preview the feature frame of the 10 < outof < 57 training bucket.
Less_oo_df.head()

Unnamed: 0,rating,outof,num_reviews_written,num_item_reviews,num_sentences,Automated_Readability_Index,conciseness,cat0,cat1,cat2,cat3,cat4
37,5.0,24,5,3,6,6.154583,0.116071,1,0,0,0,0
67,5.0,11,5,2,14,4.31748,0.027778,1,0,0,0,0
75,5.0,22,7,9,3,11.129839,0.024096,1,0,0,0,0
161,5.0,22,3,19,5,3.742192,0.027397,1,0,0,0,0
268,2.0,39,6,6,5,3.655882,0.044118,1,0,0,0,0


### Running Grid Search to determine the best parameters for the Gradient Boosted Classifier, used for data points with only one helpfulness vote (`outof == 1`)

In [19]:
def GBCgridsearch (df, labels, test_size=0.3, random_state=None):
    """Grid-search a GradientBoostingClassifier and print its hold-out error.

    Parameters
    ----------
    df : feature frame.
    labels : targets aligned with the rows of `df`.
    test_size : fraction of rows held out for validation (default 0.3, as before).
    random_state : seed for the train/validation split. When None (the default) a
        seed is drawn from np.random.randint(0, 100) on every call, which is the
        original behaviour; pass an int for a reproducible split.

    Prints the validation MAE, the validation MSE and the best parameter set found.
    Returns None.
    """
    if random_state is None:
        random_state = np.random.randint(0,100)
    xtrain, xvalid, ytrain, yvalid = train_test_split(df, labels, test_size=test_size, random_state=random_state)
#     parameters = {'loss':['deviance','exponential'],'n_estimators':[120,140,160], 'learning_rate':[0.1,0.2], 'max_depth':[2,3,4]}
    # NOTE(review): newer scikit-learn releases renamed the 'deviance' loss to 'log_loss';
    # left unchanged to match the version this notebook was run with.
    parameters = {'loss':['deviance'],'n_estimators':[120,140,160], 'learning_rate':[0.1,0.2], 'max_depth':[2,3,4]}
    gbc = GradientBoostingClassifier()
    clf = GridSearchCV(gbc, parameters, verbose=True, n_jobs=-1, refit=True)
    clf.fit(xtrain, ytrain)
    # Predict once and score with the documented (y_true, y_pred) argument order.
    # These are validation errors; the original stored them under the name `train_acc`.
    y_pred = clf.predict(xvalid)
    valid_mae = mean_absolute_error(yvalid, y_pred)
    valid_mse = mean_squared_error(yvalid, y_pred)
    print ('MAE = ' + str(valid_mae)) 
    print ('MSE = ' + str(valid_mse)) 
    print (clf.best_params_)

In [20]:
# Tune the classifier on the outof == 1 bucket; GBCgridsearch prints the validation
# MAE/MSE and the best parameter set.
print('\n' + ' outof ones')
GBCgridsearch(Ones_oo_df, Oneslabels)


 outof ones
Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   24.5s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:   33.0s finished


MAE = 0.246789633435
MSE = 0.246789633435
{'learning_rate': 0.2, 'loss': 'deviance', 'max_depth': 2, 'n_estimators': 120}


### Running Grid Search for Gradient Boosted Regressor. Used for rest of the data

In [21]:
def GBRgridsearch (df, labels, test_size=0.3, random_state=None):
    """Grid-search a GradientBoostingRegressor and print its hold-out error.

    Parameters
    ----------
    df : feature frame.
    labels : targets aligned with the rows of `df`.
    test_size : fraction of rows held out for validation (default 0.3, as before).
    random_state : seed for the train/validation split. When None (the default) a
        seed is drawn from np.random.randint(0, 100) on every call, which is the
        original behaviour; pass an int for a reproducible split.

    Prints the validation MAE, the validation MSE and the best parameter set found.
    Returns None.
    """
    if random_state is None:
        random_state = np.random.randint(0,100)
    xtrain, xvalid, ytrain, yvalid = train_test_split(df, labels, test_size=test_size, random_state=random_state)
    # NOTE(review): newer scikit-learn releases renamed 'ls' / 'lad' to 'squared_error' /
    # 'absolute_error'; left unchanged to match the version this notebook was run with.
    parameters = {'loss':['ls', 'lad', 'huber', 'quantile'],'n_estimators':[120,140,160], 'learning_rate':[0.1,0.2], 'max_depth':[3,4,7,8]}
    gbr = GradientBoostingRegressor()
    clf = GridSearchCV(gbr, parameters, verbose=True, n_jobs=7, refit=True)
    clf.fit(xtrain, ytrain)
    # Predict once and score with the documented (y_true, y_pred) argument order.
    # These are validation errors; the original stored them under the name `train_acc`.
    y_pred = clf.predict(xvalid)
    valid_mae = mean_absolute_error(yvalid, y_pred)
    valid_mse = mean_squared_error(yvalid, y_pred)
    print ('MAE = ' + str(valid_mae)) 
    print ('MSE = ' + str(valid_mse)) 
    print (clf.best_params_)

In [22]:
# One regressor grid search per outof bucket, run in this order. Each entry is
# (banner to print, feature frame, targets). The trailing `##` figures are the author's
# notes carried over unchanged from the original calls.
gbr_searches = [
    ('high outof', High_oo_df, Highlabels),          ## 0.0417611
    ('\n' + 'less outof', Less_oo_df, Lesslabels),   ## 0.0799364
    ('\n' + 'low outof', Low_oo_df, Lowlabels),      ## 0.27929156
    ('\n' + 'Ones outof', Ones_oo_df, Oneslabels),   ## 0.3543893944
]
for banner, bucket_df, bucket_labels in gbr_searches:
    print(banner)
    GBRgridsearch(bucket_df, bucket_labels)

high outof
Fitting 3 folds for each of 96 candidates, totalling 288 fits


[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    1.6s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:   11.0s
[Parallel(n_jobs=7)]: Done 288 out of 288 | elapsed:   16.3s finished


MAE = 0.0555958769645
MSE = 0.020090597375
{'learning_rate': 0.1, 'loss': 'quantile', 'max_depth': 8, 'n_estimators': 160}

less outof
Fitting 3 folds for each of 96 candidates, totalling 288 fits


[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    8.2s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:   39.9s
[Parallel(n_jobs=7)]: Done 288 out of 288 | elapsed:   59.6s finished


MAE = 0.0846352710911
MSE = 0.0163670303372
{'learning_rate': 0.1, 'loss': 'ls', 'max_depth': 3, 'n_estimators': 120}

low outof
Fitting 3 folds for each of 96 candidates, totalling 288 fits


[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:  2.1min
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:  8.7min
[Parallel(n_jobs=7)]: Done 288 out of 288 | elapsed: 12.8min finished


MAE = 0.279986121859
MSE = 0.119560277579
{'learning_rate': 0.1, 'loss': 'ls', 'max_depth': 3, 'n_estimators': 120}

Ones outof
Fitting 3 folds for each of 96 candidates, totalling 288 fits


[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:   41.4s
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed:  3.2min
[Parallel(n_jobs=7)]: Done 288 out of 288 | elapsed:  4.9min finished


MAE = 0.354963537013
MSE = 0.179051754529
{'learning_rate': 0.1, 'loss': 'ls', 'max_depth': 3, 'n_estimators': 120}


In [23]:
def skgbr(x_train, x_test, y_train, y_test):
    """Fit a fixed-configuration GradientBoostingRegressor and score it on the test split.

    Returns a two-element list: [mean squared error, mean absolute error].
    """
    regressor = GradientBoostingRegressor(learning_rate=0.1, loss='ls',max_depth=3,n_estimators=140)
    predicted = regressor.fit(x_train, y_train).predict(x_test)
    return [mean_squared_error(y_test, predicted), mean_absolute_error(y_test, predicted)]
print('ablation')
cols = Low_oo_df.columns

# Leave-one-feature-out study on the 1 <= outof <= 10 bucket: drop each column in turn,
# refit on a fixed 90/10 split (random_state=10) and report the held-out MAE.
for dropped_feature in cols:
    reduced_df = Low_oo_df.drop(dropped_feature, axis=1)
    # train_test_split returns x_train, x_test, y_train, y_test -- the order skgbr expects.
    split_parts = train_test_split(reduced_df, Lowlabels, test_size=0.1, random_state=10)
    mse_and_mae = skgbr(*split_parts)
    print('Mean Absolute Error removing feature {}: {}'.format(dropped_feature, mse_and_mae[1]))

ablation
Mean Absolute Error removing feature rating: 0.2877196910017685
Mean Absolute Error removing feature outof: 0.2809878044689645
Mean Absolute Error removing feature num_reviews_written: 0.28089869778620397
Mean Absolute Error removing feature num_item_reviews: 0.28238237296921764
Mean Absolute Error removing feature num_sentences: 0.2813237860588637
Mean Absolute Error removing feature Automated_Readability_Index: 0.2804551326775394
Mean Absolute Error removing feature conciseness: 0.280665965733452
Mean Absolute Error removing feature cat0: 0.28084383817570696
Mean Absolute Error removing feature cat1: 0.28072456880751895
Mean Absolute Error removing feature cat2: 0.28090067491007326
Mean Absolute Error removing feature cat3: 0.280938971970181
Mean Absolute Error removing feature cat4: 0.28087892430608147


In [22]:
# Final models, one per outof bucket, each fitted on that bucket's full training frame.
# fit() returns the estimator itself, so construction and fitting are chained.

# 57 <= outof <= 500
# NOTE(review): with scikit-learn's default alpha the 'quantile' loss targets the 0.9
# quantile rather than the median -- confirm this is intended.
GBRhigh = GradientBoostingRegressor(learning_rate=0.1, loss='quantile',max_depth=8,n_estimators=160).fit(High_oo_df,Highlabels)
# 10 < outof < 57
GBRless = GradientBoostingRegressor(learning_rate=0.1, loss='ls',max_depth=3,n_estimators=120).fit(Less_oo_df,Lesslabels)
# 1 <= outof <= 10
GBRlow = GradientBoostingRegressor(learning_rate=0.1, loss='ls',max_depth=3,n_estimators=120).fit(Low_oo_df,Lowlabels)
# outof == 1
# NOTE(review): the grid-search output recorded in this notebook reports learning_rate 0.2
# for this bucket, while 0.1 is used here -- confirm which is intended.
GBCones = GradientBoostingClassifier(learning_rate= 0.1, loss='deviance', max_depth= 2, n_estimators= 120).fit(Ones_oo_df,Oneslabels)
print('creating regressors/classifier')

creating regressors/classifier


In [25]:
# Write one prediction line per (user, item, outOf) entry of pairs_Helpful.txt.
#
# Model used per line:
#   outOf == 0        -> 0 is written without consulting a model
#   outOf == 1        -> GBCones; the predicted label is written as-is
#   2  <= outOf <= 10 -> GBRlow
#   10 <  outOf <  57 -> GBRless
#   outOf >= 57       -> GBRhigh
# For the regressors the predicted ratio is clipped to [0, 1], rounded to two decimals,
# multiplied by outOf and rounded to a whole count.

# Feature columns in the layout the models were fitted on (identifier columns removed).
# `axis` is passed by keyword because pandas 2.x no longer accepts it positionally.
_pair_features = test_df.drop(['itemID','reviewerID'], axis=1)

# (reviewerID, itemID) -> position of the first matching row of test_df. Built once, this
# replaces a boolean scan of the whole frame for every line of the pairs file.
_pair_position = {}
for _pos, _pair in enumerate(zip(test_df['reviewerID'], test_df['itemID'])):
    _pair_position.setdefault(_pair, _pos)


def _features_for(u, i):
    """Return the one-row feature frame for reviewer `u` and item `i` (first match)."""
    return _pair_features.iloc[[_pair_position[(u, i)]]]


# Both files are context-managed so they are closed even if a prediction raises;
# the original never closed the pairs file.
with open("predictions_Helpful_submission2.txt", 'w') as predictions, open("pairs_Helpful.txt") as pairs:
    for l in pairs:
        if l.startswith("userID"):
            #header
            predictions.write(l)
            continue
        u,i,outOf = l.strip().split('-')
        outOf = int(outOf)
        ## WRITE A BETTER REGRESSOR FOR OUTOF LESS THAN 10
        if outOf == 0:
            predictions.write(u + '-' + i + '-' + str(outOf) + ',' + str(0) + '\n')
            continue
        if outOf == 1:
            pred = GBCones.predict(_features_for(u, i))[0]
            predictions.write(u + '-' + i + '-' + str(outOf) + ',' + str(pred) + '\n')
            continue

        if 2 <= outOf <= 10:
            model = GBRlow
        elif 10 < outOf < 57:
            model = GBRless
        elif outOf >= 57:
            model = GBRhigh
        else:
            # Negative outOf: no branch matched before either, so nothing is written.
            continue

        pred = model.predict(_features_for(u, i))[0]
        # A helpfulness ratio lies in [0, 1]. The original only capped the top, so a
        # slightly negative regression output could produce a negative helpful count.
        if pred > 1:
            pred = 1
        elif pred < 0:
            pred = 0
        pred = round(pred, 2)
        predictions.write(u + '-' + i + '-' + str(outOf) + ',' + str(round(pred*outOf)) + '\n')