In [440]:
import re
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.svm import SVC
from sklearn.svm import LinearSVC as LSVC
from sklearn.decomposition import TruncatedSVD as TSVD
from sklearn.decomposition import PCA
from scipy.sparse import hstack
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

# SK-learn libraries for model selection 
from sklearn.model_selection import train_test_split

# json libraries to parse json file
import json
from pandas.io.json import json_normalize

import gensim
from gensim import utils


In [2]:
os.chdir("/Users/gurditchahal/W207/coursework/Final_Project_Random_Acts/EDA")
# read json file
train_json = json.load(open('train.json'))

# normalize data and put in a dataframe
train_json_df = json_normalize(train_json)

# read json file
test_json = json.load(open('test.json'))

# normalize data and put in a dataframe
test_json_df = json_normalize(test_json)

print("Train shape: ", train_json_df.shape)
print("Test shape: ", test_json_df.shape)

train_only_columns = set(train_json_df.columns.values)-set(test_json_df.columns.values)
print("Columns in Train but not Test:\n",train_only_columns)
test_only_columns = set(test_json_df.columns.values)-set(train_json_df.columns.values)
print("\nColumns in Test but not Train:",test_only_columns)
test_w_train_col = train_json_df[test_json_df.columns.values]

Train shape:  (4040, 32)
Test shape:  (1631, 17)
Columns in Train but not Test:
 {'requester_upvotes_plus_downvotes_at_retrieval', 'post_was_edited', 'number_of_upvotes_of_request_at_retrieval', 'requester_number_of_posts_on_raop_at_retrieval', 'request_text', 'request_number_of_comments_at_retrieval', 'requester_received_pizza', 'requester_upvotes_minus_downvotes_at_retrieval', 'requester_number_of_comments_in_raop_at_retrieval', 'requester_user_flair', 'requester_number_of_comments_at_retrieval', 'requester_number_of_posts_at_retrieval', 'number_of_downvotes_of_request_at_retrieval', 'requester_account_age_in_days_at_retrieval', 'requester_days_since_first_post_on_raop_at_retrieval'}

Columns in Test but not Train: set()


In [338]:
set(train_json_df.columns.values).intersection(set(test_json_df.columns.values))

{'giver_username_if_known',
 'request_id',
 'request_text_edit_aware',
 'request_title',
 'requester_account_age_in_days_at_request',
 'requester_days_since_first_post_on_raop_at_request',
 'requester_number_of_comments_at_request',
 'requester_number_of_comments_in_raop_at_request',
 'requester_number_of_posts_at_request',
 'requester_number_of_posts_on_raop_at_request',
 'requester_number_of_subreddits_at_request',
 'requester_subreddits_at_request',
 'requester_upvotes_minus_downvotes_at_request',
 'requester_upvotes_plus_downvotes_at_request',
 'requester_username',
 'unix_timestamp_of_request',
 'unix_timestamp_of_request_utc'}

In [296]:
# 0 means the user doesn't receive pizza & 1 means the user receives pizza
train_labels = train_json_df.requester_received_pizza.astype(int).as_matrix()
previous_givers=set(train_json_df["giver_username_if_known"])
previous_givers.remove('N/A')
# split the training data into training data and dev data 
train_data, dev_data, train_labels, dev_labels = \
            train_test_split(test_w_train_col, train_labels, test_size=0.2, random_state=12)
    

In [297]:
train_data['full_text']=train_data['request_text_edit_aware'] +' '+train_data['request_title']
dev_data['full_text']=dev_data['request_text_edit_aware'] +' '+dev_data['request_title']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [158]:
# Narratives per Standford paper
money = ["money", "now", "broke", "week", "until", "time",
          "last", "day", "when", "today", "tonight", "paid", "next",
          "first", "night", "after", "tomorrow", "month", "while",
          "account", "before", "long", "Friday", "rent", "buy",
          "bank", "still", "bills", "ago", "cash", "due",
          "soon", "past", "never", "paycheck", "check", "spent",
          "years", "poor", "till", "yesterday", "morning", "dollars",
          "financial", "hour", "bill", "evening", "credit",
          "budget", "loan", "bucks", "deposit", "dollar", "current",
          "payed"]
job =["work", "job", "paycheck", "unemployment", "interview",
          "fired", "employment", "hired", "hire"]
student = ["college", "student", "school", "roommate",
          "studying", "university", "finals", "semester",
          "class", "study", "project", "dorm", "tuition"]
family =["family", "mom", "wife", "parents", "mother", "husband",
           "dad", "son", "daughter", "father", "parent",
           "mum"]
craving = ["friend", "girlfriend", "craving", "birthday",
          "boyfriend", "celebrate", "party", "game", "games",
          "movie", "date", "drunk", "beer", "celebrating", "invited",
          "drinks", "crave", "wasted", "invite"]

narratives = [money, job, student, family, craving]




In [230]:
import string
from nltk.stem.porter import PorterStemmer
def pre_proccess(s):
    s = re.sub("[^\w']|_", " ", s) 
    s=s.translate(str.maketrans(' ',' ',string.punctuation))#strip punctuation before looking
    s= re.sub(' +',' ', s) #remove extra spaces
    s=s.lower()
   # p_stemmer = PorterStemmer()
    #s = ' '.join([p_stemmer.stem(i) for i in s.split()])    
    return s


In [298]:
#https://www.timeanddate.com/calendar/aboutseasons.html
def ts_to_season(month):
    if month>=3 and month<=5:
        return "spring"
    elif month>=6 and month <=8:
        return "summer"
    elif month>=9 and month <=11:
        return "fall"
    else:
        return "winter"

In [387]:
def construct_ft_mat(train_data):
    feat_mat=pd.DataFrame()
    
    #temporal features
    feat_mat['hour_request']=pd.to_datetime(train_data['unix_timestamp_of_request_utc'],unit = 's').dt.hour
    feat_mat['day_request']=pd.to_datetime(train_data['unix_timestamp_of_request_utc'],unit = 's').dt.day
    feat_mat['day_request']=feat_mat['day_request'].apply(lambda x: 0 if x<16 else 1)
    feat_mat['season_request']=pd.to_datetime(train_data['unix_timestamp_of_request_utc'],unit = 's').dt.month
    feat_mat['season_request']=feat_mat['season_request'].apply(ts_to_season)
    feat_mat['is_spring']=feat_mat['season_request'].apply(lambda x: 1 if x=='spring' else 0)
    feat_mat['is_summer']=feat_mat['season_request'].apply(lambda x: 1 if x=='summer' else 0)
    feat_mat['is_fall']=feat_mat['season_request'].apply(lambda x: 1 if x=='fall' else 0)
    feat_mat['is_winter']=feat_mat['season_request'].apply(lambda x: 1 if x=='winter' else 0)
    del feat_mat['season_request']
    
    #check if requester was a previous giver
    #feat_mat['was_giver']=train_data['requester_username'].apply(lambda x: 1 if x in previous_givers else 0)
    
    feat_mat['first_post']=np.log(train_data['requester_days_since_first_post_on_raop_at_request']+1)
    feat_mat['upvotes_minus_downvotes']=train_data['requester_upvotes_minus_downvotes_at_request']
    feat_mat['upvotes_plus_downvotes_at_request']=np.log(train_data['requester_upvotes_plus_downvotes_at_request']+1)
    upvotes=train_data.apply(lambda row: (row['requester_upvotes_plus_downvotes_at_request'] + row['requester_upvotes_minus_downvotes_at_request'])/2,axis=1)
    downvotes=train_data.apply(lambda row: (row['requester_upvotes_plus_downvotes_at_request']- row['requester_upvotes_minus_downvotes_at_request'])/2,axis=1)
    feat_mat['upvotes']=upvotes
    feat_mat['vote_ratio']=upvotes/(upvotes+downvotes+1)
    
    feat_mat['req_age']=np.log(train_data['requester_account_age_in_days_at_request']+1)
    feat_mat['num_subs']=np.log(train_data['requester_number_of_subreddits_at_request']+1)
    feat_mat['num_posts']=np.log(train_data['requester_number_of_posts_at_request']+1)
    feat_mat['pizza_activity']=np.log(train_data['requester_number_of_posts_on_raop_at_request']+1)
    feat_mat['len_request']=np.log(train_data['request_text_edit_aware'].apply(len)+1)
    feat_mat['len_title']=np.log(train_data['request_title'].apply(len)+1)
    #feat_mat['len_name']=train_data['requester_username'].apply(len)
    feat_mat['pizza_comments']=np.log(train_data['requester_number_of_comments_in_raop_at_request']+1)
    # reciprocity indicator
    #feat_mat['reciprocity'] = train_data['request_text_edit_aware'].apply(lambda x:1 if re.search("repay|pay.+back|pay.+forward|return.+favor", x) 
                                               #else 0)
    #feat_mat['image_in_text'] = train_data['request_text_edit_aware'].str.contains('imgur.com|.jpg|.png|.jpeg', case=False).apply(lambda x: 1 if x else 0)
    #feat_mat['politeness'] = train_data['request_text_edit_aware'].apply(lambda x: 1 if re.search("thank|appreciate|advance", x) else 0)
    feat_mat['reciprocity'] = train_data['full_text'].apply(lambda x:1 if re.search("repay|pay.+back|pay.+forward|return.+favor", x) 
                                               else 0)
    feat_mat['image_in_text'] = train_data['full_text'].str.contains('imgur.com|.jpg|.png|.jpeg', case=False).apply(lambda x: 1 if x else 0)
    feat_mat['politeness'] = train_data['full_text'].apply(lambda x: 1 if re.search("thank|appreciate|advance", x) else 0)
    
    return feat_mat

In [388]:
def find_narr(narr,s):
    ct=0
    for word in narr:
        ct+=s.split().count(word)
    return ct/len(s.split())

In [389]:
feat_mat=construct_ft_mat(train_data)

dev_mat=construct_ft_mat(dev_data)




In [390]:
train_text_clean=train_data['full_text'].apply(lambda s:pre_proccess(s))
dev_text_clean=dev_data['full_text'].apply(lambda s: pre_proccess(s))     
for n in narratives:
    feat_mat[n[0]]=train_text_clean.apply(lambda s: find_narr(n,s))
    dev_mat[n[0]]=dev_text_clean.apply(lambda s: find_narr(n,s))
    

t_mat=feat_mat.as_matrix()

d_mat=dev_mat.as_matrix()
   

In [426]:
vectorizer = TfidfVectorizer(min_df=5,ngram_range=(1,2), preprocessor=pre_proccess,stop_words='english',norm='l2',sublinear_tf=True) 
train_bag_of_words = vectorizer.fit_transform(train_data['full_text'])
dev_bag_of_words = vectorizer.transform(dev_data['full_text'])

In [427]:
'''rf=RF()
#from sklearn.preprocessing import StandardScaler
#S=StandardScaler()
#clf=LogisticRegression()
r_parameters = {'n_estimators':[64,100,128],'criterion':['gini','entropy'],'random_state':[42],'max_depth':[None,1,3,5,7],'max_features':[None,'auto'],'min_samples_leaf':[1,3,5]}
clf=GridSearchCV(rf, r_parameters,scoring='roc_auc')
clf.fit(t_mat,train_labels)
preds=clf.predict(d_mat)
roc_auc_score(dev_labels, preds, average='micro')'''

"rf=RF()\n#from sklearn.preprocessing import StandardScaler\n#S=StandardScaler()\n#clf=LogisticRegression()\nr_parameters = {'n_estimators':[64,100,128],'criterion':['gini','entropy'],'random_state':[42],'max_depth':[None,1,3,5,7],'max_features':[None,'auto'],'min_samples_leaf':[1,3,5]}\nclf=GridSearchCV(rf, r_parameters,scoring='roc_auc')\nclf.fit(t_mat,train_labels)\npreds=clf.predict(d_mat)\nroc_auc_score(dev_labels, preds, average='micro')"

In [573]:
lsvc = LSVC(C=.85, penalty="l1", dual=False,random_state=42).fit(train_bag_of_words,train_labels)
model = SelectFromModel(lsvc, prefit=True)
#S=StandardScaler(with_mean=False)
#X_new = model.transform(S.fit_transform(train_bag_of_words))
X_new = model.transform(train_bag_of_words)
print(X_new.shape)

#d_new=model.transform(S.transform(dev_bag_of_words))
d_new=model.transform(dev_bag_of_words)


f_new=hstack([X_new,t_mat])
dev_new=hstack([d_new,d_mat])

(3232, 873)


In [520]:
vectorizer_lda = CountVectorizer(min_df=10,ngram_range=(1,1), preprocessor=pre_proccess,stop_words='english') 
lda_bag_of_words = vectorizer_lda.fit_transform(train_data['full_text'])
lda_devbag_of_words = vectorizer_lda.transform(dev_data['full_text'])

In [521]:
from sklearn.decomposition import LatentDirichletAllocation as LDA
lda = LDA(n_components=3, learning_method="batch", max_iter=30,learning_decay=.7, random_state=42)
train_topics = lda.fit_transform(lda_bag_of_words)
print(lda.components_.shape)

dev_topics=lda.transform(lda_devbag_of_words)


f_new=np.hstack([t_mat,train_topics])
dev_new=np.hstack([d_mat,dev_topics])

(3, 1580)


In [574]:
f_new=hstack([X_new,t_mat,train_topics])
dev_new=hstack([d_new,d_mat,dev_topics])

In [447]:
'''pca=PCA(n_components=5,random_state=42)
word_compresst=pca.fit_transform(t_mat)
word_compresstest=pca.transform(d_mat)

f_new=hstack([X_new,t_mat])
dev_new=hstack([d_new,d_mat])'''

In [429]:
'''nb=BernoulliNB()
#nb=MultinomialNB()
n_parameters = {'alpha': np.linspace(0.01, 10, 100),'binarize':np.linspace(0.0, 1, 10)}
clf2 = GridSearchCV(nb, n_parameters,scoring='roc_auc',cv=5)
clf2.fit(f_new,train_labels)
preds=clf2.predict(dev_new)
print(clf2.best_params_)
print(roc_auc_score(dev_labels, preds, average='micro'))'''


"nb=BernoulliNB()\n#nb=MultinomialNB()\nn_parameters = {'alpha': np.linspace(0.01, 10, 100),'binarize':np.linspace(0.0, 1, 10)}\nclf2 = GridSearchCV(nb, n_parameters,scoring='roc_auc',cv=5)\nclf2.fit(f_new,train_labels)\npreds=clf2.predict(dev_new)\nprint(clf2.best_params_)\nprint(roc_auc_score(dev_labels, preds, average='micro'))"

In [430]:
'''from sklearn.ensemble import GradientBoostingClassifier
GBC = GradientBoostingClassifier(
    n_estimators=500, 
    learning_rate=0.01, 
    max_depth=4, 
    random_state=123)

GBC.fit(f_new,train_labels)
preds=GBC.predict(dev_new)
roc_auc_score(dev_labels, preds, average='micro')'''

"from sklearn.ensemble import GradientBoostingClassifier\nGBC = GradientBoostingClassifier(\n    n_estimators=500, \n    learning_rate=0.01, \n    max_depth=4, \n    random_state=123)\n\nGBC.fit(f_new,train_labels)\npreds=GBC.predict(dev_new)\nroc_auc_score(dev_labels, preds, average='micro')"

In [431]:
'''lsvm=LSVC(penalty='l2',dual=True)
l_param={'C':np.linspace(0.001,1,100),'class_weight':['balanced',None]}
csvm=GridSearchCV(lsvm,l_param,scoring='roc_auc',cv=5)
S=StandardScaler(with_mean=False)
csvm.fit(S.fit_transform(f_new),train_labels)
preds=csvm.predict(S.transform(dev_new))
roc_auc_score(dev_labels, preds, average='micro')'''

"lsvm=LSVC(penalty='l2',dual=True)\nl_param={'C':np.linspace(0.001,1,100),'class_weight':['balanced',None]}\ncsvm=GridSearchCV(lsvm,l_param,scoring='roc_auc',cv=5)\nS=StandardScaler(with_mean=False)\ncsvm.fit(S.fit_transform(f_new),train_labels)\npreds=csvm.predict(S.transform(dev_new))\nroc_auc_score(dev_labels, preds, average='micro')"

In [603]:
import xgboost as xgb
#create dmatrices
dtrain = xgb.DMatrix(f_new, train_labels)
dtest = xgb.DMatrix(dev_new
                         , dev_labels)

#booster parameter
param = {'max_depth':15, 'eta': .015, 'silent': 1, 'objective': 'binary:logistic'
         , 'scale_pos_weight': 3.06,'max_delta_step':1,'subsample':.9,'seed':42}#9 depth if sublin false
param['nthread'] = 4
param['eval_metric'] = 'auc'

#specify validation set to watch performance
evallist = [(dtest, 'eval'), (dtrain, 'train')]

#train model
num_round = 100
bst = xgb.train(param.items(), dtrain, num_round, evallist)


[0]	eval-auc:0.547878	train-auc:0.83709
[1]	eval-auc:0.602936	train-auc:0.930538
[2]	eval-auc:0.62236	train-auc:0.952502
[3]	eval-auc:0.621624	train-auc:0.955419
[4]	eval-auc:0.625415	train-auc:0.957615
[5]	eval-auc:0.628228	train-auc:0.957443
[6]	eval-auc:0.625498	train-auc:0.960893
[7]	eval-auc:0.631743	train-auc:0.965212
[8]	eval-auc:0.630839	train-auc:0.974724
[9]	eval-auc:0.636509	train-auc:0.975798
[10]	eval-auc:0.637229	train-auc:0.978529
[11]	eval-auc:0.640691	train-auc:0.982643
[12]	eval-auc:0.643705	train-auc:0.983878
[13]	eval-auc:0.642286	train-auc:0.984687
[14]	eval-auc:0.647578	train-auc:0.985296
[15]	eval-auc:0.645646	train-auc:0.985778
[16]	eval-auc:0.649005	train-auc:0.987255
[17]	eval-auc:0.651447	train-auc:0.986858
[18]	eval-auc:0.656924	train-auc:0.988852
[19]	eval-auc:0.655896	train-auc:0.988894
[20]	eval-auc:0.655699	train-auc:0.98926
[21]	eval-auc:0.654856	train-auc:0.989358
[22]	eval-auc:0.654947	train-auc:0.990026
[23]	eval-auc:0.655666	train-auc:0.990416
[24]	

In [533]:
test_mat=construct_ft_mat(test_json_df)
test_mat['full_text']=test_data['request_text_edit_aware'] +' '+test_data['request_title']
test_text_clean=test_data['full_text'].apply(lambda s: pre_proccess(s))
for n in narratives:
    test_mat[n[0]]=test_text_clean.apply(lambda s: find_narr(n,s))
    
lda_testbag_of_words = vectorizer_lda.transform(test_data['full_text'])    
test_labels = test_json_df.requester_received_pizza.astype(int).as_matrix()

test_bag_of_words = vectorizer.fit_transform(test_data['full_text'])


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=10,
        ngram_range=(1, 1),
        preprocessor=<function pre_proccess at 0x10efe5620>,
        stop_words='english', strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)