# HACKTHON 4

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import string

from tqdm import tqdm
from collections import Counter
from spacy.matcher import Matcher
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from nltk.tokenize import WordPunctTokenizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

import os
import re
import nltk
import spacy

cpu_count = int(os.cpu_count()) if os.cpu_count() != None else 4

In [2]:
%matplotlib inline
warnings.simplefilter("ignore")

## 1 - EXTRA Pipeline

In [3]:
df_train = pd.read_csv('./data/book_review_labelled_data.csv')
df_train.drop(["overall"], axis=1,inplace=True)
df_train.head(3)

Unnamed: 0,reviewerID,reviewerName,reviewText,summary,reviewTime,rates_count,helpful_count,rating
0,A3UPFTGAWZ3G2R,David J. Loftus,"Jenkins, a history professor and Member of Par...","Quite readable, nicely done","12 6, 2001",40,37,4
1,A1XTKTLNSCRLDS,Ellen Rappaport,Detective Inspector Erlendur Sveinsson is at h...,Mesmerizing in depth,"02 23, 2014",0,0,5
2,A1A77B6DQQH436,"crescamp ""esc""",I didn't read this. I purchased it for a gift...,10-minute life lessons for kids,"02 12, 2013",3,0,3


In [4]:
df_test = pd.read_csv('./data/book_review_test_data_unlabelled.csv')
df_test.head(3)


Unnamed: 0,reviewerID,reviewerName,reviewText,summary,reviewTime,rating
0,A2HESNQJZ9OB7H,Jen,So boring and stupid had a hard time finishing...,Unbelievable.,"02 16, 2014",1
1,A1ABXPSFA9PC8N,Ben Parker,Ill be the first to admit i'm not the best coo...,Easy and Clear Cooking,"11 7, 2012",5
2,AYVW3O6W8S5S4,Johnny in Texas,Doesn't tell you how to do anything... just s...,not bad,"02 25, 2014",3


# LABEL

In [5]:
def create_train_set(df, threshold=0.8):
    df_=df.copy()
    df_['label_aux'] = df_['helpful_count']/df_['rates_count']
    df_['label'] = (df_['label_aux'] >= threshold)
    df_.drop(["label_aux"], axis=1,inplace=True)
    df_['label'].value_counts(normalize=True)
    df_=df_[df_['rates_count']>0]
    
    return df_

In [6]:
print(df_train.shape)
df_train=create_train_set(df_train, 0.8)
print(df_train.shape)
df_train.head(3)

(49992, 8)
(28423, 9)


Unnamed: 0,reviewerID,reviewerName,reviewText,summary,reviewTime,rates_count,helpful_count,rating,label
0,A3UPFTGAWZ3G2R,David J. Loftus,"Jenkins, a history professor and Member of Par...","Quite readable, nicely done","12 6, 2001",40,37,4,True
2,A1A77B6DQQH436,"crescamp ""esc""",I didn't read this. I purchased it for a gift...,10-minute life lessons for kids,"02 12, 2013",3,0,3,False
3,AEAF4MRYHJZI,"Angelia Menchan ""acvermen.blogspot.com""",Fierce Angels by Sheri Park reads like a disse...,So FIERCE,"03 24, 2010",9,9,4,True


# EXTRA Features

In [7]:
def create_extra_features(df):
    df_ = df.copy()
    
    stop_words = nltk.corpus.stopwords.words('english')
    
    df_['reviewText_lower'] = df_['reviewText'].str.lower()
    df_['reviewText_lower_no_punctuation'] = df_['reviewText_lower'].apply(lambda x: re.sub(r'[^\w\s]','', x))
    df_['nb_words'] = df_['reviewText_lower_no_punctuation'].apply(lambda x: len(x.split()))
    df_['nb_stopwords'] = df_['reviewText_lower_no_punctuation'].apply(lambda x: len([word for word in x.split() if word in stop_words]))
    df_['avg_word_length'] = df_['reviewText_lower_no_punctuation'].apply(lambda x: np.mean([len(t) for t in x.split()]) if np.mean([len(t) for t in x.split()  if t not in stop_words]) > 0 else 0)
    tokenizer = WordPunctTokenizer()
    df_['reviewText_tokenized'] = df_['reviewText'].apply(lambda x: ' '.join(tokenizer.tokenize(x)))
    df_['nb_punctuation'] = df_['reviewText_tokenized'].apply(lambda x: len([punct for punct in x.split() if punct in string.punctuation]))
    df_['nb_punctuation_normalized'] = df_['nb_punctuation']/df_['nb_words']
    df_['nb_stopwords_normalized'] = df_['nb_stopwords']/df_['nb_words']
    
    df_ = df_.drop(['reviewText_lower', 'reviewText_lower_no_punctuation', 'reviewText_tokenized'], axis=1)
    new_cols = ['nb_words', 'nb_stopwords', 'avg_word_length', 'nb_punctuation', 'nb_punctuation_normalized', 'nb_stopwords_normalized']
    return df_, new_cols

In [10]:
df_train_processed, new_cols = create_extra_features(df_train)
df_test_processed, new_cols = create_extra_features(df_test)

In [11]:
new_cols

['nb_words',
 'nb_stopwords',
 'avg_word_length',
 'nb_punctuation',
 'nb_punctuation_normalized',
 'nb_stopwords_normalized']

## 1.3 - NLP

In [12]:
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe("merge_entities", after="ner")
en_stopwords = nlp.Defaults.stop_words

#### POS-Tagging Search

In [None]:
docs_train = list(tqdm(nlp.pipe(df_train["reviewText"], batch_size=20, n_process=cpu_count-1), total=len(df_train["reviewText"])))
docs_test = list(tqdm(nlp.pipe(df_test["reviewText"], batch_size=20, n_process=cpu_count-1), total=len(df_test["reviewText"])))

  3%|▎         | 983/28423 [00:41<25:23, 18.02it/s] 

In [45]:
def df_define(df):
    
    df_=df.copy()
    
    #if docs is None:
    docs_test = list(tqdm(nlp.pipe(df_["reviewText"], batch_size=20, n_process=cpu_count-1), total=len(df_["reviewText"])))

    ## Add adjectives and adverbs count
    matcher = Matcher(nlp.vocab)
    pattern = [[{'POS': 'ADJ'}], [{'POS': 'ADV'}]]
    matcher.add('LOC', pattern)


    nb_adj_adv = []
    for i, doc in enumerate(docs):
        matches = matcher(doc)
        count = 0
        for match_id, start, end in matches:
            span = doc[start:end]  # the matched span
            count +=1
        nb_adj_adv.append(count)
    
    df_["nb_adj_adv"] = nb_adj_adv
    
    
    df_["nb_words"] = df_['reviewText'].str.split().map(len)
    df_["doc_length"] = df_['reviewText'].map(len)
    df_["avg_word_length"] = df_['reviewText'].apply(lambda x: np.mean([len(t) for t in x.split()]) if len([len(t) for t in x.split(' ')]) > 0 else 0).fillna(0)
    
    return df_

In [21]:
df_train_processed = df_define(df=df_train, docs= docs_train)
df_train_processed.head(3)

df_test_processed = df_define(df=df_test, docs= docs_test)
df_test_processed.head(3)

Unnamed: 0,reviewerID,reviewerName,reviewText,summary,reviewTime,rating,nb_adj_adv,nb_words,doc_length,avg_word_length
0,A2HESNQJZ9OB7H,Jen,So boring and stupid had a hard time finishing...,Unbelievable.,"02 16, 2014",1,25,119,634,4.277311
1,A1ABXPSFA9PC8N,Ben Parker,Ill be the first to admit i'm not the best coo...,Easy and Clear Cooking,"11 7, 2012",5,12,82,435,4.280488
2,AYVW3O6W8S5S4,Johnny in Texas,Doesn't tell you how to do anything... just s...,not bad,"02 25, 2014",3,7,38,203,4.289474


### Features Unions

In [20]:
class Selector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a column from the dataframe to perform additional transformations on
    """ 
    def __init__(self, key):
        self.key = key
        
    def fit(self, X, y=None):
        return self
    

class TextSelector(Selector):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def transform(self, X):
        return X[self.key]
    
    
class NumberSelector(Selector):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def transform(self, X):
        return X[[self.key]]
    
    

# Pipeline

In [23]:
df_train_processed.head(3)

Unnamed: 0,reviewerID,reviewerName,reviewText,summary,reviewTime,rates_count,helpful_count,rating,label,nb_adj_adv,nb_words,doc_length,avg_word_length
0,A3UPFTGAWZ3G2R,David J. Loftus,"Jenkins, a history professor and Member of Par...","Quite readable, nicely done","12 6, 2001",40,37,4,True,49,258,1790,5.94186
2,A1A77B6DQQH436,"crescamp ""esc""",I didn't read this. I purchased it for a gift...,10-minute life lessons for kids,"02 12, 2013",3,0,3,False,2,25,117,3.68
3,AEAF4MRYHJZI,"Angelia Menchan ""acvermen.blogspot.com""",Fierce Angels by Sheri Park reads like a disse...,So FIERCE,"03 24, 2010",9,9,4,True,41,272,1557,4.724265


In [36]:
from pandas.api.types import is_numeric_dtype

def df_FeatureUnion(df, columns):
    df_=df.copy()
    for col in columns:
        if is_numeric_dtype(df_[col]):
            globals()[col+'_pipe'] = Pipeline([
                ('selector', NumberSelector(key=col)),
                ('standard', StandardScaler())
                ])
        else:
            globals()[col+'_pipe'] = Pipeline([
                ('selector', TextSelector(key=col)),
                ('tfidf', TfidfVectorizer())
                ])
    
    feats = FeatureUnion([('reviewText', reviewText_pipe), 
                          ('nb_adj_adv', nb_adj_adv_pipe),
                          ('nb_words', nb_words_pipe),
                          ('doc_length', doc_length_pipe),
                          ('avg_word_length', avg_word_length_pipe),
                          ('rating', avg_word_length_pipe)])    
    
    feature_processing = Pipeline([('feats', feats)])
    
    return feats    

In [38]:
feats = df_FeatureUnion(df_train_processed, ['rating', 'reviewText','nb_adj_adv','nb_words', 'doc_length', 'avg_word_length'])


In [39]:
def improved_pipeline(feats, X_train, X_val, y_train, y_val, X_test):
    """
    Train a Random Forest using sklearn's Pipeline and return the trained model and its accuracy in the test set.
    Don't forget to add the feats to the Pipeline!
    """
    
    pipe = Pipeline([
        ('features',feats),
        ('classifier', RandomForestClassifier()),
    ])
    
    pipe.fit(X_train, y_train)
    y_val_preds = pipe.predict(X_val)
    y_test_preds = pipe.predict(X_test)
    val_f1_score = f1_score(y_val, y_val_pred)

    return pipe, val_f1_score, y_test_preds

In [23]:
Y = df_processed["label"]
X = df_processed.drop(columns="label")

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)
pipeline_model, pipeline_acc, pipeline_prediction = improved_pipeline(feats, X_train, X_test, y_train, y_test)

pipeline_acc

184      ham
2171     ham
5422     ham
4113     ham
4588     ham
        ... 
1932     ham
5316     ham
2308    spam
1903    spam
763      ham
Name: label, Length: 4457, dtype: object


0.9757847533632287