## Model Building

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
import string
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import xgboost as xgb

In [None]:
#load data
# Pipe-delimited file with a header row. Malformed rows are skipped with a
# warning instead of aborting the load.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; its
# replacement is `on_bad_lines`. 'warn' reproduces what error_bad_lines=False
# did with the default warn_bad_lines=True (skip the row, emit a warning).
try:
    df = pd.read_table('reviews.csv', header=0, on_bad_lines='warn', delimiter='|')
except TypeError:
    # pandas < 1.3 does not accept `on_bad_lines`; fall back to the legacy flag.
    df = pd.read_table('reviews.csv', header=0, error_bad_lines=False, delimiter='|')

#split into train and test
# 70% train, stratified on the `label` column, fixed seed for reproducibility.
# NOTE(review): duplicates are only dropped from `train` after this split, so a
# review that appears twice in the file can end up in both parts -- confirm
# this is acceptable.
train, test = train_test_split(df, train_size = 0.7, random_state = 111, stratify=df.label)

In [None]:
# Drop exact duplicate rows from the training part only, then renumber the
# rows of both parts from 0 (the old index is discarded, not kept as a column).
train = train.drop_duplicates().reset_index(drop=True)
test = test.reset_index(drop=True)

In [None]:
#TOKENIZATION

#regular expressions replacer
import re

# Ordered (pattern, replacement) pairs. Each pair is applied to the whole text
# with re.sub, in list order, so later rules see the output of earlier ones.
# The rules run on the raw text (before lower-casing), so the contraction
# rules only match the lower-case spellings written here.
# Replacement templates are raw strings: '\g' is not a valid Python string
# escape, and the non-raw spelling triggers an invalid-escape warning.
replacement_patterns = [
    #reductions
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'i\'m', 'i am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', r'\g<1> will'),
    (r'(\w+)n\'t', r'\g<1> not'),
    (r'(\w+)\'ve', r'\g<1> have'),
    (r'(\w+)\'s', r'\g<1> is'),
    (r'(\w+)\'re', r'\g<1> are'),
    (r'(\w+)\'d', r'\g<1> would'),
    #quotes at the beginning and end
    (r'^["\'](.+)["\']$', r'\g<1>'),
    #identity nouns - names of films and actors
    # anything between a pair of quote characters is removed
    (r'["\'].+?["\']',''),
    # A capitalised word that follows another word is removed. The rule is
    # listed twice: one pass turns 'with Tom Hanks' into 'with Hanks', the
    # second pass then removes the remaining capitalised word.
    (r'(\w+)\s+[A-Z][a-z]+', r'\g<1>'),
    (r'(\w+)\s+[A-Z][a-z]+', r'\g<1>'),
    #removing brackets
    (r'[\[\(\)\]]',''),
    #removing dates
    # space-delimited 2-4 digit numbers, optional leading apostrophe and trailing 's'
    (r' [\']?\d{2,4}s? ',' ')
]

def replace(text):
    """Rewrite ``text`` with every rule in ``replacement_patterns``, in order.

    Each (pattern, replacement) pair is applied with ``re.sub`` to the output
    of the previous pair; the fully rewritten string is returned.
    """
    result = text
    for rule in replacement_patterns:
        regex, substitute = rule
        result = re.sub(regex, substitute, result)
    return result

#tokenizer
# Tokens that tokenize_text drops from its output. Written as one
# whitespace-separated string and split into a list; order and the repeated
# 'you your yours' entries are kept exactly as they were.
stopwords_list = (
    #pronouns
    'i me my mine myself '
    'you your yours yourself '
    'he him his himself '
    'she her hers herself '
    'it its itself '
    'we us our ours ourselves '
    'you your yours yourselves '
    'they them their theirs themselves '
    #articles
    'a an '
    #forms of be
    'is am are was were will be '

    'to'
).split()
    
               
def tokenize_text(text):
    """Turn one raw review into a list of stemmed, stop-word-filtered tokens.

    Steps, in order: apply the regex rewrites in ``replace``, lower-case the
    result, split it with NLTK's ``word_tokenize``, stem every token with the
    English Snowball stemmer, then drop tokens found in ``stopwords_list``.

    NOTE(review): stop words are compared against the *stemmed* tokens, so any
    list entry that the stemmer alters would never match -- confirm this is
    intended.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Remaining stemmed tokens, in their original order.
    """
    replaced = replace(text).lower()
    # Build the stemmer once per document instead of once per token.
    stemmer = SnowballStemmer('english')
    stems = [stemmer.stem(w) for w in word_tokenize(replaced)]
    # Set membership is constant-time; the filtered result is unchanged.
    stopwords = set(stopwords_list)
    return [word for word in stems if word not in stopwords]

#vectorization
def build_feature_matrices(X):
    """Fit a bag-of-words vectorizer on ``X`` and return the dense counts.

    The vectorizer tokenizes with ``tokenize_text``, counts unigrams and
    bigrams, and is capped at 4000 features.

    Parameters
    ----------
    X : iterable of str
        Raw documents to fit on and transform.

    Returns
    -------
    X_transform : numpy.ndarray
        Dense document-term count matrix, one row per document.
    features_voc : list of str
        Feature names produced by the fitted vectorizer.
    """
    vectorizer = CountVectorizer(tokenizer=tokenize_text,
                                 max_features=4000, ngram_range=(1,2))
    X_transform = vectorizer.fit_transform(X).toarray()
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2. Prefer its replacement, falling back for older installations.
    # list() keeps the return type a plain list, as before.
    if hasattr(vectorizer, 'get_feature_names_out'):
        features_voc = list(vectorizer.get_feature_names_out())
    else:
        features_voc = vectorizer.get_feature_names()
    return X_transform, features_voc



In [None]:
#NEW FEATURES

#review length
def review_length(X):
    """Return the length of every review in ``X``.

    Parameters
    ----------
    X : iterable of sized objects
        For example a list or a pandas Series of strings.

    Returns
    -------
    list of int
        ``len(item)`` for each item, in iteration order.
    """
    # Iterate over the values themselves. Indexing with X[i] is label-based on
    # a pandas Series and fails whenever the index is not exactly 0..n-1
    # (e.g. straight after train_test_split, before reset_index).
    return [len(text) for text in X]

In [None]:
# Fit the vectorizer on the training reviews, then put each review's length
# in front of the count columns as column 0.
X_train_transform, features = build_feature_matrices(train.text)
X_train_transform = np.insert(
    arr=X_train_transform,
    obj=0,
    values=review_length(train['text']),
    axis=1,
)

In [None]:
#XGBOOST

#convert data to xgb format
dtrain = xgb.DMatrix(data=X_train_transform, label=train.label)

#set parameters
param = dict(objective='binary:logistic')

#train model
# number of boosting rounds passed to xgb.train
num_round = 10
bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=num_round)

## Model Testing

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
import string
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [None]:
#TOKENIZATION

#regular expressions replacer
import re

# Ordered (pattern, replacement) pairs. Each pair is applied to the whole text
# with re.sub, in list order, so later rules see the output of earlier ones.
# The rules run on the raw text (before lower-casing), so the contraction
# rules only match the lower-case spellings written here.
# Replacement templates are raw strings: '\g' is not a valid Python string
# escape, and the non-raw spelling triggers an invalid-escape warning.
replacement_patterns = [
    #reductions
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'i\'m', 'i am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', r'\g<1> will'),
    (r'(\w+)n\'t', r'\g<1> not'),
    (r'(\w+)\'ve', r'\g<1> have'),
    (r'(\w+)\'s', r'\g<1> is'),
    (r'(\w+)\'re', r'\g<1> are'),
    (r'(\w+)\'d', r'\g<1> would'),
    #quotes at the beginning and end
    (r'^["\'](.+)["\']$', r'\g<1>'),
    #identity nouns - names of films and actors
    # anything between a pair of quote characters is removed
    (r'["\'].+?["\']',''),
    # A capitalised word that follows another word is removed. The rule is
    # listed twice: one pass turns 'with Tom Hanks' into 'with Hanks', the
    # second pass then removes the remaining capitalised word.
    (r'(\w+)\s+[A-Z][a-z]+', r'\g<1>'),
    (r'(\w+)\s+[A-Z][a-z]+', r'\g<1>'),
    #removing brackets
    (r'[\[\(\)\]]',''),
    #removing dates
    # space-delimited 2-4 digit numbers, optional leading apostrophe and trailing 's'
    (r' [\']?\d{2,4}s? ',' ')
]

def replace(text):
    """Rewrite ``text`` with every rule in ``replacement_patterns``, in order.

    Each (pattern, replacement) pair is applied with ``re.sub`` to the output
    of the previous pair; the fully rewritten string is returned.
    """
    result = text
    for rule in replacement_patterns:
        regex, substitute = rule
        result = re.sub(regex, substitute, result)
    return result

#tokenizer
# Tokens that tokenize_text drops from its output. Written as one
# whitespace-separated string and split into a list; order and the repeated
# 'you your yours' entries are kept exactly as they were.
stopwords_list = (
    #pronouns
    'i me my mine myself '
    'you your yours yourself '
    'he him his himself '
    'she her hers herself '
    'it its itself '
    'we us our ours ourselves '
    'you your yours yourselves '
    'they them their theirs themselves '
    #articles
    'a an '
    #forms of be
    'is am are was were will be '

    'to'
).split()
                
def tokenize_text(text):
    """Turn one raw review into a list of stemmed, stop-word-filtered tokens.

    Steps, in order: apply the regex rewrites in ``replace``, lower-case the
    result, split it with NLTK's ``word_tokenize``, stem every token with the
    English Snowball stemmer, then drop tokens found in ``stopwords_list``.

    NOTE(review): stop words are compared against the *stemmed* tokens, so any
    list entry that the stemmer alters would never match -- confirm this is
    intended.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Remaining stemmed tokens, in their original order.
    """
    replaced = replace(text).lower()
    # Build the stemmer once per document instead of once per token.
    stemmer = SnowballStemmer('english')
    stems = [stemmer.stem(w) for w in word_tokenize(replaced)]
    # Set membership is constant-time; the filtered result is unchanged.
    stopwords = set(stopwords_list)
    return [word for word in stems if word not in stopwords]

def build_feature_matrices_test(X, vocabulary=None, ngram_range=(1, 2)):
    """Vectorize ``X`` against an already-known vocabulary.

    Parameters
    ----------
    X : iterable of str
        Raw documents to transform.
    vocabulary : iterable of str or mapping, optional
        Terms that define the output columns. When omitted, the notebook-level
        ``features`` variable (returned by the training vectorizer) is used.
    ngram_range : tuple of (int, int), optional
        n-gram sizes to extract. Must match the range the vocabulary was
        learned with; the training vectorizer uses (1, 2).

    Returns
    -------
    numpy.ndarray
        Dense document-term count matrix, one column per vocabulary term.
    """
    if vocabulary is None:
        # Backward-compatible default: the global set by the training cell.
        vocabulary = features
    # The vocabulary was learned with unigrams AND bigrams. With the default
    # ngram_range=(1, 1) no bigram is ever generated here, so every bigram
    # column would be all zeros on the test set.
    vectorizer = CountVectorizer(tokenizer=tokenize_text,
                                 vocabulary=vocabulary,
                                 ngram_range=ngram_range)
    # With a fixed vocabulary nothing is learned from X, so transform() is
    # sufficient and makes it explicit that the test data is never fitted on.
    X_transform = vectorizer.transform(X).toarray()
    return X_transform

In [None]:
#NEW FEATURES

#review length
def review_length(X):
    """Return the length of every review in ``X``.

    Parameters
    ----------
    X : iterable of sized objects
        For example a list or a pandas Series of strings.

    Returns
    -------
    list of int
        ``len(item)`` for each item, in iteration order.
    """
    # Iterate over the values themselves. Indexing with X[i] is label-based on
    # a pandas Series and fails whenever the index is not exactly 0..n-1
    # (e.g. straight after train_test_split, before reset_index).
    return [len(text) for text in X]

In [None]:
# Vectorize the test reviews, then put each review's length in front of the
# count columns as column 0.
X_test_transform = build_feature_matrices_test(test.text)
X_test_transform = np.insert(
    arr=X_test_transform,
    obj=0,
    values=review_length(test['text']),
    axis=1,
)

In [None]:
#XGBOOST

#convert data to xgb format
dtest = xgb.DMatrix(X_test_transform,label=test['label'])

# Scores from the booster trained with the 'binary:logistic' objective.
predictions_prob = bst.predict(dtest)

# Scores strictly above the cut-off become class 1, everything else class 0.
# NOTE(review): assumes the labels are encoded as 0/1 -- confirm.
DECISION_THRESHOLD = 0.55
predictions = [1 if x > DECISION_THRESHOLD else 0 for x in predictions_prob]

# accuracy_score is documented as (y_true, y_pred). The value is the same in
# either order, but the documented order keeps the call correct if it is later
# swapped for an asymmetric metric.
accuracy_score(test['label'], predictions)