## Sentiment analysis

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
import string
from scipy import sparse
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [2]:
#load data
df = pd.concat([pd.read_csv("reviews_rt.csv", sep="|"), 
        pd.read_csv("imdb_small.csv", sep="|")], ignore_index=True, copy=False)
df = shuffle(df)
df = df.reset_index(drop=True)
print('Dataset loaded.')
print('Dataset has the following number of positive/negative reviews:')
print(df.label.value_counts())

Dataset loaded.
Dataset has the following number of positive/negative reviews:
1    89658
0    62952
Name: label, dtype: int64


In [3]:
#split into train and test
train, test = train_test_split(df, train_size = 0.7, random_state = 111, stratify=df.label)
del df

In [4]:
#remove duplicate
train = train.drop_duplicates()

#reset indices
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
print('Train and test sets are ready.')
print('Train set has',train.shape[0],'observations.')
print('Test set has',test.shape[0],'observations.')

Train and test sets are ready.
Train set has 106436 observations.
Test set has 45783 observations.


In [5]:
#regular expressions replacer
import re
replacement_patterns = [
#reductions
(r'won\'t', 'will not'),
(r'can\'t', 'cannot'),
(r'i\'m', 'i am'),
(r'ain\'t', 'is not'),
(r'(\w+)\'ll', '\g<1> will'),
(r'(\w+)n\'t', '\g<1> not'),
(r'(\w+)\'ve', '\g<1> have'),
(r'(\w+)\'s', '\g<1> is'),
(r'(\w+)\'re', '\g<1> are'),
(r'(\w+)\'d', '\g<1> would'),
#auotes at the beginig and end
(r'^["\'](.+)["\']$','\g<1>'),
#identity nouns - names of films and actors
(r'["\'].+?["\']',''),
(r'(\w+)\s+[A-Z][a-z]+','\g<1>'),
(r'(\w+)\s+[A-Z][a-z]+','\g<1>'),
#removing brackets
(r'[\[\(\)\]]',''),
#removing dates
(r' [\']?\d{2,4}s? ',' ')
]

def replace(text):
    for (pattern, repl) in replacement_patterns:
        text = re.sub(pattern, repl, text)
    return text

In [6]:
#tokenizer
stopwords_list = stopwords.words('english')  
def tokenize_text(text):
    replaced = replace(text).lower()
    words = word_tokenize(replaced)
    words = [SnowballStemmer('english').stem(w) for w in words]
    words = [word for word in words if word not in stopwords_list]
    return words

#vectorization
def build_feature_matrices(X):
    vectorizer = CountVectorizer(tokenizer=tokenize_text, max_features=6000, ngram_range=(1,2))
    X_transform = vectorizer.fit_transform(X).toarray()
    features_voc = vectorizer.get_feature_names()
    return X_transform, features_voc

def build_feature_matrices_test(X):
    vectorizer = CountVectorizer(tokenizer=tokenize_text, vocabulary = features)
    X_transform = vectorizer.fit_transform(X).toarray()
    return X_transform

In [7]:
#Train set transformation
print('Train set:')
print('Vectorizing set...')
X_train_transform, features = build_feature_matrices(train['text'])

#NEW FEATURES
print('Extracting features...')
#review length
X_train_transform = np.insert(X_train_transform, 0, train.text.apply(lambda s: len(s)), axis=1)
#Number of exclamation marks
X_train_transform = np.insert(X_train_transform, 0, train.text.apply(lambda s: len([x for x in s if x=='!'])), axis=1)
#Number of questions marks
X_train_transform = np.insert(X_train_transform, 0, train.text.apply(lambda s: len([x for x in s if x=='?'])), axis=1)
#Number of tokens
X_train_transform = np.insert(X_train_transform, 0, train.text.apply(lambda s: len(word_tokenize(s))), axis=1)
#Number of sentences
X_train_transform = np.insert(X_train_transform, 0, train.text.apply(lambda s: len(sent_tokenize(s))), axis=1)
#sentence polarity
X_train_transform = np.insert(X_train_transform, 0, 
            train.text.apply(lambda s: TextBlob(sent_tokenize(s)[0]).sentiment.polarity), axis=1)
X_train_transform = np.insert(X_train_transform, 0, 
            train.text.apply(lambda s: TextBlob(sent_tokenize(s)[-1]).sentiment.polarity), axis=1)

print("X matrix has", X_train_transform.shape[1], "variables.")

#to sparse
X_train_transform = sparse.csr_matrix(X_train_transform)

Train set:
Vectorizing set...
Extracting features...
X matrix has 6007 variables.


In [8]:
#Test set transformation
print('Test set:')
print('Vectorizing set...')
X_test_transform = build_feature_matrices_test(test['text'])

#NEW FEATURES
print('Extracting features...')
X_test_transform = np.insert(X_test_transform, 0, test.text.apply(lambda s: len(s)), axis=1)
X_test_transform = np.insert(X_test_transform, 0, test.text.apply(lambda s: len([x for x in s if x=='!'])), axis=1)
X_test_transform = np.insert(X_test_transform, 0, test.text.apply(lambda s: len([x for x in s if x=='?'])), axis=1)
X_test_transform = np.insert(X_test_transform, 0, test.text.apply(lambda s: len(word_tokenize(s))), axis=1)
X_test_transform = np.insert(X_test_transform, 0, test.text.apply(lambda s: len(sent_tokenize(s))), axis=1)
X_test_transform = np.insert(X_test_transform, 0, 
            test.text.apply(lambda s: TextBlob(sent_tokenize(s)[0]).sentiment.polarity), axis=1)
X_test_transform = np.insert(X_test_transform, 0, 
            test.text.apply(lambda s: TextBlob(sent_tokenize(s)[-1]).sentiment.polarity), axis=1)

print("X matrix has", X_test_transform.shape[1], "variables.")

#to sparse
X_test_transform = sparse.csr_matrix(X_test_transform)

Test set:
Vectorizing set...
Extracting features...
X matrix has 6007 variables.


In [9]:
#connvert data to xgb format
dtrain = xgb.DMatrix(X_train_transform,label=train['label'])
dtest = xgb.DMatrix(X_test_transform[25000:],label=test['label'][25000:])
dval = xgb.DMatrix(X_test_transform[:25000],label=test['label'][:25000])
del X_train_transform,X_test_transform
print("Datasets were converted to xgb format.")

Datasets were converted to xgb format.


In [16]:
#set parameters
param = {
        'eta': 0.2,
        'objective':'binary:logistic',
        'max_depth':15,
        'seed':123,
        'eval_metric':'error',
        'min_child_weight':7
        }

In [18]:
#train and evaluation set
evallist  = [(dval,'eval'), (dtrain,'train')]

#train model
num_round = 1000
print("Model training...")
bst = xgb.train(param, dtrain, num_round, evallist)
print("Done.")

Model training...
[0]	eval-error:0.31752	train-error:0.301289
[1]	eval-error:0.30932	train-error:0.292223
[2]	eval-error:0.30556	train-error:0.286059
[3]	eval-error:0.30404	train-error:0.283748
[4]	eval-error:0.3014	train-error:0.280244
[5]	eval-error:0.2994	train-error:0.278581
[6]	eval-error:0.2946	train-error:0.272389
[7]	eval-error:0.29372	train-error:0.270003
[8]	eval-error:0.29096	train-error:0.266348
[9]	eval-error:0.289	train-error:0.262073
[10]	eval-error:0.28628	train-error:0.259602
[11]	eval-error:0.28352	train-error:0.255365
[12]	eval-error:0.281	train-error:0.252903
[13]	eval-error:0.2792	train-error:0.249681
[14]	eval-error:0.2772	train-error:0.247557
[15]	eval-error:0.276	train-error:0.245622
[16]	eval-error:0.27412	train-error:0.243451
[17]	eval-error:0.27216	train-error:0.242268
[18]	eval-error:0.27116	train-error:0.240605
[19]	eval-error:0.27076	train-error:0.238716
[20]	eval-error:0.26936	train-error:0.23773
[21]	eval-error:0.2674	train-error:0.235193
[22]	eval-error

In [None]:
#parameter tuning (example for eta)
#gridsearch = {}
#param_range = [0.2, 0.25]
#for parameter in param_range:
#    evallist  = [(dval,'eval'), (dtrain,'train')]
#    param['eta']=parameter
#    bst = xgb.train(param, dtrain, num_round, evallist )
#    predictions = [1 if x > 0.5 else 0 for x in bst.predict(dval)]
#    score = accuracy_score(predictions, test['label'][:10000])
#    gridsearch[parameter]=score

#gridsearch = [(value, key) for key, value in gridsearch.items()]
#print("Best parameter: ", max(gridsearch)[1])
#print("Best accuracy: ", max(gridsearch)[0])

In [19]:
#predict
predictions = bst.predict(dtest) 
predictions = [1 if x > 0.5 else 0 for x in predictions] 
score = accuracy_score(predictions, test['label'][25000:])
print("Test set accuracy: ",score)

Test set accuracy:  0.802290333446
