# Sentiment Analysis using BoW with XGBoost
________

### Importing libraries

In [1]:
import re
import os
import numpy as np
import pandas as pd
import xgboost as xgb
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

## Function

##### *_Steps:_*
    #   1. Remove HTML
    #   2. Remove non-letters
    #   3. Convert to lower case
    #   4. Remove stopwords
    #   5. Return space joined texts

In [3]:
def text_cleaning(texts):
    # Initialize an empty list to hold the clean reviews
    clean_text = []
    
    for text in texts:
        text = BeautifulSoup(text, "lxml").get_text() 
        text = re.sub("[^a-zA-Z]", " ", text) 
        text = text.lower().split()                             
        stop_word_list = set(stopwords.words("english"))                  
        text = [word for word in text if not word in stop_word_list] 
        clean_text.append((" ".join(text)))
        
    return clean_text

# Data Exploration

In [4]:
train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3)

In [11]:
train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [13]:
test.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [7]:
print("Train shape:", train.shape, "\nTest shape:", test.shape)

Train shape: (25000, 3) 
Test shape: (25000, 2)


In [9]:
print ("Data format:-")
print("Train -", train.columns, "\nTest -", test.columns)

Data format:-
Train - Index(['id', 'sentiment', 'review'], dtype='object') 
Test - Index(['id', 'review'], dtype='object')


In [22]:
print("Train set sentiment ratio:-\n", train.sentiment.value_counts(dropna=False))

Train set sentiment ratio:-
 1    12500
0    12500
Name: sentiment, dtype: int64


# Data Cleaning

In [24]:
print ("Data cleaning...")
clean_train = text_cleaning(train['review'])
clean_test  = text_cleaning(test['review'])

Data cleaning...


# Feature Building 

In [27]:
print ("Building Bag-of-Words...")
vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, max_features = 5000) 
bow_train = (vectorizer.fit_transform(clean_train)).toarray()
bow_test = (vectorizer.transform(clean_test)).toarray()

Building Bag-of-Words...


 # Model Building

In [28]:
print ("Model building - XGB...")

params = {'max_depth': [3,5], 'n_estimators':[50,200]}
xgb_model = xgb.XGBClassifier(objective="binary:logistic",tree_method='gpu_hist', predictor='gpu_predictor')
BOW_XGB = GridSearchCV(xgb_model, param_grid=params, cv=3)

BOW_XGB.fit(bow_train, train.sentiment)

Model building - XGB...


In [46]:
BOW_XGB_scores = cross_val_score(BOW_XGB, bow_train, train.sentiment, cv=3, n_jobs=-1)
print("Averaged CV Accuracy: %0.2f (+/- %0.2f)" % (BOW_XGB_scores.mean(), BOW_XGB_scores.std() * 2))

Averaged CV Accuracy: 0.86 (+/- 0.01)


In [31]:
BOW_XGB.fit(bow_train, train.sentiment)

GridSearchCV(cv=3, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estim...
                                     objective='binary:logistic',
                                     predictor='gpu_predictor',
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
    

In [32]:
result = BOW_XGB.predict(bow_test)

In [38]:
result

array([1, 0, 1, ..., 0, 1, 1], dtype=int64)

In [33]:
submission = pd.DataFrame(data={"id":test["id"], "sentiment":result})
submission.to_csv("BagOfWord-XGB.csv", index=False, quoting=3)
print("Done")

Done


In [44]:
submission.head

<bound method NDFrame.head of                id  sentiment
0      "12311_10"          1
1        "8348_2"          0
2        "5828_4"          1
3        "7186_2"          1
4       "12128_7"          1
...           ...        ...
24995   "2155_10"          1
24996     "59_10"          1
24997    "2531_1"          0
24998    "7772_8"          1
24999  "11465_10"          1

[25000 rows x 2 columns]>