### Loading Required Modules

In [1]:
import pandas as pd
import numpy as np
import os
import re

# =============================================================================
# Web Scraping Utilities
# =============================================================================

import requests
from bs4 import BeautifulSoup

# =============================================================================
# Scikit-Learn Utitlies
# =============================================================================
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


# =============================================================================
# Exploratory Data Analysis
# =============================================================================

import sweetviz as sv


# =============================================================================
# ML Models Utilities
# =============================================================================
import xgboost as xgb
import lightgbm as lgb
import pickle




# =============================================================================
# Tqdm utlities
# =============================================================================
from tqdm import tqdm_notebook, tnrange
from tqdm.auto import tqdm
tqdm.pandas(desc='Progress')


# =============================================================================
# Other Utilities
# =============================================================================

pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999

pd.set_option('display.max_colwidth', -1)



### Load Datasets

In [165]:
## Train
train_text = open(r"./datasets/sentiment/train_text.txt", "r", encoding="utf-8")
train_labels = open(r"./datasets/sentiment/train_labels.txt", "r", encoding="utf-8")

## Validation
val_text = open(r"./datasets/sentiment/val_text.txt", "r", encoding="utf-8")
val_labels = open(r"./datasets/sentiment/val_labels.txt", "r", encoding="utf-8")

### Test
test_text = open(r"./datasets/sentiment/test_text.txt", "r", encoding="utf-8")
test_labels = open(r"./datasets/sentiment/test_labels.txt", "r", encoding="utf-8")

In [166]:
def load_datasets(text_file, label_file):
    out_ls = []
    for label, text in zip(label_file, text_file):
        i = text.strip().split("\n")
        j = label.strip().split("\n")
        out_ls.append([i[0], j[0]])

    output_df = pd.DataFrame(out_ls, columns = ["full_text", "sentiment"])
    
    return output_df

In [167]:
train_df = load_datasets(train_text, train_labels)
print("There are {} Rows and {} Columns in Train Dataset".format(train_df.shape[0], train_df.shape[1]))
val_df = load_datasets(val_text, val_labels)
print("There are {} Rows and {} Columns in Validation Dataset".format(val_df.shape[0], val_df.shape[1]))
test_df = load_datasets(test_text, test_labels)
print("There are {} Rows and {} Columns in Test Dataset".format(test_df.shape[0], test_df.shape[1]))

There are 45615 Rows and 2 Columns in Train Dataset
There are 2000 Rows and 2 Columns in Validation Dataset
There are 12284 Rows and 2 Columns in Test Dataset


In [169]:
add_train_df = pd.read_excel("./datasets/sentiment/sentiment_neg_v1.xlsx")

In [170]:
train_df["data_flag"] = "TRAIN"
add_train_df["data_flag"] = "TRAIN"
val_df["data_flag"] = "VAL"
test_df["data_flag"] = "TEST"

In [171]:
combined_df = pd.concat([train_df, add_train_df, val_df, test_df], axis=0, ignore_index=True)

In [172]:
combined_df.head()

Unnamed: 0,full_text,sentiment,data_flag
0,"""QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin""",2,TRAIN
1,"""Ben Smith / Smith (concussion) remains out of the lineup Thursday, Curtis #NHL #SJ""",1,TRAIN
2,Sorry bout the stream last night I crashed out but will be on tonight for sure. Then back to Minecraft in pc tomorrow night.,1,TRAIN
3,Chase Headley's RBI double in the 8th inning off David Price snapped a Yankees streak of 33 consecutive scoreless innings against Blue Jays,1,TRAIN
4,"@user Alciato: Bee will invest 150 million in January, another 200 in the Summer and plans to bring Messi by 2017""",2,TRAIN


### Data Preprocessing

In [173]:
from nltk.stem.porter import * 
stemmer = PorterStemmer()
import string

def text_preprocessing(input_txt):
    
    input_txt = str(input_txt)
    ## Remove specific pattern words
    pattern = "@[\w]*"
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
    
    input_txt = re.sub(r'http\S+|www\.\S+', '', input_txt)
    
    for punct in '?!.,"$%\'()*+-/:;<=>@[\\]^_`{|}~&' + '“”’':
        input_txt = input_txt.replace(punct, ' ')
    
    input_txt = input_txt.replace("[^a-zA-Z#]", " ") 
    
    if bool(re.search(r'\d', input_txt)):
        input_txt = re.sub('[0-9]', ' ', input_txt)
            
    input_txt = input_txt.replace('"', "")
    input_txt = input_txt.replace("'", "")
    input_txt = input_txt.split()
    
    # Normalise the tokenized words:
    input_txt = " ".join([stemmer.stem(i) for i in input_txt])
    
    return input_txt.lower().strip()

In [174]:
combined_df['sentiment'] = combined_df['sentiment'].astype(int)

In [175]:
combined_df[combined_df['data_flag']=="TRAIN"]['sentiment'].value_counts(normalize=True)

1    0.371723
2    0.320944
0    0.307333
Name: sentiment, dtype: float64

In [176]:
combined_df[combined_df['data_flag']=="TRAIN"]['sentiment'].value_counts()

1    20673
2    17849
0    17092
Name: sentiment, dtype: int64

In [177]:
combined_df["full_cleaned_text"] = combined_df["full_text"].progress_apply(lambda i: text_preprocessing(i))

Progress:   0%|          | 0/69898 [00:00<?, ?it/s]

In [178]:
combined_df.head()

Unnamed: 0,full_text,sentiment,data_flag,full_cleaned_text
0,"""QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin""",2,TRAIN,qt in the origin draft of the th book remu lupin surviv the battl of hogwart #happybirthdayremuslupin
1,"""Ben Smith / Smith (concussion) remains out of the lineup Thursday, Curtis #NHL #SJ""",1,TRAIN,ben smith smith concuss remain out of the lineup thursday curti #nhl #sj
2,Sorry bout the stream last night I crashed out but will be on tonight for sure. Then back to Minecraft in pc tomorrow night.,1,TRAIN,sorri bout the stream last night i crash out but will be on tonight for sure then back to minecraft in pc tomorrow night
3,Chase Headley's RBI double in the 8th inning off David Price snapped a Yankees streak of 33 consecutive scoreless innings against Blue Jays,1,TRAIN,chase headley s rbi doubl in the th inning off david price snap a yanke streak of consecut scoreless inning against blue jay
4,"@user Alciato: Bee will invest 150 million in January, another 200 in the Summer and plans to bring Messi by 2017""",2,TRAIN,alciato bee will invest million in januari anoth in the summer and plan to bring messi by


In [179]:
combined_df[combined_df['data_flag']=="TRAIN"]['sentiment'].value_counts()

1    20673
2    17849
0    17092
Name: sentiment, dtype: int64

### Modeling Data Preparation

In [180]:
model_train_df = combined_df[combined_df["data_flag"]=="TRAIN"][["sentiment", "full_cleaned_text"]]
model_val_df = combined_df[combined_df["data_flag"]=="VAL"][["sentiment", "full_cleaned_text"]]
model_test_df = combined_df[combined_df["data_flag"]=="TEST"][["sentiment", "full_cleaned_text"]]

In [181]:
transform_tfidf = TfidfVectorizer(ngram_range=(1,3), # ngrams - unigram to trigram
                                min_df=3, # removing terms that appear too infrequently
                                max_df=0.9, # removing terms that appear too frequently
                                strip_accents='unicode', # unicode characters
                                use_idf=1, # Enable inverse-document-frequency reweighting
                                smooth_idf=1, # Smooth idf weights by adding one to document frequencies
                                sublinear_tf=1).fit(combined_df["full_cleaned_text"]) # Apply sublinear tf scaling; replace tf with 1 + log(tf).

In [182]:
import pickle
pickle.dump(transform_tfidf, open("twitter_sentiment_tfidf.pkl", "wb"))

In [183]:
# =============================================================================
# Transforming training & testing & validation data
# =============================================================================

train_transform = transform_tfidf.transform(model_train_df["full_cleaned_text"]) # Transforming training data
val_transform = transform_tfidf.transform(model_val_df["full_cleaned_text"]) # Transforming training data
test_transform = transform_tfidf.transform(model_test_df["full_cleaned_text"]) # Transforming validation data

In [184]:
y_train = model_train_df["sentiment"]
y_val = model_val_df["sentiment"]
y_test = model_test_df["sentiment"]

In [185]:
def run_xgb(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=2017, num_rounds=500):
    """
    Function to Run XGBoost Model (Binary Classification)
    """
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = 'mlogloss'
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    return model    

In [186]:
model_xgb = run_xgb(train_transform, y_train, val_transform, y_val)
n_rounds = model_xgb.best_ntree_limit

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-mlogloss:1.08320	test-mlogloss:1.08784
[1]	train-mlogloss:1.06975	test-mlogloss:1.07805
[2]	train-mlogloss:1.05692	test-mlogloss:1.06933
[3]	train-mlogloss:1.04547	test-mlogloss:1.06075
[4]	train-mlogloss:1.03362	test-mlogloss:1.05261
[5]	train-mlogloss:1.02245	test-mlogloss:1.04528
[6]	train-mlogloss:1.01236	test-mlogloss:1.03933
[7]	train-mlogloss:1.00422	test-mlogloss:1.03465
[8]	train-mlogloss:0.99703	test-mlogloss:1.02933
[9]	train-mlogloss:0.98859	test-mlogloss:1.02482
[10]	train-mlogloss:0.98232	test-mlogloss:1.02038
[11]	train-mlogloss:0.97516	test-mlogloss:1.01607
[12]	train-mlogloss:0.96847	test-mlogloss:1.01276
[13]	train-mlogloss:0.96270	test-mlogloss:1.00893
[14]	train-mlogloss:0.95777	

[156]	train-mlogloss:0.72172	test-mlogloss:0.86332
[157]	train-mlogloss:0.72099	test-mlogloss:0.86253
[158]	train-mlogloss:0.72022	test-mlogloss:0.86226
[159]	train-mlogloss:0.71948	test-mlogloss:0.86201
[160]	train-mlogloss:0.71868	test-mlogloss:0.86167
[161]	train-mlogloss:0.71793	test-mlogloss:0.86091
[162]	train-mlogloss:0.71724	test-mlogloss:0.86036
[163]	train-mlogloss:0.71645	test-mlogloss:0.85972
[164]	train-mlogloss:0.71571	test-mlogloss:0.85905
[165]	train-mlogloss:0.71499	test-mlogloss:0.85853
[166]	train-mlogloss:0.71427	test-mlogloss:0.85811
[167]	train-mlogloss:0.71358	test-mlogloss:0.85771
[168]	train-mlogloss:0.71282	test-mlogloss:0.85746
[169]	train-mlogloss:0.71205	test-mlogloss:0.85710
[170]	train-mlogloss:0.71135	test-mlogloss:0.85674
[171]	train-mlogloss:0.71070	test-mlogloss:0.85662
[172]	train-mlogloss:0.71006	test-mlogloss:0.85605
[173]	train-mlogloss:0.70943	test-mlogloss:0.85593
[174]	train-mlogloss:0.70869	test-mlogloss:0.85561
[175]	train-mlogloss:0.70804	te

[317]	train-mlogloss:0.62907	test-mlogloss:0.81930
[318]	train-mlogloss:0.62867	test-mlogloss:0.81906
[319]	train-mlogloss:0.62826	test-mlogloss:0.81873
[320]	train-mlogloss:0.62787	test-mlogloss:0.81851
[321]	train-mlogloss:0.62747	test-mlogloss:0.81840
[322]	train-mlogloss:0.62705	test-mlogloss:0.81811
[323]	train-mlogloss:0.62666	test-mlogloss:0.81806
[324]	train-mlogloss:0.62627	test-mlogloss:0.81797
[325]	train-mlogloss:0.62585	test-mlogloss:0.81798
[326]	train-mlogloss:0.62544	test-mlogloss:0.81786
[327]	train-mlogloss:0.62499	test-mlogloss:0.81765
[328]	train-mlogloss:0.62445	test-mlogloss:0.81766
[329]	train-mlogloss:0.62407	test-mlogloss:0.81745
[330]	train-mlogloss:0.62361	test-mlogloss:0.81722
[331]	train-mlogloss:0.62308	test-mlogloss:0.81707
[332]	train-mlogloss:0.62265	test-mlogloss:0.81678
[333]	train-mlogloss:0.62225	test-mlogloss:0.81644
[334]	train-mlogloss:0.62185	test-mlogloss:0.81652
[335]	train-mlogloss:0.62146	test-mlogloss:0.81631
[336]	train-mlogloss:0.62105	te

[478]	train-mlogloss:0.57026	test-mlogloss:0.79472
[479]	train-mlogloss:0.56993	test-mlogloss:0.79459
[480]	train-mlogloss:0.56968	test-mlogloss:0.79460
[481]	train-mlogloss:0.56940	test-mlogloss:0.79460
[482]	train-mlogloss:0.56908	test-mlogloss:0.79455
[483]	train-mlogloss:0.56872	test-mlogloss:0.79429
[484]	train-mlogloss:0.56845	test-mlogloss:0.79394
[485]	train-mlogloss:0.56817	test-mlogloss:0.79377
[486]	train-mlogloss:0.56779	test-mlogloss:0.79359
[487]	train-mlogloss:0.56741	test-mlogloss:0.79345
[488]	train-mlogloss:0.56713	test-mlogloss:0.79329
[489]	train-mlogloss:0.56684	test-mlogloss:0.79321
[490]	train-mlogloss:0.56656	test-mlogloss:0.79327
[491]	train-mlogloss:0.56628	test-mlogloss:0.79312
[492]	train-mlogloss:0.56598	test-mlogloss:0.79291
[493]	train-mlogloss:0.56569	test-mlogloss:0.79289
[494]	train-mlogloss:0.56540	test-mlogloss:0.79289
[495]	train-mlogloss:0.56506	test-mlogloss:0.79280
[496]	train-mlogloss:0.56464	test-mlogloss:0.79280
[497]	train-mlogloss:0.56427	te

In [187]:
target_num_map = {"negative":0,
                 "neutral":1,
                 "positive":2}

In [188]:
###############################################################################################################
# Prediction on Validation set to check the threshold:
###############################################################################################################
prob_validation = model_xgb.predict(xgb.DMatrix(test_transform), ntree_limit = n_rounds)
prob_validation = pd.DataFrame(prob_validation, columns = list(target_num_map.keys()))

val_pred = prob_validation.idxmax(axis=1)
val_pred = np.array(val_pred.apply(lambda x: target_num_map[x]))

In [189]:
# precision    recall  f1-score   support

#            0       0.78      0.22      0.35      3972
#            1       0.56      0.86      0.68      5937
#            2       0.56      0.47      0.51      2375

#     accuracy                           0.58     12284
#    macro avg       0.63      0.52      0.51     12284
# weighted avg       0.63      0.58      0.54     12284

######## remove 3 rule ############
#               precision    recall  f1-score   support

#            0       0.76      0.23      0.35      3972
#            1       0.56      0.86      0.68      5937
#            2       0.59      0.49      0.54      2375

#     accuracy                           0.59     12284
#    macro avg       0.64      0.53      0.52     12284
# weighted avg       0.63      0.59      0.55     12284

print(classification_report([int(i) for i in np.array(y_test)], val_pred))

              precision    recall  f1-score   support

           0       0.37      0.82      0.51      3972
           1       0.60      0.24      0.34      5937
           2       0.65      0.29      0.40      2375

    accuracy                           0.44     12284
   macro avg       0.54      0.45      0.42     12284
weighted avg       0.54      0.44      0.41     12284



In [190]:
pickle.dump(model_xgb, open("twitter_sentiment_xgb_model.pkl", "wb"))

In [191]:
## Load TF-IDF Model:
tfidf_model = pickle.load(open("twitter_sentiment_tfidf.pkl", 'rb'))
## Load XGBoost Model:
xgboost_model = pickle.load(open("twitter_sentiment_xgb_model.pkl", 'rb'))

In [212]:
input_object = [
              {
                "id" : "1",
                "full_text" : "We are pleased to work with this @TCS client building #Cleantech in 🇨🇦. Congratulations to @QDSolar on your recent funding! 🥳 https://t.co/oghGwcQstZ"
              },
              {
                "id" : "2",
                "full_text" : "I got a call from manager and he told me it is compulsory to dispatch the product even @ekartlogistics are not going to deliver. I am not expecting such type of service by @Flipkart ."
              },
              {
                "id" : "3",
                "full_text" : "I have fantastic experience with @delhivery for large/Heavy Items, that's 5* but for others its just shame.. By the way @Flipkart @ekartlogistics (Del exec Mr Manoranjan Sahoo , Ref Id FMPN0001555570 , who has been consistently serving 5* , a big shutout to his service)."
              }
            ]

In [213]:
LABELS = ['negative', 'neutral', 'positive']

In [214]:
output_object = []
## Iterate through each input object
for item in input_object:
    text = item["full_text"]
    text = text_preprocessing(text)
    text_transform = tfidf_model.transform([text])
    scores = xgboost_model.predict(xgb.DMatrix(text_transform), ntree_limit = xgboost_model.best_ntree_limit)[0]
    out_dict = {}
    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    for i in range(scores.shape[0]):
        l = LABELS[ranking[i]]
        s = scores[ranking[i]]
        out_dict[l] = s  
#     item["sentiment"] = max(out_dict, key=out_dict.get) 
#     item["sentiment_score"] =  np.round(out_dict[item["sentiment"]]*100,2)
    del item["full_text"]
#     output_object.append(item)
    output_object.append(out_dict)

In [215]:
output_object

[{'positive': 0.56263685, 'negative': 0.2664164, 'neutral': 0.17094672},
 {'negative': 0.81238824, 'neutral': 0.13774158, 'positive': 0.049870174},
 {'positive': 0.4366645, 'negative': 0.36912733, 'neutral': 0.19420813}]