### Loading Required Modules

In [3]:
import pandas as pd
import numpy as np
import os
import re

# =============================================================================
# Web Scraping Utilities
# =============================================================================

import requests
from bs4 import BeautifulSoup

# =============================================================================
# Scikit-Learn Utitlies
# =============================================================================
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


# =============================================================================
# Exploratory Data Analysis
# =============================================================================

import sweetviz as sv


# =============================================================================
# ML Models Utilities
# =============================================================================
import xgboost as xgb
import lightgbm as lgb
import pickle




# =============================================================================
# Tqdm utlities
# =============================================================================
from tqdm import tqdm_notebook, tnrange
from tqdm.auto import tqdm
tqdm.pandas(desc='Progress')


# =============================================================================
# Other Utilities
# =============================================================================

pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999

pd.set_option('display.max_colwidth', -1)



### Load Datasets

In [30]:
## Train
train_text = open(r"./datasets/sentiment/train_text.txt", "r", encoding="utf-8")
train_labels = open(r"./datasets/sentiment/train_labels.txt", "r", encoding="utf-8")

## Validation
val_text = open(r"./datasets/sentiment/val_text.txt", "r", encoding="utf-8")
val_labels = open(r"./datasets/sentiment/val_labels.txt", "r", encoding="utf-8")

### Test
test_text = open(r"./datasets/sentiment/test_text.txt", "r", encoding="utf-8")
test_labels = open(r"./datasets/sentiment/test_labels.txt", "r", encoding="utf-8")

In [31]:
def load_datasets(text_file, label_file):
    out_ls = []
    for label, text in zip(label_file, text_file):
        i = text.strip().split("\n")
        j = label.strip().split("\n")
        out_ls.append([i[0], j[0]])

    output_df = pd.DataFrame(out_ls, columns = ["full_text", "sentiment"])
    
    return output_df

In [32]:
train_df = load_datasets(train_text, train_labels)
print("There are {} Rows and {} Columns in Train Dataset".format(train_df.shape[0], train_df.shape[1]))
val_df = load_datasets(val_text, val_labels)
print("There are {} Rows and {} Columns in Validation Dataset".format(val_df.shape[0], val_df.shape[1]))
test_df = load_datasets(test_text, test_labels)
print("There are {} Rows and {} Columns in Test Dataset".format(test_df.shape[0], test_df.shape[1]))

There are 45615 Rows and 2 Columns in Train Dataset
There are 2000 Rows and 2 Columns in Validation Dataset
There are 12284 Rows and 2 Columns in Test Dataset


In [34]:
train_df["data_flag"] = "TRAIN"
val_df["data_flag"] = "VAL"
test_df["data_flag"] = "TEST"

In [35]:
combined_df = pd.concat([train_df, val_df, test_df], axis=0, ignore_index=True)

In [36]:
combined_df.head()

Unnamed: 0,full_text,sentiment,data_flag
0,"""QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin""",2,TRAIN
1,"""Ben Smith / Smith (concussion) remains out of the lineup Thursday, Curtis #NHL #SJ""",1,TRAIN
2,Sorry bout the stream last night I crashed out but will be on tonight for sure. Then back to Minecraft in pc tomorrow night.,1,TRAIN
3,Chase Headley's RBI double in the 8th inning off David Price snapped a Yankees streak of 33 consecutive scoreless innings against Blue Jays,1,TRAIN
4,"@user Alciato: Bee will invest 150 million in January, another 200 in the Summer and plans to bring Messi by 2017""",2,TRAIN


### Data Preprocessing

In [10]:
from nltk.stem.porter import * 
stemmer = PorterStemmer()
import string

def text_preprocessing(input_txt):
    
    input_txt = str(input_txt)
    ## Remove specific pattern words
    pattern = "@[\w]*"
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
    
    input_txt = re.sub(r'http\S+|www\.\S+', '', input_txt)
    
    for punct in '?!.,"$%\'()*+-/:;<=>@[\\]^_`{|}~&' + '“”’':
        input_txt = input_txt.replace(punct, ' ')
    
    input_txt = input_txt.replace("[^a-zA-Z#]", " ") 
    
    if bool(re.search(r'\d', input_txt)):
        input_txt = re.sub('[0-9]', ' ', input_txt)
            
    ## Removing Short Words:
    input_txt = ' '.join([w for w in input_txt.split() if len(w)>3])
    input_txt = input_txt.replace('"', "")
    input_txt = input_txt.replace("'", "")
    input_txt = input_txt.split()
    
    # Normalise the tokenized words:
    input_txt = " ".join([stemmer.stem(i) for i in input_txt])
    
    return input_txt.strip()

In [92]:
combined_df["full_cleaned_text"] = combined_df["full_text"].progress_apply(lambda i: text_preprocessing(i))

Progress:   0%|          | 0/59899 [00:00<?, ?it/s]

In [93]:
combined_df.head()

Unnamed: 0,full_text,sentiment,data_flag,full_cleaned_text
0,"""QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin""",2,TRAIN,origin draft book remu lupin surviv battl hogwart #happybirthdayremuslupin
1,"""Ben Smith / Smith (concussion) remains out of the lineup Thursday, Curtis #NHL #SJ""",1,TRAIN,smith smith concuss remain lineup thursday curti #nhl
2,Sorry bout the stream last night I crashed out but will be on tonight for sure. Then back to Minecraft in pc tomorrow night.,1,TRAIN,sorri bout stream last night crash will tonight sure then back minecraft tomorrow night
3,Chase Headley's RBI double in the 8th inning off David Price snapped a Yankees streak of 33 consecutive scoreless innings against Blue Jays,1,TRAIN,chase headley doubl inning david price snap yanke streak consecut scoreless inning against blue jay
4,"@user Alciato: Bee will invest 150 million in January, another 200 in the Summer and plans to bring Messi by 2017""",2,TRAIN,alciato will invest million januari anoth summer plan bring messi


### Modeling Data Preparation

In [104]:
model_train_df = combined_df[combined_df["data_flag"]=="TRAIN"][["sentiment", "full_cleaned_text"]]
model_val_df = combined_df[combined_df["data_flag"]=="VAL"][["sentiment", "full_cleaned_text"]]
model_test_df = combined_df[combined_df["data_flag"]=="TEST"][["sentiment", "full_cleaned_text"]]

In [102]:
transform_tfidf = TfidfVectorizer(ngram_range=(1,3), # ngrams - unigram to trigram
                                min_df=3, # removing terms that appear too infrequently
                                max_df=0.9, # removing terms that appear too frequently
                                strip_accents='unicode', # unicode characters
                                use_idf=1, # Enable inverse-document-frequency reweighting
                                smooth_idf=1, # Smooth idf weights by adding one to document frequencies
                                sublinear_tf=1).fit(combined_df["full_cleaned_text"]) # Apply sublinear tf scaling; replace tf with 1 + log(tf).

In [103]:
import pickle
pickle.dump(transform_tfidf, open("twitter_sentiment_tfidf.pkl", "wb"))

In [105]:
# =============================================================================
# Transforming training & testing & validation data
# =============================================================================

train_transform = transform_tfidf.transform(model_train_df["full_cleaned_text"]) # Transforming training data
val_transform = transform_tfidf.transform(model_val_df["full_cleaned_text"]) # Transforming training data
test_transform = transform_tfidf.transform(model_test_df["full_cleaned_text"]) # Transforming validation data

In [107]:
y_train = model_train_df["sentiment"]
y_val = model_val_df["sentiment"]
y_test = model_test_df["sentiment"]

In [112]:
def run_xgb(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=2017, num_rounds=500):
    """
    Function to Run XGBoost Model (Binary Classification)
    """
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = 'mlogloss'
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    return model    

In [113]:
model_xgb = run_xgb(train_transform, y_train, val_transform, y_val)
n_rounds = model_xgb.best_ntree_limit

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-mlogloss:1.07760	test-mlogloss:1.07941
[1]	train-mlogloss:1.06023	test-mlogloss:1.06311
[2]	train-mlogloss:1.04392	test-mlogloss:1.04742
[3]	train-mlogloss:1.03002	test-mlogloss:1.03467
[4]	train-mlogloss:1.01820	test-mlogloss:1.02324
[5]	train-mlogloss:1.00743	test-mlogloss:1.01328
[6]	train-mlogloss:0.99770	test-mlogloss:1.00389
[7]	train-mlogloss:0.98922	test-mlogloss:0.99639
[8]	train-mlogloss:0.98189	test-mlogloss:0.98908
[9]	train-mlogloss:0.97515	test-mlogloss:0.98318
[10]	train-mlogloss:0.96923	test-mlogloss:0.97757
[11]	train-mlogloss:0.96352	test-mlogloss:0.97224
[12]	train-mlogloss:0.95838	test-mlogloss:0.96737
[13]	train-mlogloss:0.95348	test-mlogloss:0.96313
[14]	train-mlogloss:0.94903	

[156]	train-mlogloss:0.78991	test-mlogloss:0.84208
[157]	train-mlogloss:0.78935	test-mlogloss:0.84151
[158]	train-mlogloss:0.78870	test-mlogloss:0.84141
[159]	train-mlogloss:0.78814	test-mlogloss:0.84144
[160]	train-mlogloss:0.78762	test-mlogloss:0.84126
[161]	train-mlogloss:0.78702	test-mlogloss:0.84050
[162]	train-mlogloss:0.78651	test-mlogloss:0.84012
[163]	train-mlogloss:0.78599	test-mlogloss:0.83990
[164]	train-mlogloss:0.78548	test-mlogloss:0.83969
[165]	train-mlogloss:0.78492	test-mlogloss:0.83931
[166]	train-mlogloss:0.78441	test-mlogloss:0.83918
[167]	train-mlogloss:0.78382	test-mlogloss:0.83885
[168]	train-mlogloss:0.78328	test-mlogloss:0.83876
[169]	train-mlogloss:0.78270	test-mlogloss:0.83844
[170]	train-mlogloss:0.78220	test-mlogloss:0.83815
[171]	train-mlogloss:0.78165	test-mlogloss:0.83783
[172]	train-mlogloss:0.78108	test-mlogloss:0.83743
[173]	train-mlogloss:0.78057	test-mlogloss:0.83711
[174]	train-mlogloss:0.78012	test-mlogloss:0.83678
[175]	train-mlogloss:0.77959	te

[317]	train-mlogloss:0.72142	test-mlogloss:0.80565
[318]	train-mlogloss:0.72112	test-mlogloss:0.80533
[319]	train-mlogloss:0.72080	test-mlogloss:0.80518
[320]	train-mlogloss:0.72048	test-mlogloss:0.80496
[321]	train-mlogloss:0.72013	test-mlogloss:0.80481
[322]	train-mlogloss:0.71983	test-mlogloss:0.80464
[323]	train-mlogloss:0.71951	test-mlogloss:0.80447
[324]	train-mlogloss:0.71921	test-mlogloss:0.80432
[325]	train-mlogloss:0.71891	test-mlogloss:0.80438
[326]	train-mlogloss:0.71857	test-mlogloss:0.80420
[327]	train-mlogloss:0.71820	test-mlogloss:0.80420
[328]	train-mlogloss:0.71788	test-mlogloss:0.80421
[329]	train-mlogloss:0.71758	test-mlogloss:0.80402
[330]	train-mlogloss:0.71730	test-mlogloss:0.80382
[331]	train-mlogloss:0.71695	test-mlogloss:0.80358
[332]	train-mlogloss:0.71662	test-mlogloss:0.80352
[333]	train-mlogloss:0.71629	test-mlogloss:0.80333
[334]	train-mlogloss:0.71591	test-mlogloss:0.80324
[335]	train-mlogloss:0.71557	test-mlogloss:0.80340
[336]	train-mlogloss:0.71519	te

[478]	train-mlogloss:0.67615	test-mlogloss:0.78619
[479]	train-mlogloss:0.67590	test-mlogloss:0.78619
[480]	train-mlogloss:0.67558	test-mlogloss:0.78614
[481]	train-mlogloss:0.67528	test-mlogloss:0.78587
[482]	train-mlogloss:0.67505	test-mlogloss:0.78575
[483]	train-mlogloss:0.67479	test-mlogloss:0.78587
[484]	train-mlogloss:0.67457	test-mlogloss:0.78578
[485]	train-mlogloss:0.67433	test-mlogloss:0.78580
[486]	train-mlogloss:0.67409	test-mlogloss:0.78565
[487]	train-mlogloss:0.67379	test-mlogloss:0.78568
[488]	train-mlogloss:0.67353	test-mlogloss:0.78554
[489]	train-mlogloss:0.67329	test-mlogloss:0.78556
[490]	train-mlogloss:0.67310	test-mlogloss:0.78546
[491]	train-mlogloss:0.67289	test-mlogloss:0.78548
[492]	train-mlogloss:0.67266	test-mlogloss:0.78542
[493]	train-mlogloss:0.67242	test-mlogloss:0.78546
[494]	train-mlogloss:0.67215	test-mlogloss:0.78551
[495]	train-mlogloss:0.67193	test-mlogloss:0.78541
[496]	train-mlogloss:0.67170	test-mlogloss:0.78526
[497]	train-mlogloss:0.67141	te

In [118]:
target_num_map = {"negative":0,
                 "neutral":1,
                 "positive":2}

In [119]:
###############################################################################################################
# Prediction on Validation set to check the threshold:
###############################################################################################################
prob_validation = model_xgb.predict(xgb.DMatrix(test_transform), ntree_limit = n_rounds)
prob_validation = pd.DataFrame(prob_validation, columns = list(target_num_map.keys()))

val_pred = prob_validation.idxmax(axis=1)
val_pred = np.array(val_pred.apply(lambda x: target_num_map[x]))

In [127]:
print(classification_report([int(i) for i in np.array(y_test)], val_pred))

              precision    recall  f1-score   support

           0       0.78      0.22      0.35      3972
           1       0.56      0.86      0.68      5937
           2       0.56      0.47      0.51      2375

    accuracy                           0.58     12284
   macro avg       0.63      0.52      0.51     12284
weighted avg       0.63      0.58      0.54     12284



In [129]:
pickle.dump(model_xgb, open("twitter_sentiment_xgb_model.pkl", "wb"))

In [20]:
## Load TF-IDF Model:
tfidf_model = pickle.load(open("sentiment_analysis_v2/models/twitter_sentiment_tfidf.pkl", 'rb'))
## Load XGBoost Model:
xgboost_model = pickle.load(open("sentiment_analysis_v2/models/twitter_sentiment_xgb_model.pkl", 'rb'))

In [21]:
input_object = [
              {
                "id" : "1",
                "full_text" : "what a pathetic display of service and logistics. Till now I haven’t received the product. There is no update on the app since 22nd Mar’21. I will suggest everyone to delete Flipkart app and try Amazon. @jagograhakjago Act and protect us"
              },
              {
                "id" : "2",
                "full_text" : "@Flipkart @flipkartsupport @ekartlogistics 24th Mar’21 is almost over & you’ve failed in your commitment twice. So please don’t say about service and commitment at all. Moreover spend some time in training your Ekart logistics employees so that they can serve customers better."
              },
              {
                "id" : "3",
                "full_text" : "@Benioff @MichaelDell @Dell Are you guys working on that SPAC together ?"
              }
            ]

In [22]:
LABELS = ['negative', 'neutral', 'positive']

In [23]:
output_object = []
## Iterate through each input object
for item in input_object:
    text = item["full_text"]
    text = text_preprocessing(text)
    text_transform = tfidf_model.transform([text])
    scores = xgboost_model.predict(xgb.DMatrix(text_transform), ntree_limit = xgboost_model.best_ntree_limit)[0]
    out_dict = {}
    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    for i in range(scores.shape[0]):
        l = LABELS[ranking[i]]
        s = scores[ranking[i]]
        out_dict[l] = s  
#     item["sentiment"] = max(out_dict, key=out_dict.get) 
#     item["sentiment_score"] =  np.round(out_dict[item["sentiment"]]*100,2)
    del item["full_text"]
#     output_object.append(item)
    output_object.append(out_dict)

In [24]:
output_object

[{'negative': 0.5752041, 'neutral': 0.24172834, 'positive': 0.18306759},
 {'positive': 0.35401323, 'negative': 0.33458912, 'neutral': 0.3113976},
 {'neutral': 0.50865966, 'positive': 0.32150555, 'negative': 0.16983478}]