#Import necessary packages

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from string import punctuation
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
import re
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


#Loading the dataset using Pandas Data frame

In [3]:
df=pd.read_csv(r'/content/Comments.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Comment,Sentiment
0,0,Hahaha😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂...,Neutral
1,1,He is moaning like his butt got fucked 😂😂😂😂,Negative
2,2,So not cool! Btw he tapped out twice and you s...,Negative
3,3,Best part. There there. Pat's on the back.,Positive
4,4,he sounds like a bata-male. or a women.,Positive


#Data Preprocessing
i)clean comments

ii)text_preprocessing

ii)remove stopwords,punctuations

iii)stemming

iv)lemmatization

v)tokenization



In [5]:
#cleaning comments by removing emojis,links and html tags
def clean_comment(comment):
    # Remove emojis
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    comment = emoji_pattern.sub(r'', comment)

    # Remove links
    comment = re.sub(r'http\S+', '', comment)

    # Remove HTML tags
    comment = BeautifulSoup(comment, "html.parser").get_text()
    comment = re.sub(r'<a\s+(?:[^>]*?\s+)?href=(["\'])(.*?)\1[^>]*>(.*?)</a>', r'\3', comment)

    # Remove extra whitespace
    comment = re.sub('\s+', ' ', comment).strip()

    return comment

In [8]:
stop_words = stopwords.words('english')
porter_stemmer = PorterStemmer()
lzr = WordNetLemmatizer()

In [9]:
#removing negative words
stopwords=[
    'no', 'very', "don't", "aren't", "couldn't", "didn't",
    "doesn't", "hadn't", "hasn't", "haven't", "isn't", "mightn't",
    "mustn't", "needn't", "shan't", "shouldn't", "wasn't", "weren't",
    'won', "won't", "wouldn't"
]
stop_words=[word for word in stop_words if word not in stopwords]

In [10]:
def text_processing(text):
    # convert text into lowercase
    text = text.lower()

    # remove new line characters in text
    text = re.sub(r'\n',' ', text)

    # remove punctuations from text
    text = re.sub('[%s]' % re.escape(punctuation), "", text)

    # remove references and hashtags from text
    text = re.sub("^a-zA-Z0-9$,.", "", text)

    # remove multiple spaces from text
    text = re.sub(r'\s+', ' ', text, flags=re.I)

    # remove special characters from text
    text = re.sub(r'\W', ' ', text)

    text = ' '.join([word for word in word_tokenize(text) if word not in stop_words])

    # stemming using porter stemmer from nltk package - msh a7sn 7aga - momken: lancaster, snowball
    text=' '.join([porter_stemmer.stem(word) for word in word_tokenize(text)])

    # lemmatizer using WordNetLemmatizer from nltk package
    text=' '.join([lzr.lemmatize(word) for word in word_tokenize(text)])

    return text

In [15]:
#copying the original data
df_copy=df.copy()
df_copy['Comment']=df_copy['Comment'].apply(text_processing)

In [16]:
df_copy

Unnamed: 0.1,Unnamed: 0,Comment,Sentiment
0,0,hahaha,Neutral
1,1,moan like butt got fuck,Negative
2,2,cool btw tap twice still help beyond danger ev...,Negative
3,3,best part pat back,Positive
4,4,sound like batamal woman,Positive
...,...,...,...
7722,7722,hi,Neutral
7723,7723,bet,Neutral
7724,7724,that funni,Positive
7725,7725,nice vid,Positive


In [18]:
df_copy['Sentiment'].value_counts()

Sentiment
Positive    2954
Neutral     2434
Negative    2339
Name: count, dtype: int64

In [20]:
#encoding sentiment to 0:Negative,1:Neutral,2:Positive
df_copy['encoded_sentiment']=df['Sentiment'].map({'Positive':2,'Neutral':1,'Negative':0})
df_copy

Unnamed: 0.1,Unnamed: 0,Comment,Sentiment,encoded_sentiment
0,0,hahaha,Neutral,1
1,1,moan like butt got fuck,Negative,0
2,2,cool btw tap twice still help beyond danger ev...,Negative,0
3,3,best part pat back,Positive,2
4,4,sound like batamal woman,Positive,2
...,...,...,...,...
7722,7722,hi,Neutral,1
7723,7723,bet,Neutral,1
7724,7724,that funni,Positive,2
7725,7725,nice vid,Positive,2


In [21]:
df_neutral=df_copy[df_copy['Sentiment']=='Neutral']
df_positive=df_copy[df_copy['Sentiment']=='Positive']
df_negative=df_copy[df_copy['Sentiment']=='Negative']

In [22]:
X_train=df_copy['Comment']
X_test=df_copy['Comment']
Y_train=df_copy['encoded_sentiment']
Y_test=df_copy['encoded_sentiment']

In [23]:
def custom_tokenizer(text):
    return text.split()
# Initialize TfidfVectorizer with the custom tokenizer
tfidf = TfidfVectorizer(tokenizer=custom_tokenizer)
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [29]:
#save the vectorixed file
import pickle
import pickle
# Save the vectorizer to a file
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

#Train the  XG Boost model using the training data


In [24]:
#training the model
xg=XGBClassifier()
xg.fit(X_train_tfidf,Y_train)

In [25]:
accuracy_scores = {}
xg_pred = xg.predict(X_test_tfidf)
print(confusion_matrix(Y_test,xg_pred))
print(classification_report(Y_test,xg_pred))
accuracy_scores['LR'] = accuracy_score(Y_test,xg_pred)

[[1987  211  141]
 [  52 2289   93]
 [  78  197 2679]]
              precision    recall  f1-score   support

           0       0.94      0.85      0.89      2339
           1       0.85      0.94      0.89      2434
           2       0.92      0.91      0.91      2954

    accuracy                           0.90      7727
   macro avg       0.90      0.90      0.90      7727
weighted avg       0.90      0.90      0.90      7727



#Pickle file to save the model

In [27]:
import pickle

In [28]:
file_name='xgboost_model.sav'
pickle.dump(xg,open(file_name,'wb'))