#### Loading Python Libraries

In [1]:
import os
import re
import warnings

import joblib
import matplotlib.pyplot as plt
import neattext.functions as nfx
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


# NOTE(review): this silences every warning category for the rest of the
# session, so warnings raised by later cells will not be shown.
warnings.filterwarnings('ignore')
# Use seaborn's 'whitegrid' style for plots.
sns.set_style('whitegrid')

#### Loading the Dataset

In [2]:
# Read the refined dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../Dataset/Refined_Dataset.csv")
# Preview the first rows.
df.head()

Unnamed: 0,Author,UTC,Year,Month,ID,Comments,Score,Text,Title,Word_Count,Emotions
0,3dgyT33N666,2020-10-13 17:22:17,2020,October,jahyyq,0.0,1.0,im fucking im tired hell sad hell emotions con...,dear,25,"['negative', 'anger', 'disgust', 'fear', 'sadn..."
1,TallGhostXO,2019-12-18 01:24:12,2019,December,ec5gwf,3.0,1.0,hate fucking burning passion theres person ear...,fucking hate myself,53,"['anger', 'disgust', 'fear', 'negative', 'sadn..."
2,dojacool,2020-07-26 05:16:21,2020,July,hy19hz,4.0,1.0,conversation night what do chest crushed rock ...,met year ago shes russia im manila sad want die,95,"['anger', 'disgust', 'fear', 'negative', 'sadn..."
3,Klutzy_Lemon,2020-02-14 07:00:44,2020,February,f3oikf,0.0,1.0,[removed],birthday today,1,[]
4,Arandomoboy,2019-09-21 14:33:33,2019,September,d7b8ft,1.0,6.0,changed kept feels like control life strugglin...,im finally happy,24,"['disgust', 'fear', 'negative', 'sadness', 'an..."


#### Initialization of DataFrame

In [3]:
# Working frame with a single 'Text' column: title and body joined by one
# space, with missing values on either side replaced by '' first.
data = pd.DataFrame(
    {'Text': df['Title'].fillna('') + ' ' + df['Text'].fillna('')}
)

data.head()

Unnamed: 0,Text
0,dear im fucking im tired hell sad hell emotion...
1,fucking hate myself hate fucking burning passi...
2,met year ago shes russia im manila sad want di...
3,birthday today [removed]
4,im finally happy changed kept feels like contr...


#### Cleaning the Data

In [4]:
def clean(text):
    """Normalise a single text value.

    Missing values (as judged by ``pd.isna``) become an empty string.
    Otherwise the text is lower-cased, ``r/<word>`` tokens (e.g. ``r/python``)
    are removed, newlines are turned into spaces, and the neattext removers
    listed in the body are applied one after another, in the order given.
    Leading and trailing whitespace is stripped from the result.

    Parameters
    ----------
    text : str or missing value
        The raw text to normalise.

    Returns
    -------
    str
        The cleaned text, or ``""`` for a missing value.
    """
    if pd.isna(text):
        return ""

    lowered = text.lower()
    without_r_refs = re.sub(r'\br\/\w+', '', lowered)
    cleaned = without_r_refs.replace('\n', ' ')

    # The removers run strictly in this order; each receives the previous
    # remover's output.
    removers = (
        nfx.remove_emails,
        nfx.remove_stopwords,
        nfx.remove_urls,
        nfx.remove_userhandles,
        nfx.remove_phone_numbers,
        nfx.remove_emojis,
        nfx.remove_puncts,
        nfx.remove_multiple_spaces,
    )
    for remover in removers:
        cleaned = remover(cleaned)

    return cleaned.strip()

# Run clean() on every row of 'Text' and keep the result in a new column.
data['Clean_Text'] = data['Text'].apply(clean)
data.head()

Unnamed: 0,Text,Clean_Text
0,dear im fucking im tired hell sad hell emotion...,dear im fucking im tired hell sad hell emotion...
1,fucking hate myself hate fucking burning passi...,fucking hate hate fucking burning passion ther...
2,met year ago shes russia im manila sad want di...,met year ago shes russia im manila sad want di...
3,birthday today [removed],birthday today [removed]
4,im finally happy changed kept feels like contr...,im finally happy changed kept feels like contr...


#### Finding out the Null Values

In [5]:
# Count missing (NaN/None) entries per column; empty strings are not counted by isna().
print(data.isna().sum())

Text          0
Clean_Text    0
dtype: int64


#### Creation of Sentiment Label

In [6]:
# Module-level VADER analyzer instance; get_sentiment_label reads it as a global.
analyzer = SentimentIntensityAnalyzer()

def get_sentiment_label(text):
    """Classify a text as "Positive", "Negative" or "Neutral".

    The label is derived from the ``'compound'`` entry of
    ``analyzer.polarity_scores(text)`` (``analyzer`` is the module-level
    analyzer instance):

    * compound >= 0.05   -> "Positive"
    * compound <= -0.05  -> "Negative"
    * anything else      -> "Neutral"

    Parameters
    ----------
    text : str
        The text to score.

    Returns
    -------
    str
        One of "Positive", "Negative", "Neutral".
    """
    compound = analyzer.polarity_scores(text)['compound']

    if compound >= 0.05:
        return "Positive"
    if compound <= -0.05:
        return "Negative"
    return "Neutral"
    
# Label every cleaned text with get_sentiment_label and store it as 'Sentiment'.
data['Sentiment'] = data['Clean_Text'].apply(get_sentiment_label)
data.head()

Unnamed: 0,Text,Clean_Text,Sentiment
0,dear im fucking im tired hell sad hell emotion...,dear im fucking im tired hell sad hell emotion...,Negative
1,fucking hate myself hate fucking burning passi...,fucking hate hate fucking burning passion ther...,Negative
2,met year ago shes russia im manila sad want di...,met year ago shes russia im manila sad want di...,Positive
3,birthday today [removed],birthday today [removed],Neutral
4,im finally happy changed kept feels like contr...,im finally happy changed kept feels like contr...,Positive


In [7]:
# Number of rows per generated sentiment label.
data['Sentiment'].value_counts()

Sentiment
Negative    17461
Positive    10641
Neutral      1397
Name: count, dtype: int64

#### Input Features and Target Variables

In [8]:
# Model input: the cleaned text column.
x = data['Clean_Text']
# Target: the sentiment label column generated above from the same cleaned text.
y = data['Sentiment']

#### Splitting dataset into Training and Testing Set

In [9]:
# Hold out 20% of the rows for testing. The split is stratified on the label
# and uses a fixed seed so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

#### Finding out the Shapes of the Training and Testing Sets

In [10]:
# Print a caption, then let the cell's last expression display the shape tuple.
print('X-Train Shape :')
x_train.shape

X-Train Shape :


(23599,)

In [11]:
# Print a caption, then let the cell's last expression display the shape tuple.
print('Y-Train Shape :')
y_train.shape

Y-Train Shape :


(23599,)

In [12]:
# Print a caption, then let the cell's last expression display the shape tuple.
print('X-Test Shape :')
x_test.shape

X-Test Shape :


(5900,)

In [13]:
# Print a caption, then let the cell's last expression display the shape tuple.
print('Y-Test Shape :')
y_test.shape

Y-Test Shape :


(5900,)

#### Creation of the Pipeline

In [14]:
# Two-step text classifier: unigram TF-IDF features (at most 5000 terms)
# feeding a logistic regression with an iteration cap of 1000.
# A RandomForestClassifier(n_estimators=100, random_state=42) step named 'rfc'
# was previously left here as a disabled alternative to 'lor'.
model = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 1))),
    ('lor', LogisticRegression(max_iter=1000)),
])

#### Training the Model

In [15]:
# Fit both pipeline steps (vectorizer, then classifier) on the training split.
model.fit(x_train,y_train)

0,1,2
,steps,"[('tfidf', ...), ('lor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [16]:
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before persisting the fitted pipeline. Without this a
# fresh checkout fails here on Restart & Run All.
os.makedirs('../Models', exist_ok=True)
joblib.dump(model,'../Models/Model.pkl')

['../Models/Model.pkl']

In [17]:
# Persist the held-out test texts (presumably for evaluation elsewhere; the consumer is not shown here).
joblib.dump(x_test,'../Models/x_test.pkl')


['../Models/x_test.pkl']

In [18]:
# Persist the held-out test labels (presumably for evaluation elsewhere; the consumer is not shown here).
joblib.dump(y_test,'../Models/y_test.pkl')

['../Models/y_test.pkl']