# Importing all the required libraries

In [112]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler,PowerTransformer,FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,LancasterStemmer,SnowballStemmer,wordnet
import re
from sklearn.naive_bayes import BernoulliNB,MultinomialNB,CategoricalNB
from sklearn.metrics import accuracy_score
from nltk.stem import WordNetLemmatizer
import pickle

# Importing the dataset

In [2]:
data = pd.read_csv(r"C:\Users\srava\Downloads\Emotion_classify_Data.csv")

In [3]:
data

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear
...,...,...
5932,i begun to feel distressed for you,fear
5933,i left feeling annoyed and angry thinking that...,anger
5934,i were to ever get married i d have everything...,joy
5935,i feel reluctant in applying there because i w...,fear


In [4]:
data1 = data.copy()

In [5]:
data['Emotion'].value_counts()

anger    2000
joy      2000
fear     1937
Name: Emotion, dtype: int64

# Dividing the Feature variable and class variable

In [32]:
fv = data.iloc[:,0]
cv = data.iloc[:,1]

In [33]:
# Splitting the data into train and test

In [34]:
x_train,x_test,y_train,y_test = train_test_split(fv,cv,test_size=0.2,random_state=42)

# Text preprocessing

In [35]:
# checking if there are any uppercase letters, html tags, urls and unwanted characters

In [39]:
def edat(data,name):
    case=" ".join(data[name]).islower()
    html_=data[name].apply(lambda x:True if re.search("<.+?>",x) else False).sum()
    url_=data[name].apply(lambda x:True if re.search("http[s]?://.+? +",x) else False).sum()
    unwanted_=data[name].apply(lambda x:True if re.search("[]()*\-.,@#$%^&0-9]",x) else False).sum()
    if case==False:
        print("not in lower case")
    if html_>0:
        print("have html tags")
    if url_>0:
        print("you are having urls")
    if unwanted_>0:
        print("you are having unwanted characters")
    else:
        print('The data is clean')

In [40]:
edat(data,'Comment')

The data is clean


In [41]:
# The data is clean

## Removing stop words

In [None]:
# removing stop words except 'not'

In [77]:
custom_stopwords = set(stopwords.words('english')) - {'not'}

In [78]:
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in custom_stopwords]
    return ' '.join(filtered_words)


In [82]:
x_train = x_train.apply(remove_stopwords)


# Lemmatizing the Text

In [95]:
lemmatizer = WordNetLemmatizer()

In [96]:
def lemmatize_text(text):
    words = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

In [98]:
x_train=x_train.apply(lemmatize_text)


# Converting text to Binary bag of words

In [99]:
ct = CountVectorizer(binary=True)

In [100]:
bbow = ct.fit_transform(x_train)

In [101]:
bbow

<4749x7039 sparse matrix of type '<class 'numpy.int64'>'
	with 43482 stored elements in Compressed Sparse Row format>

# Preparing model

In [102]:
br = BernoulliNB()

In [103]:
final_pred = br.fit(bbow,y_train)

In [104]:
final_pred

In [105]:
cv2 = ct.transform(x_test)

In [106]:
y_pred = br.predict(cv2)

In [107]:
accuracy_score(y_test,y_pred)

0.9057239057239057

In [108]:
def pred_y():
    num = int(input())
    print("*"*50)
    predicted_y = final_pred.predict(ct.transform(x_test.iloc[[num]]))
    print(x_test.iloc[num])
    print("*"*50)
    return predicted_y[0]

In [109]:
pred_y()

87
**************************************************
i feel resentful about my education rel bookmark why i feel resentful about my education a class entry author href http liveagainsttheflow
**************************************************


'anger'

# Deployement 

In [113]:
final_model = pickle.dump(final_pred,open(r'C:\Users\srava\Downloads\Emotion_model.pkl','wb'))

In [114]:
model = pickle.load(open(r"C:\Users\srava\Downloads\Emotion_model.pkl",'rb'))

In [116]:
model.predict(ct.transform(x_test.iloc[[76]]))

array(['joy'], dtype='<U5')

In [135]:
def pred():
    text = input('enter a text: ')
    result = model.predict(ct.transform([text]))
    return result[0]

In [136]:
pred()

enter a text: i feel resentful about my education rel bookmark why i feel resentful about my education a class entry author href http liveagainsttheflow


'anger'

In [141]:
x_test.iloc[642]

'i am also in an exciting space i have to admit i am feeling curiously excitedly optimistic about the future'