In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df=pd.read_csv("train.txt",sep=';',header=None,names=['text','emotion'])

In [3]:
df.head(5)

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


# Data Cleaning

In [4]:
df.isnull().sum()

text       0
emotion    0
dtype: int64

In [5]:
unique_emotion=df['emotion'].unique()

In [6]:
emotion_numbers={ }
i=0
for emotion in unique_emotion:
    emotion_numbers[emotion]=i
    i+=1
emotion_numbers    

{'sadness': 0, 'anger': 1, 'love': 2, 'surprise': 3, 'fear': 4, 'joy': 5}

In [7]:
df['emotion_rate']=df['emotion'].map(emotion_numbers)

In [8]:
df.head(5)

Unnamed: 0,text,emotion,emotion_rate
0,i didnt feel humiliated,sadness,0
1,i can go from feeling so hopeless to so damned...,sadness,0
2,im grabbing a minute to post i feel greedy wrong,anger,1
3,i am ever feeling nostalgic about the fireplac...,love,2
4,i am feeling grouchy,anger,1


## Labeling emotions

In [9]:
df_rate=df.drop(columns=['emotion'])

In [10]:
df_rate.head(5)

Unnamed: 0,text,emotion_rate
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


## Lowering case

In [11]:
df_rate['text']=df['text'].apply(lambda x: x.lower())

In [12]:
df_rate.head(3)

Unnamed: 0,text,emotion_rate
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1


## Removing Punctuation

In [13]:
import string
def remove_punctuation(txt):
    return txt.translate(str.maketrans('','',string.punctuation))

In [14]:
df_rate['text']=df['text'].apply(remove_punctuation)

In [15]:
df_rate.head(3)

Unnamed: 0,text,emotion_rate
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1


## Removing digits

In [16]:
def remove_numbers(txt):
  new=""
  for i in txt:
        if not i.isdigit():
            new=new+i
  return new         
   

In [17]:
df_rate['text']=df['text'].apply(remove_numbers)

In [18]:
df_rate.head(3)

Unnamed: 0,text,emotion_rate
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1


## Removing url/links

In [19]:
import re

def remove_urls(text):
    # Regex to match URLs (http, https, www, etc.)
    url_pattern = r'http\S+|www\.\S+'
    # Replace all URLs with an empty string
    return re.sub(url_pattern, '', text)


In [20]:
df_rate['text']=df['text'].apply(remove_urls)

In [21]:
df_rate.head(3)

Unnamed: 0,text,emotion_rate
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1


## Removing HTML tags

In [22]:
import re

def remove_html_tags(text):
    # Pattern to match anything between < and >
    html_pattern = r'<[^>]+>'
    # Replace tags with empty string
    return re.sub(html_pattern, '', text)


In [23]:
df_rate['text']=df['text'].apply(remove_html_tags)

In [24]:
df_rate.head(3)

Unnamed: 0,text,emotion_rate
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1


## Removing emojis

In [25]:
import re

def remove_emojis(text):
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons üòÄ üòÉ üòÑ
        "\U0001F300-\U0001F5FF"  # symbols & pictographs üåÄ üåà üéâ
        "\U0001F680-\U0001F6FF"  # transport & map symbols üöó üöÄ ‚úàÔ∏è
        "\U0001F1E0-\U0001F1FF"  # flags üáÆüá≥ üá∫üá∏
        "\U00002500-\U00002BEF"  # Chinese/Japanese symbols, etc.
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "\U0001f926-\U0001f937"
        "\U00010000-\U0010ffff"
        "\u2640-\u2642" 
        "\u2600-\u2B55"
        "\u200d"
        "\u23cf"
        "\u23e9"
        "\u231a"
        "\ufe0f"  # dingbats
        "\u3030"
        "]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub('', text)

    ## second method

    def emoji_remove(txt):
        new=""
        for i in txt:
           if i.isascii():
               new=new+i
        return new


In [26]:
df_rate['text']=df['text'].apply(remove_emojis)

In [27]:
df_rate.head(3)

Unnamed: 0,text,emotion_rate
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1


## remove stop words

In [28]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def remove_stopwords(text):
    # Tokenize the text into words
    words = word_tokenize(text)
    
    # Get English stopwords (like 'the', 'is', 'in', 'and', etc.)
    stop_words = set(stopwords.words('english'))
    
    # Keep only words that are NOT stopwords
    filtered_words = [word for word in words if word.lower() not in stop_words]
    
    # Join them back into a single string
    return ' '.join(filtered_words)


In [30]:
df_rate['text']=df['text'].apply(remove_stopwords)

In [31]:
df_rate.head(5)

Unnamed: 0,text,emotion_rate
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,1
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,1


## Bag of words

In [39]:
from sklearn.feature_extraction.text import CountVectorizer
documents=[
            "I love pizza",
             "pizza is the best",
               "Pasta is great"
                                 ]
vect=CountVectorizer(ngram_range=(2,2))
x=vect.fit_transform(documents)
vect.get_feature_names_out()

array(['is great', 'is the', 'love pizza', 'pasta is', 'pizza is',
       'the best'], dtype=object)

In [40]:
x.toarray()

array([[0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 1, 1],
       [1, 0, 0, 1, 0, 0]])

In [41]:
vect=CountVectorizer(ngram_range=(3,3))
x=vect.fit_transform(df_rate['text'])
vect.get_feature_names_out()

array(['aa full force', 'aa meeting hear', 'aa meeting today', ...,
       'zumba half hour', 'zumba lame housewife', 'zz top logo'],
      dtype=object)

In [42]:
len(vect.get_feature_names_out())

112264

In [43]:
x.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## TF-IDF

#### Term frequency(counting of each word) - Inverse document frequency

TF X IDF = Term Frequence

TF= (no of occurence of term in document)/(Total numbers of terms in document)

TF:-[0,1]

IDF=(loge(Total no of documents in corpus))/(Number of doc with term in them)

IDF:-[0,more then 1] (rarity)

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print(X.shape)

['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
(4, 9)


In [46]:
X.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

## Project Continue

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['emotion'], test_size=0.20, random_state=42)


## Naive bayes

In [50]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)


nb_model = MultinomialNB()
nb_model.fit(X_train_bow, y_train)


pred_bow = nb_model.predict(X_test_bow)
print(accuracy_score(y_test, pred_bow))

0.7390625


In [51]:
pred_bow


array(['sadness', 'joy', 'sadness', ..., 'joy', 'joy', 'sadness'],
      dtype='<U8')

In [52]:
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [53]:
nb2_model = MultinomialNB()
nb2_model.fit(X_train_tfidf,y_train)

In [54]:
y_pred = nb2_model.predict(X_test_tfidf)


In [55]:
print(accuracy_score(y_test, y_pred))


0.6175


## Logistic regression

In [56]:
from sklearn.linear_model import LogisticRegression


In [57]:
logistic_model = LogisticRegression(max_iter=1000)


In [58]:
logistic_model.fit(X_train_tfidf,y_train)


In [59]:
log_pred = logistic_model.predict(X_test_tfidf)


In [60]:
print(accuracy_score(y_test,log_pred ))


0.84125


In [61]:
logistic_model.fit(X_train_bow,y_train)


In [62]:
log_pred = logistic_model.predict(X_test_bow)


In [63]:
print(accuracy_score(y_test,log_pred ))


0.883125


## SVM

In [67]:
from sklearn.svm import SVC
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_bow, y_train)

In [68]:
y_pred = svm_model.predict(X_test_bow)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8728125


In [69]:
svm_model.fit(X_train_tfidf, y_train)

In [70]:
y_pred = svm_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.88


In [None]:
import joblib
joblib.dump(logistic_model,'LOG_NLP.pkl')
joblib.dump(bow_vectorizer,'bow.pkl')
joblib.dump(X.columns.tolist(),'columns.pkl')