In [1]:
import pandas as pd
import nltk
import re
import string

**nltk (Natural Language Toolkit):** A popular library for natural language processing (NLP) tasks, such as tokenization, stemming, lemmatization, and more.
If you're planning to use it, ensure that the necessary data packages (e.g., stopwords, tokenizers) are downloaded using nltk.download().

**re (Regular Expressions):** Provides powerful pattern-matching capabilities for text processing.
Commonly used for tasks like searching, replacing, or validating patterns in text.

**string:** Contains constants and utilities related to string operations, such as string.punctuation (a string of ASCII punctuation characters) or string.ascii_letters.

In [2]:
# Read the semicolon-delimited training file. It has no header row, so pandas
# labels the columns with the integers 0 and 1.
df = pd.read_csv(
    r'/content/train (1).csv',
    header=None,
    sep=';',
)
df.head()

Unnamed: 0,0,1
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [3]:
# Print column dtypes, non-null counts and memory usage for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       16000 non-null  object
 1   1       16000 non-null  object
dtypes: object(2)
memory usage: 250.1+ KB


In [4]:
# Count missing values in each column.
df.isna().sum()

Unnamed: 0,0
0,0
1,0


In [5]:
# Replace the positional column labels with descriptive names.
column_names = {0: 'Text', 1: 'Sentiments'}
df = df.rename(columns=column_names)
df.head()

Unnamed: 0,Text,Sentiments
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [6]:
# Number of rows per label; used to check how balanced the classes are.
df['Sentiments'].value_counts()

Unnamed: 0_level_0,count
Sentiments,Unnamed: 1_level_1
joy,5362
sadness,4666
anger,2159
fear,1937
love,1304
surprise,572


In [7]:
# Fetch the NLTK data packages this notebook relies on.
# NOTE(review): this notebook reports NLTK 3.9.1; recent NLTK releases look up
# the 'punkt_tab' resource for word_tokenize instead of 'punkt' — confirm which
# one is needed if tokenization with word_tokenize is added.
nltk.download('punkt')
# English stopword list, used when filtering tokens during cleaning.
nltk.download('stopwords')
# WordNet data required by WordNetLemmatizer.
nltk.download('wordnet')
# Open Multilingual WordNet data that accompanies the WordNet corpus.
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

**nltk.download('punkt'):** Downloads the Punkt tokenizer model, which is used for sentence tokenization and word tokenization.
**nltk.download('stopwords'):** Downloads a list of common stopwords (e.g., "and," "the," "is") for various languages. These words are often removed during text preprocessing.
**nltk.download('wordnet'):** Downloads the WordNet lexical database, which is used for tasks like finding synonyms, antonyms, hypernyms, and hyponyms.
**nltk.download('omw-1.4'):** Downloads the Open Multilingual WordNet (version 1.4), which provides multilingual support for WordNet.

In [8]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [9]:
# Text normalisation for the 'Text' column:
#   1. replace every non-letter character with a space and lower-case,
#   2. split on whitespace,
#   3. drop English stopwords and lemmatize the remaining tokens,
#   4. join the tokens back into one string per row, stored in df['lemma'].
lemma = WordNetLemmatizer()

# Build the stopword set once. Rebuilding it inside the per-token filter (one
# set construction for every word of every row) dominates the run time.
stop_words = frozenset(stopwords.words('english'))


def lemmatize_tokens(words):
    """Lemmatize ``words`` after removing English stopwords.

    Args:
        words: Iterable of lower-cased token strings for a single document.

    Returns:
        str: The lemmatized, stopword-free tokens joined by single spaces
        (an empty string when no token survives the filter).
    """
    return ' '.join(
        lemma.lemmatize(word) for word in words if word not in stop_words
    )


# Keep the tokenised text in a local Series rather than a temporary column,
# so nothing has to be dropped from ``df`` afterwards.
tokens = (
    df['Text']
    .str.replace('[^a-zA-Z]', ' ', regex=True)
    .str.lower()
    .str.split()
)
df['lemma'] = tokens.apply(lemmatize_tokens)

In [10]:
# Preview the frame to check the cleaned text in the 'lemma' column.
df.head()

Unnamed: 0,Text,Sentiments,lemma
0,i didnt feel humiliated,sadness,didnt feel humiliated
1,i can go from feeling so hopeless to so damned...,sadness,go feeling hopeless damned hopeful around some...
2,im grabbing a minute to post i feel greedy wrong,anger,im grabbing minute post feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...,love,ever feeling nostalgic fireplace know still pr...
4,i am feeling grouchy,anger,feeling grouchy


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

**TfidfVectorizer from sklearn.feature_extraction.text**

**Purpose:** Converts a collection of text documents into a matrix of Term Frequency-Inverse Document Frequency (TF-IDF) features.

**Common Usage:** Transform textual data into numerical vectors that machine learning models can process.
Helps capture the importance of words relative to a document and the entire corpus.

In [12]:
# Turn the lemmatized text into a dense TF-IDF matrix limited to 500 terms.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so the vocabulary and IDF weights also see the evaluation rows.
# Consider fitting on the training rows only.
tfidf = TfidfVectorizer(stop_words='english', max_features=500)
tfidf_matrix = tfidf.fit_transform(df['lemma'])
x = tfidf_matrix.toarray()

In [13]:
# Wrap the TF-IDF matrix in a DataFrame and append the label column
# (pandas aligns the labels to the feature rows by index).
xx = pd.DataFrame(x).assign(Sentiments=df['Sentiments'])

In [14]:
# Preview the feature frame: one column per TF-IDF term plus the label.
xx.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,Sentiments
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,sadness
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,sadness
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.491894,0.0,0.0,0.0,0.0,0.0,anger
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,love
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,anger


In [15]:
# Separate the label from the TF-IDF features and hold out 20% of the rows
# for evaluation; the fixed seed keeps the split repeatable.
y = xx['Sentiments']
features = xx.drop(columns='Sentiments')
x_train, x_test, y_train, y_test = train_test_split(
    features,
    y,
    random_state=42,
    test_size=0.2,
)

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
# Fit a logistic-regression classifier with default settings on the training rows.
model = LogisticRegression().fit(x_train, y_train)

# Predict labels for the held-out rows.
y_pred = model.predict(x_test)

# Report overall accuracy and the per-class precision / recall / F1 table.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

Accuracy: 0.7446875

Classification Report:
               precision    recall  f1-score   support

       anger       0.86      0.64      0.73       427
        fear       0.82      0.63      0.71       397
         joy       0.67      0.88      0.76      1021
        love       0.78      0.54      0.64       296
     sadness       0.77      0.78      0.77       946
    surprise       0.82      0.59      0.69       113

    accuracy                           0.74      3200
   macro avg       0.79      0.68      0.72      3200
weighted avg       0.76      0.74      0.74      3200



In [17]:
# Show the installed pandas version for this run.
pd.__version__

'2.2.2'

In [18]:
# Show the installed NLTK version for this run.
nltk.__version__

'3.9.1'

In [19]:
# Version string exposed by the standard-library `re` module itself
# (this is not the Python interpreter version).
re.__version__

'2.2.1'

In [20]:
import pickle

In [21]:
# Serialize the fitted TF-IDF vectorizer to TFV.pkl.
with open('TFV.pkl', mode='wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

In [22]:
# Serialize the WordNetLemmatizer instance to lem.pkl.
with open('lem.pkl', mode='wb') as lemmatizer_file:
    pickle.dump(lemma, lemmatizer_file)

In [23]:
# Serialize the trained classifier to model.pkl.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)

In [24]:
import sklearn

In [25]:
# Show the installed scikit-learn version for this run.
sklearn.__version__

'1.6.0'

In [26]:
# Preview the training features; the row labels are the original (shuffled) indices.
x_train.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
676,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12123,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Preview the raw, uncleaned text column.
df['Text'].head()

Unnamed: 0,Text
0,i didnt feel humiliated
1,i can go from feeling so hopeless to so damned...
2,im grabbing a minute to post i feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...
4,i am feeling grouchy


In [28]:
# Inspect the rows whose label is 'love'.
df.loc[df['Sentiments'].eq('love')]

Unnamed: 0,Text,Sentiments,lemma
3,i am ever feeling nostalgic about the fireplac...,love,ever feeling nostalgic fireplace know still pr...
9,i feel romantic too,love,feel romantic
47,i can t let go of that sad feeling that i want...,love,let go sad feeling want accepted first home mine
61,i ate i could feel a gentle tingle throughout ...,love,ate could feel gentle tingle throughout almost...
68,i suppose my own truth needs to be shared i ha...,love,suppose truth need shared havent feeling faith...
...,...,...,...
15936,im better than the rest of you feeling but a f...,love,im better rest feeling feeling accepted
15958,i feel there is going to be a sequel and i wou...,love,feel going sequel would liked closure book ending
15963,i love what i do and i feel so blessed and luc...,love,love feel blessed lucky able travel creative m...
15971,i feel it would not be loving of me not warn y...,love,feel would loving warn impending social crisis...
