#Text_Exploration

In [3]:
import pandas as pd

# Load the training data
file_path ='../data/text/training.csv'  # Adjust path if needed
df = pd.read_csv(file_path)  # Adjust path if needed

# Show the first 5 rows
print(df.head())

# Check the number of rows and columns
print("shape")
print(df.shape)


# View column names
print("column")
print(df.columns)

# See a summary of the data
print("Info")
print(df.info())

# Check the distribution of emotion labels
print(df['emotion'].value_counts())


                                                text  emotion
0                            i didnt feel humiliated  sadness
1  i can go from feeling so hopeless to so damned...  sadness
2   im grabbing a minute to post i feel greedy wrong    anger
3  i am ever feeling nostalgic about the fireplac...     love
4                               i am feeling grouchy    anger
shape
(16000, 2)
column
Index(['text', 'emotion'], dtype='object')
Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     16000 non-null  object
 1   emotion  16000 non-null  object
dtypes: object(2)
memory usage: 250.1+ KB
None
emotion
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64


#preprocessing

In [4]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import string

# Download NLTK data (if not already downloaded)
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\raada\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\raada\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\raada\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:

# Load the training data
file_path ='../data/text/training.csv'  # Adjust path if needed
df = pd.read_csv(file_path)

# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing
df['cleaned_text'] = df['text'].apply(preprocess_text)
print(df[['text', 'cleaned_text']].head())

# Save preprocessed data for use in other scripts
df.to_csv('../data/text/cleaned_training.csv', index=False)



                                                text  \
0                            i didnt feel humiliated   
1  i can go from feeling so hopeless to so damned...   
2   im grabbing a minute to post i feel greedy wrong   
3  i am ever feeling nostalgic about the fireplac...   
4                               i am feeling grouchy   

                                        cleaned_text  
0                              didnt feel humiliated  
1  go feeling hopeless damned hopeful around some...  
2          im grabbing minute post feel greedy wrong  
3  ever feeling nostalgic fireplace know still pr...  
4                                    feeling grouchy  


#train_model


In [6]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

# Load the preprocessed data
df = pd.read_csv('../data/text/cleaned_training.csv')

# Convert text to numerical features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned_text'])
y = df['emotion']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


In [8]:
#train Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr))

# Train Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb))


Logistic Regression Classification Report:
              precision    recall  f1-score   support

       anger       0.90      0.80      0.85       427
        fear       0.86      0.74      0.80       397
         joy       0.79      0.96      0.87      1021
        love       0.89      0.56      0.69       296
     sadness       0.89      0.94      0.91       946
    surprise       0.88      0.44      0.59       113

    accuracy                           0.85      3200
   macro avg       0.87      0.74      0.78      3200
weighted avg       0.86      0.85      0.84      3200

Naive Bayes Classification Report:
              precision    recall  f1-score   support

       anger       0.93      0.27      0.42       427
        fear       0.92      0.21      0.34       397
         joy       0.59      0.99      0.74      1021
        love       1.00      0.02      0.04       296
     sadness       0.70      0.93      0.80       946
    surprise       0.00      0.00      0.00       113


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [9]:
import joblib

best_model = lr_model

# Save the best model
joblib.dump(best_model, '../models/text_emotion_model.pkl')

# Save the vectorizer
joblib.dump(vectorizer, '../models/text_vectorizer.pkl')

['../models/text_vectorizer.pkl']

#main

In [10]:
import joblib

model = joblib.load('../models/text_emotion_model.pkl')
vectorizer = joblib.load('../models/text_vectorizer.pkl')  

new_text = "Its a sad sad day"
cleaned_text = preprocess_text(new_text)
X_new = vectorizer.transform([cleaned_text])
prediction = model.predict(X_new)
print(prediction)

['sadness']
