# AI 2024 Online Summer Internship
### Name: Rasikh Ali
### Email: rasikhali1234@gmail.com

<div class="alert alert-block alert-info">
    <h1> Libraries </h1>
</div>

In [1]:
import re
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
import nltk
import joblib
import seaborn as sns
import matplotlib.pyplot as plt

C:\Users\ABC\anaconda3\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
C:\Users\ABC\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


<div class="alert alert-block alert-info">
    <h1> Loading Dataset </h1>
</div>

In [2]:
data = pd.read_csv('sampled_emotion_data.csv')

print("\n\nSentiment Analysis Data:")
print("============\n")
pd.set_option("display.max_rows", None, "display.max_columns", None)
print(f'Sample data count = {len(data)}\n')
print(data.head())
print(data.tail())

# sentiment_type
# 1 is positive
# 0 is positive



Sentiment Analysis Data:

Sample data count = 110

              ID                                              Tweet  anger  \
0  2017-En-10331                        Need a new outlet for #rage      1   
1  2017-En-21898  Arguing with these people doesn't work anyway,...      1   
2  2017-En-10398  When your sister is 19 and throws legitimate t...      1   
3  2017-En-40975  #HRmanagement must discourage the expediency f...      1   
4  2017-En-30811  It is too fucking bright &amp; too fucking hot...      1   

   anticipation  disgust  fear  joy  love  optimism  pessimism  sadness  \
0             0        1     0    0     0         0          0        0   
1             0        1     1    0     0         0          0        0   
2             0        1     0    0     0         0          0        0   
3             0        1     0    0     0         1          0        0   
4             0        1     0    0     0         0          0        0   

   surprise  trust  
0     

<div class="alert alert-block alert-warning">
    <h2> Display columns </h2>
</div>

In [3]:
data.columns

Index(['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy',
       'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
      dtype='object')

<div class="alert alert-block alert-warning">
    <h2> Keeping required columns in dataset </h2>
</div>

In [4]:
data = data.drop(columns=['ID'])

In [5]:
data.columns

Index(['Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
       'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
      dtype='object')

<div class="alert alert-block alert-info">
    <h1> Understand and Pre-process Sample Data </h1>
</div>

<div class="alert alert-block alert-warning">
    <h2>  Download and set stopwords </h2>
    <p> # Ensure you have downloaded the stopwords
<br>import nltk
<br>nltk.download('stopwords')
<br>from nltk.corpus import stopwords

# Set the stopwords for English
stop_words = set(stopwords.words('english'))</p>
</div>

In [6]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ABC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<div class="alert alert-block alert-warning">
    <h2> Define function to clean the text </h2>
</div>

In [7]:
# Function to clean the text
def clean_text(text):
    # Remove symbols and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert text to lowercase
    text = text.lower()
    # Remove stop words
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Clean the 'Tweet' column
data['Tweet'] = data['Tweet'].apply(clean_text)

In [8]:
data.Tweet

0                                   need new outlet rage
1      arguing people doesnt work anyway threaten put...
2      sister throws legitimate temper tantrums get a...
3      hrmanagement must discourage expediency factor...
4                 fucking bright amp fucking hot outside
5      dont join btcare put phone talk rude taking mo...
6      dont offendednim something thatngives life hur...
7      azerbaijan baku azerbaijan prevent another arm...
8      strike upon thee great vengeance furious anger...
9      hey papajohnsuk ive charged credit card order ...
10                  quite sure craig gordons stayed park
11     one month til someones bday think time flaunt ...
12     shriekfest lining volunteers oct serious inqui...
13     makes things easier compact less fiery burdene...
14     stutteringgiant least character fast furious p...
15     rickygervais first time slough checked new sta...
16     ruthwalford may right since year bad events be...
17     new job training much me

<div class="alert alert-block alert-warning">
    <h2> Drop NaN rows </h2>
</div>

In [9]:
# Sample data without Droping Rows with NAN Values
print(len(data))

110


In [10]:
print(data.isna().sum())

Tweet           0
anger           0
anticipation    0
disgust         0
fear            0
joy             0
love            0
optimism        0
pessimism       0
sadness         0
surprise        0
trust           0
dtype: int64


In [11]:
# Drop rows with NaN values in the text column
data = data.dropna(subset=['Tweet'])

In [12]:
# Sample data After Droping Rows with NAN Values
print(len(data))

110


In [13]:
print(data.isna().sum())

Tweet           0
anger           0
anticipation    0
disgust         0
fear            0
joy             0
love            0
optimism        0
pessimism       0
sadness         0
surprise        0
trust           0
dtype: int64


<div class="alert alert-block alert-warning">
    <h2> Data After Processing </h2>
</div>

In [14]:
print("\n\nEmotion Prediction Data After Preprocessing:")
print("=================================================\n")
pd.set_option("display.max_rows", None, "display.max_columns", None)

print(data.head())
print(data.tail())



Emotion Prediction Data After Preprocessing:

                                               Tweet  anger  anticipation  \
0                               need new outlet rage      1             0   
1  arguing people doesnt work anyway threaten put...      1             0   
2  sister throws legitimate temper tantrums get a...      1             0   
3  hrmanagement must discourage expediency factor...      1             0   
4             fucking bright amp fucking hot outside      1             0   

   disgust  fear  joy  love  optimism  pessimism  sadness  surprise  trust  
0        1     0    0     0         0          0        0         0      0  
1        1     1    0     0         0          0        0         0      0  
2        1     0    0     0         0          0        0         0      0  
3        1     0    0     0         1          0        0         0      0  
4        1     0    0     0         0          0        0         0      0  
                           

In [15]:
data

Unnamed: 0,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,need new outlet rage,1,0,1,0,0,0,0,0,0,0,0
1,arguing people doesnt work anyway threaten put...,1,0,1,1,0,0,0,0,0,0,0
2,sister throws legitimate temper tantrums get a...,1,0,1,0,0,0,0,0,0,0,0
3,hrmanagement must discourage expediency factor...,1,0,1,0,0,0,1,0,0,0,0
4,fucking bright amp fucking hot outside,1,0,1,0,0,0,0,0,0,0,0
5,dont join btcare put phone talk rude taking mo...,1,0,1,0,0,0,0,0,0,0,0
6,dont offendednim something thatngives life hur...,1,0,1,0,0,0,0,0,1,0,0
7,azerbaijan baku azerbaijan prevent another arm...,1,0,0,0,0,0,0,0,0,0,0
8,strike upon thee great vengeance furious anger...,1,0,1,0,0,0,0,0,0,0,0
9,hey papajohnsuk ive charged credit card order ...,1,0,1,0,0,0,0,0,1,0,0


<div class="alert alert-block alert-warning">
    <h2> Saving Cleaned Data </h2>
</div>

In [16]:
data.to_csv("cleaned_tweets_emotion_data.csv", index=False)

In [17]:
data.columns

Index(['Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
       'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
      dtype='object')

<div class="alert alert-block alert-info">
    <h1> Splitting into Training and Testing </h1>
</div>

In [18]:
data = pd.read_csv('cleaned_tweets_emotion_data.csv')
X = data['Tweet']
y = data.drop(columns=['Tweet'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
print(len(X_train))
print(len(X_test))

88
22


In [20]:
X_train.head()

65    idinamenzel says shes releasing idinaparty tic...
26    thomeagle help maintain boost status world cla...
22    farting hot car windows dont roll terrible gag...
31    killed spider big sprayed spider guts like hor...
47                                 backed pats pleasing
Name: Tweet, dtype: object

In [21]:
X_test.head()

78                                feel really sad today
10                 quite sure craig gordons stayed park
4                fucking bright amp fucking hot outside
84                    hannah hannah stop mournful chill
64    complained head called despair gods mercy sins...
Name: Tweet, dtype: object

In [22]:
y_train.head()

Unnamed: 0,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
65,0,1,0,0,1,0,1,0,0,0,1
26,1,0,1,0,0,0,0,0,1,0,0
22,1,0,1,0,0,0,0,0,1,0,0
31,1,1,1,1,0,0,0,1,0,0,0
47,0,1,0,0,1,0,1,0,0,0,1


In [23]:
y_test.head()

Unnamed: 0,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
78,0,0,0,0,0,0,0,1,1,0,0
10,0,1,0,0,0,0,0,0,0,0,0
4,1,0,1,0,0,0,0,0,0,0,0
84,0,0,1,0,0,0,0,0,1,0,1
64,0,0,0,0,0,0,1,0,0,0,0


In [24]:
# Check the shapes to ensure correct splits
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (88,)
X_test shape: (22,)
y_train shape: (88, 11)
y_test shape: (22, 11)


<div class="alert alert-block alert-warning">
    <h2> Feature Extraction </h2>
</div>

In [25]:
# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=1000)

In [26]:
# Fit the vectorizer on the training data
vectorizer.fit(X_train)

In [27]:
# Transform the training data
X_train_tfidf = vectorizer.transform(X_train)

In [28]:
# Convert the TF-IDF sparse matrix to a DataFrame
X_train_tfidf_df = pd.DataFrame(X_train_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

print("\nTF-IDF Features DataFrame:")
print(X_train_tfidf_df.head())


TF-IDF Features DataFrame:
   absolutely  acc  accept  act  adorable  adrenaline  adventuretweets  \
0         0.0  0.0     0.0  0.0       0.0         0.0              0.0   
1         0.0  0.0     0.0  0.0       0.0         0.0              0.0   
2         0.0  0.0     0.0  0.0       0.0         0.0              0.0   
3         0.0  0.0     0.0  0.0       0.0         0.0              0.0   
4         0.0  0.0     0.0  0.0       0.0         0.0              0.0   

   afraid  africanamericannnno  afternoon  agreed  aimisyafiqahr     alarm  \
0     0.0                  0.0        0.0     0.0            0.0  0.306051   
1     0.0                  0.0        0.0     0.0            0.0  0.000000   
2     0.0                  0.0        0.0     0.0            0.0  0.000000   
3     0.0                  0.0        0.0     0.0            0.0  0.000000   
4     0.0                  0.0        0.0     0.0            0.0  0.000000   

   alaskagurus  also  alternate  amazing  amerikkka  amp  

In [29]:
# Transform the training data
X_test_tfidf = vectorizer.transform(X_test)

# Convert the TF-IDF sparse matrix to a DataFrame
X_test_tfidf_df = pd.DataFrame(X_test_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

print("\nTF-IDF Features DataFrame:")
print(X_test_tfidf_df.head())


TF-IDF Features DataFrame:
   absolutely  acc  accept  act  adorable  adrenaline  adventuretweets  \
0         0.0  0.0     0.0  0.0       0.0         0.0              0.0   
1         0.0  0.0     0.0  0.0       0.0         0.0              0.0   
2         0.0  0.0     0.0  0.0       0.0         0.0              0.0   
3         0.0  0.0     0.0  0.0       0.0         0.0              0.0   
4         0.0  0.0     0.0  0.0       0.0         0.0              0.0   

   afraid  africanamericannnno  afternoon  agreed  aimisyafiqahr  alarm  \
0     0.0                  0.0        0.0     0.0            0.0    0.0   
1     0.0                  0.0        0.0     0.0            0.0    0.0   
2     0.0                  0.0        0.0     0.0            0.0    0.0   
3     0.0                  0.0        0.0     0.0            0.0    0.0   
4     0.0                  0.0        0.0     0.0            0.0    0.0   

   alaskagurus  also  alternate  amazing  amerikkka       amp  anger  angry 

<div class="alert alert-block alert-info">
    <h1> Training Phase </h1>
</div>

In [30]:
# Training the model
model = MultiOutputClassifier(LogisticRegression())
model.fit(X_train_tfidf, y_train)

In [31]:
# Save the model to disk
joblib.dump(model, 'multi_label_model.pkl')

# Save the vectorizer to disk
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

<div class="alert alert-block alert-info">
    <h1> Testing Phase </h1>
</div>

In [32]:
import joblib

# Load the model from disk
loaded_model = joblib.load('multi_label_model.pkl')

# Load the vectorizer from disk
loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')

print(f"Model loaded from 'multi_label_model.pkl'")

Model loaded from 'multi_label_model.pkl'


In [33]:
# Transform the test data using the loaded vectorizer
X_test_tfidf_loaded = loaded_vectorizer.transform(X_test)

# Evaluate the loaded model
y_pred_loaded = loaded_model.predict(X_test_tfidf_loaded)
accuracy_loaded = accuracy_score(y_test, y_pred_loaded)
report_loaded = classification_report(y_test, y_pred_loaded)

print(f"Accuracy: {accuracy_loaded}")
print("Classification Report:")
print(report_loaded)

Accuracy: 0.0
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.00      0.00      0.00         5
           2       0.00      0.00      0.00         8
           3       0.00      0.00      0.00         4
           4       0.33      0.11      0.17         9
           5       0.00      0.00      0.00         3
           6       0.00      0.00      0.00         7
           7       0.00      0.00      0.00         4
           8       0.00      0.00      0.00         7
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         5

   micro avg       0.25      0.02      0.03        60
   macro avg       0.03      0.01      0.02        60
weighted avg       0.05      0.02      0.03        60
 samples avg       0.05      0.02      0.02        60



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


<div class="alert alert-block alert-info">
    <h1> Application Phase </h1>
</div>

In [34]:
# Take user input
user_input = input("Please enter your text: ").strip()

# Preprocess the user input
def preprocess_user_input(text):
    stop_words = set(stopwords.words('english'))
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

cleaned_input = preprocess_user_input(user_input)

Please enter your text: it was a very bad experience with them


In [35]:
# Transform the cleaned input using the vectorizer
user_input_tfidf = loaded_vectorizer.transform([cleaned_input])

# Predict the emotion of the user input
user_prediction = loaded_model.predict(user_input_tfidf)

# Output the prediction
print(f"The emotion of '{user_input}' is: {user_prediction[0]}")

The emotion of 'it was a very bad experience with them' is: [0 0 0 0 0 0 0 0 0 0 0]
