# Workshop 04

- Name: Ran Arino
- Student ID: 153073200
- Email: rarino@myseneca.ca
- Course: Social Media Analytics
- Course ID: BDA600NAA.07578.2241
- Professor: Dr. Pantea Koochemeshkian

In [3]:
import pandas as pd
import numpy as np
import re
import statistics

import emoji
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer


In [4]:
# load dataset

# for training
data = pd.read_excel('data/TweetEmotionDataset.xlsx', header=None)
data = data.rename(columns={0: "tweets", 1: "emotion"})
data.head()

Unnamed: 0,tweets,emotion
0,@ZubairSabirPTI pls dont insult the word 'Molna',anger
1,@ArcticFantasy I would have almost took offens...,anger
2,@IllinoisLoyalty that Rutgers game was an abom...,anger
3,@CozanGaming that's what lisa asked before she...,anger
4,Sometimes I get mad over something so minuscul...,anger


In [5]:
# for testing
testing = pd.read_excel('data/test_dataset.xlsx', header=None)
testing = testing.rename(columns={0: "tweets", 1: "emotion"})
testing.head()

Unnamed: 0,tweets,emotion
0,At the point today where if someone says somet...,?
1,@CorningFootball IT'S GAME DAY!!!! T MIN...,?
2,This game has pissed me off more than any othe...,?
3,@spamvicious I've just found out it's Candice ...,?
4,@moocowward @mrsajhargreaves @Melly77 @GaryBar...,?


In [6]:
# cleaning the texts
def clean_texts(raw_texts: list or np.array):
    # define result
    result = []

    # set of stopwords
    stop_words = set(stopwords.words('english'))
    # initialize tweet tokenizer
    tweet_tokenizer = TweetTokenizer()
    # set the lemmatizer
    lemmatizer = WordNetLemmatizer()

    # function to convert emojis to text
    def convert_emojis(text):
        return emoji.demojize(text, delimiters=("", ""))
    
    # function to get the wordnet pos
    def get_wordnet_pos(tag):
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        else:
            # Default to noun
            return wordnet.NOUN

    # traversing all sentences
    for sent in raw_texts:
        # (1): white space removal
        sent = sent.strip()
        # (2): URL removal
        sent = re.sub(r"http[s]?://[\w?\W?]+", '', sent)
        # (3): HTML tag removal
        sent = re.sub(r'<[^>]+>', '', sent)
        # (4): Repeated words (at least four times)
        sent = re.sub(r'(.)\1{4,}', r'\1', sent)
        # (5): split attached words (at least two characters and follow the capitalized word)
        sent = re.sub(r"([\w]{2,})([A-Z])", r"\1 \2", sent)
        # (6): Punctuation removal
        sent = re.sub(r'[^\w\s]', '', sent)
        # (7): Emoji to text
        sent = convert_emojis(sent)
        # (8): lemmatizaiton & tokenization
        token = tweet_tokenizer.tokenize(sent)
        tagged_token = nltk.pos_tag(token)
        lemma_token = [
            lemmatizer.lemmatize(w[0], get_wordnet_pos(w[1]))
            for w in tagged_token
            # # adjective ('JJ', 'JJR', 'JJS'), noun ('NN', 'NNP'), verb('VB', 'VBD', 'VBG', 'VBN', 'VBP')
            if w[1] in ['JJ', 'JJR', 'JJS', 'NN', 'VB', 'VBD', 'VBG', 'VBN', 'VBP']
        ]
        # defined cleaned sentence
        clean_sent = ''
    
        # cleaning each sentence
        for w in lemma_token:
            # if 'w' is one of stop words, skip to the next word
            if w.lower() in stop_words:
                continue
            # add words
            clean_sent += w.lower() + ' '

        # add clean_sent to result (make sure that the last item is always blank)
        result += [clean_sent[:-1]]

    return result

# add clean text to the dataset
data.loc[:, 'clean_text'] = clean_texts(np.array(data['tweets'].values))
testing.loc[:, 'clean_text'] = clean_texts(np.array(testing['tweets'].values))
data.head()

Unnamed: 0,tweets,emotion,clean_text
0,@ZubairSabirPTI pls dont insult the word 'Molna',anger,pls dont insult word
1,@ArcticFantasy I would have almost took offens...,anger,take offense snap
2,@IllinoisLoyalty that Rutgers game was an abom...,anger,game abomination affront man speak
3,@CozanGaming that's what lisa asked before she...,anger,ask start rag call heh
4,Sometimes I get mad over something so minuscul...,anger,get mad something minuscule try ruin life lose...


In [7]:
# create new columns with binary values
for emotion in np.unique(data['emotion']):
    data[emotion] = data['emotion'].apply(lambda x: 1 if x == emotion else 0)

data.head()

Unnamed: 0,tweets,emotion,clean_text,anger,fear,joy,sadness
0,@ZubairSabirPTI pls dont insult the word 'Molna',anger,pls dont insult word,1,0,0,0
1,@ArcticFantasy I would have almost took offens...,anger,take offense snap,1,0,0,0
2,@IllinoisLoyalty that Rutgers game was an abom...,anger,game abomination affront man speak,1,0,0,0
3,@CozanGaming that's what lisa asked before she...,anger,ask start rag call heh,1,0,0,0
4,Sometimes I get mad over something so minuscul...,anger,get mad something minuscule try ruin life lose...,1,0,0,0


### Model creation

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.utils.class_weight import compute_class_weight
from keras import Sequential
from keras.layers import Dense




In [50]:
# apply tf-idf
# create TF-IDF vectrizer
tfidf_vect = TfidfVectorizer()
# fit and transform data
matrix = tfidf_vect.fit_transform(np.array(data.loc[:, 'clean_text']))

# get the explanatory and target variables for machine learning
X = matrix.toarray()
# assign the emotion that we wanna predict here; in this case "fear"
y = np.array(data['fear'])


print("Explnatory Variable format: ")
print(X[:5])

print("\nTarget Variable format: ")
print(y[:5])


Explnatory Variable format: 
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

Target Variable format: 
[0 0 0 0 0]


In [51]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state=10)

# class weights
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weight_dict = {k: v for k, v in zip(classes, class_weights)}

#### Neural Network

In [54]:
# Neural Network Model
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid')) 
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train the model
model.fit(X_train, y_train, epochs=15, batch_size=16, validation_data=(X_test, y_test), class_weight=class_weight_dict)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x22a8b340390>

In [55]:
y_pred = (model.predict(X_test) >= 0.5).astype(int).flatten()
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.92      0.91        48
           1       0.81      0.77      0.79        22

    accuracy                           0.87        70
   macro avg       0.85      0.84      0.85        70
weighted avg       0.87      0.87      0.87        70



In [63]:
"""# save model
import joblib
joblib.dump(model, 'WS04_Neuraletwork.joblib')"""

['WS04_Neuraletwork.joblib']

In [68]:
# apply the model for testing data
testing_matrix = tfidf_vect.transform(np.array(testing.loc[:, 'clean_text']))
testing_data = testing_matrix.toarray()

# generate prediction
y_pred_testing = (model.predict(testing_data) >= 0.5).astype(int).flatten()
y_pred_testing



array([0, 0, 0, ..., 1, 1, 0])

In [76]:
# writing down the predicted label in TXT format

with open("WS04_prediction.txt", 'w') as f:
    f.write("\n".join(np.where(y_pred_testing == 1, 'fear', 'other')))
