In [1]:
import nltk
import spacy
from nltk.corpus import wordnet
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/ruletka/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ruletka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ruletka/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
stop_words = set(stopwords.words('english'))
stop_words.remove("not")
lemmatizer = WordNetLemmatizer()

def get_synonyms_and_antonyms(word):
    synonyms = []
    antonyms = []

    for syn in wordnet.synsets(word):
        for l in syn.lemmas():
            synonyms.append(l.name())
            if l.antonyms():
                antonyms.append(l.antonyms()[0].name())
    
    return set(synonyms), set(antonyms)


def preprocess_text(text):
    #words = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(text) if word.isalnum() and word.lower() not in stop_words]
    #words = text.split(" ")
    words = []
    doc = nlp(text)

    for tok in doc:
        pos = spacy.explain(tok.pos_)
        dep = tok.dep_

        if pos == "noun" or pos == "verb" or pos == "adjective" or pos == "adverb" or dep == "neg":
            words.append(tok.text)

    '''actual_words = []

    for i in range(len(words) - 1):
        word = None
        if words[i] == "not":
            _, ants = get_synonyms_and_antonyms(words[i + 1])
            ants_as_list = list(ants)

            #word = "not " + words[i + 1]
            if len(ants_as_list) > 0:
                word = list(ants)[0]
            else:
                word = "not " + words[i + 1]
        elif i > 0 and words[i - 1] == "not":
            continue
        else:
            word = words[i]
        
        actual_words.append(word)
    actual_words.append(words[-1])'''

    #return ' '.join(words)
    return words

In [3]:
preprocess_text("im not sure which adventure made me feel more nervous")

['m', 'not', 'sure', 'adventure', 'made', 'feel', 'more', 'nervous']

In [4]:
value_counts = {'sadness': 581, 'joy': 695, 'love': 159, 'anger': 275, 'fear': 224, 'surprise': 66}

In [68]:
class GraphClassifier:
    def __init__(self):
        self.triplets = []
        self.labels = None
        self.labels_count = 0
        self.triplets_count = {}
        self.graph = {}
    

    def add_triplet(self, source, relation, target, weight):
        #if [source, relation, target] not in self.triplets:
        #if source != target:
        if self.triplets_count.get((source, relation, target)) is None:
            self.triplets_count[(source, relation, target)] = 1 / weight
        else:
            self.triplets_count[(source, relation, target)] += 1 / weight
        self.triplets.append((source, relation, target))
        #print((source, relation, target))

    
    def fit(self, data, labels, unique_labels, data_size):
        self.labels_count = len(unique_labels)
        self.labels = unique_labels

        for i in range(len(data)):
            words = preprocess_text(data[i])

            for j in range(len(words)):
                self.add_triplet(words[j], "expresses", labels[i], 1)

                for k in range(len(words)):
                    if k != j:
                        self.add_triplet(words[j], "means " + labels[i] + " with", words[k], 1)
                        self.add_triplet(words[k], "means " + labels[i] + " with", words[j], 1)

                '''for syn in iter(synonyms):
                    self.add_triplet(w, "synonymOf", syn, 1)
                    self.add_triplet(syn, "synonymOf", w, 1)
                
                for ant in iter(antonyms):
                    self.add_triplet(w, "antonymOf", ant, 1)
                    self.add_triplet(ant, "antonymOf", w, 1)'''
        
        unique_triplets = set(self.triplets)
        actual_triplets = []
        for tr in iter(unique_triplets):
            if self.graph.get(tr[0]) == None:
                self.graph[tr[0]] = {}
            
            if self.graph[tr[0]].get(tr[1]) == None:
                self.graph[tr[0]][tr[1]] = {}
            
            if self.graph[tr[0]][tr[1]].get(tr[2]) == None:
                self.graph[tr[0]][tr[1]][tr[2]] = self.triplets_count[tr]
            else:
                self.graph[tr[0]][tr[1]][tr[2]] += self.triplets_count[tr]
            
            #new_tripl = (tr[0], tr[1], tr[2], self.triplets_count[tr])
            #actual_triplets.append(new_tripl)
            '''if tr[1] == "expresses":
                new_tripl = (tr[0], tr[1], tr[2], self.triplets_count[tr])
                actual_triplets.append(new_tripl)
            else:
                actual_triplets.append((tr[0], tr[1], tr[2], 0))'''
        
        self.triplets = actual_triplets
    

    def get_word2word_score(self, word1_idx, words, emotion_idx):
        all_emos = [0] * len(self.labels)
        
        for k in range(len(self.labels)):
            res = []

            for j in range(len(words)):
                if j != word1_idx and self.graph.get(words[word1_idx]) != None and self.graph[words[word1_idx]].get("means " + self.labels[emotion_idx] + " with") != None and self.graph[words[word1_idx]]["means " + self.labels[emotion_idx] + " with"].get(words[j]) != None:
                    val = self.graph[words[word1_idx]]["means " + self.labels[emotion_idx] + " with"][words[j]]
                    #val = sum([t[3] for t in self.triplets if t[0] == words[word1_idx] and t[1] == "means " + self.labels[emotion_idx] + " with" and t[2] ==words[j]])
                    res.append(val)
            
            if len(res) == 0:
                all_emos[k] = 0.0
            else:
                all_emos[k] = sum(res) / len(res)
        
        if sum(all_emos) == 0:
            return 0.0
        
        return all_emos[emotion_idx] / sum(all_emos)


    def get_score(self, word, emotion):
        occurances = sum([self.graph[word]["expresses"][emo] for emo in self.labels if self.graph.get(word) != None and self.graph[word].get("expresses") != None and self.graph[word]["expresses"].get(emo) != None])
        emotion_word_occurances = self.graph[word]["expresses"][emotion] if self.graph.get(word) != None and self.graph[word].get("expresses") != None and self.graph[word]["expresses"].get(emotion) != None else 0
        #emotion_occurances = len([t for t in self.triplets if t[1] == "expresses" and t[2] == emotion])

        if occurances == 0:
            return 0.0

        #return emotion_word_occurances / occurances + (emotion_word_occurances / emotion_occurances)
        return emotion_word_occurances / occurances


    def predict(self, data):
        emotions_count = [0] * self.labels_count

        words = preprocess_text(data)

        for i in range(len(words)):
            for j in range(len(self.labels)):
                emotion_score = self.get_score(words[i], self.labels[j])
                word2word_score = self.get_word2word_score(i, words, j)
                val = word2word_score + emotion_score
                emotions_count[j] += val
                #res, last_word, _ = self.path_exists(word, self.labels[i], ns)
                #if res == True:
                #    emotions_count[i] += self.get_score(word, self.labels[i])
        
        ind = 0
        max_element = emotions_count[0]

        for i in range (1,len(emotions_count)): #iterate over array
            if emotions_count[i] > max_element: #to check max value
                max_element = emotions_count[i]
                ind = i
        
        return self.labels[ind]

In [4]:
import numpy as np # linear algebra
import pandas as pd

In [5]:
data = pd.read_parquet('train-00000-of-00001.parquet', engine='fastparquet')
#data = pd.read_csv('test.csv')
print(data.head())

                                                text  label
0  i feel awful about it too because it s my job ...      0
1                              im alone i feel awful      0
2  ive probably mentioned this before but i reall...      1
3           i was feeling a little low few days back      0
4  i beleive that i am much more sensitive to oth...      2


In [6]:
emotion_map = {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
data['emotion'] = data['label'].map(emotion_map)

# Preview the updated dataframe
print(data[['text', 'emotion']].head())

                                                text  emotion
0  i feel awful about it too because it s my job ...  sadness
1                              im alone i feel awful  sadness
2  ive probably mentioned this before but i reall...      joy
3           i was feeling a little low few days back  sadness
4  i beleive that i am much more sensitive to oth...     love


In [9]:
data['label'].value_counts()

label
1    141067
0    121187
3     57317
4     47712
2     34554
5     14972
Name: count, dtype: int64

In [8]:
unique_labels = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

In [9]:
def create_uniform_sample(data, labels, num):
    dfs = []

    for l in labels:
        emotion_df = data.loc[(data['emotion'] == l)]
        emotion_df = emotion_df.sample(n=num)
        dfs.append(emotion_df)
    
    overall_df = pd.concat(dfs, axis=0)
    overall_df = overall_df.sample(frac=1)

    return overall_df

overall_df = create_uniform_sample(data, unique_labels, 14972)
print(overall_df)
print(overall_df['emotion'].value_counts())

                                                     text  label   emotion
245436  i feel you ll be surprised at how fruit can su...      5  surprise
214306  i got started where i feel most creative and w...      1       joy
177618  i get him to commit if i feel not being faithf...      2      love
200699  i feel cranky or in a bad mood i have the powe...      3     anger
394991           i feel so lucky to have found each other      1       joy
...                                                   ...    ...       ...
145179  i feel vaguely wronged that i probably wont be...      3     anger
256472  i truly feel either as every nerves in my body...      5  surprise
233393  i feel well placed to be able to explain as be...      1       joy
169004        when they threw my holder down on the floor      3     anger
180939  i complain i feel terrible i dont feel worthy ...      0   sadness

[89832 rows x 3 columns]
surprise    14972
joy         14972
love        14972
anger       14972
sa

In [10]:
X = list(overall_df['text'])
y = list(overall_df['emotion'])
y_numeric = list(overall_df['label'])

In [140]:
X

['i feel bitter that organic whole food choices are so difficult for folks who are low income',
 'i feel that longing for one more little person in our fold',
 'i are so excited and feel very blessed lucky what have you',
 'i was feeling weird the other day and it went away about minutes after i took my metformin',
 'i feel useless also cos alot ppl ask me how',
 'i feel it makes them complacent of improving because i just labeled them',
 'i could use the white side of some pretty designed security envelopes a brown paper bag or if i was feeling more delicate than i generally am an old sewing pattern',
 'i put it all back on the very back of my mind kept on traveling making my own plans meeting guys mostly just for fun wondering if i d ever feel impressed by anyone else again and further more if this person worthy of my super selective impressiveness level would ever even look at someone like me',
 'i feel like mitchell and i will never agree on a name for this sweet boy',
 'i will rem

In [11]:
from sklearn.model_selection import train_test_split

data_size = len(X)
train_size = int(data_size * 0.8)
test_size = data_size - train_size

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#X_train = X[:train_size]
#X_test = X[train_size:data_size]
#y_train = y[:train_size]
#y_test = y[train_size:data_size]
#y_numeric_test = y_numeric[train_size:data_size]

In [69]:
clf = GraphClassifier()
clf.fit(X_train, y_train, unique_labels, len(X))

In [None]:
len(X_train)

66582

In [52]:
from tqdm import tqdm

In [70]:
def accuracy_score(classifier, data, y_vals):
    correct_count = 0

    for i in range(len(data)):
        res = classifier.predict(data[i])
        if res == y_vals[i]:
            correct_count += 1
        print(data[i] + " |", y_vals[i] + " |", res)
    
    return correct_count / len(data)

In [15]:
unique_labels

['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

In [192]:
syn, ant = get_synonyms_and_antonyms("come")
print(ant)

{'go', 'leave'}


In [184]:
preprocess_text("i do not come for a long time i feel a longing and need to come")

['go', 'long', 'time', 'feel', 'longing', 'need', 'come']

In [191]:
[clf.get_score("come", s) for s in unique_labels]

[0.3333333333333333,
 0.3333333333333333,
 0.1111111111111111,
 0.0,
 0.0,
 0.2222222222222222]

In [54]:
idx = 3
print(X_test[idx], y_test[idx])
print(clf.predict(X_test[idx]))

i can calm down about feeling funny i can start figuring out the purpose that my self destructive behaviors serve for me surprise
surprise


In [18]:
preprocess_text("i feel so reluctant n lazy to s")

['feel', 'so', 'reluctant', 'lazy']

In [34]:
[clf.get_score('lazy', emo) for emo in unique_labels]

[0.2878787878787879,
 0.16666666666666666,
 0.09090909090909091,
 0.22727272727272727,
 0.13636363636363635,
 0.09090909090909091]

In [63]:
clf.get_word2word_score(0, ['not', 'feel', 'so', 'reluctant', 'lazy'], 3)

604.0

In [73]:
accuracy_score(clf, X_test, y_test)

i feel so reluctant n lazy to s | fear | fear
i was feeling frightened and alone at the time and it seemed to me so strongly yesterday that being granted membership being accepted within this group gave me the confidence i needed to decide finally to go for transition at a time i felt so frightened and alone | fear | fear
i was feeling distracted by the chaos in my flat caused by the arrival of my new cooker at the weekend which was sitting in the living room until it could be installed later in the week | anger | surprise
i can calm down about feeling funny i can start figuring out the purpose that my self destructive behaviors serve for me | surprise | anger
i keep seeing swans showing up more and more in design lately and i have a feeling that my beloved owl is about to be dethroned by these giant white monsters | love | love
i feel the gentle waves of peace flowing across my mind | love | love
i think the honey floral notes and round mouth feel would only enhance the mellow creamin

0.8228975343685646

In [180]:
preprocess_text("i came away feeling more loved and cherished than i have for a very very long time")

['came',
 'away',
 'feeling',
 'more',
 'loved',
 'cherished',
 'have',
 'very',
 'very',
 'long',
 'time']

In [239]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Splitting the data
X = overall_df['text']  # Feature: tweet text
y = overall_df['label']  # Target: emotion labels (0-5)

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorizing the text data using TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=1000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test[:100])

In [85]:
vectorizer.get_feature_names_out()

array(['able', 'about', 'accept', 'acceptable', 'accepted', 'actually',
       'add', 'admit', 'afraid', 'after', 'afterwards', 'again',
       'against', 'age', 'agitated', 'ago', 'ahead', 'all', 'almost',
       'alone', 'along', 'alot', 'already', 'also', 'although', 'always',
       'am', 'amazed', 'amazing', 'among', 'amount', 'amp', 'an', 'and',
       'anger', 'angry', 'announcements', 'annoyed', 'another', 'answer',
       'anxiety', 'anxious', 'any', 'anymore', 'anyone', 'anything',
       'apprehensive', 'are', 'arent', 'around', 'art', 'as', 'ask',
       'asked', 'asking', 'aspects', 'at', 'away', 'awful', 'baby',
       'back', 'bad', 'be', 'became', 'because', 'become', 'bed', 'been',
       'before', 'began', 'begin', 'beginning', 'behind', 'being',
       'believe', 'belly', 'beloved', 'best', 'betrayed', 'better',
       'between', 'beyond', 'big', 'bills', 'birthday', 'bit', 'bitch',
       'bitchy', 'bitter', 'black', 'blank', 'blessed', 'blog', 'blood',
       'body

In [241]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Train a Multinomial Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train_vectorized, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test_vectorized)

# Evaluate the model
accuracy = accuracy_score(y_test[:100], y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Display the classification report
print(classification_report(y_test[:100], y_pred, target_names=emotion_map.values()))

Accuracy: 0.81
              precision    recall  f1-score   support

     sadness       0.91      0.67      0.77        15
         joy       0.57      0.57      0.57        14
        love       0.70      0.70      0.70        10
       anger       0.95      0.95      0.95        19
        fear       0.83      0.90      0.86        21
    surprise       0.83      0.90      0.86        21

    accuracy                           0.81       100
   macro avg       0.80      0.78      0.79       100
weighted avg       0.81      0.81      0.81       100

