In [3]:
import json
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [4]:
# Read data
# tweets_DM.json is parsed line by line: every line must hold one JSON document.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform default.
with open('dm-2024-isa-5810-lab-2-homework/tweets_DM.json', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f]  # the with-block closes the file on exit

# Two CSV tables; both are joined to the tweets on 'tweet_id' in later cells.
emotion = pd.read_csv('dm-2024-isa-5810-lab-2-homework/emotion.csv')
data_identification = pd.read_csv('dm-2024-isa-5810-lab-2-homework/data_identification.csv')

In [5]:
# Flatten the nested tweet records, attach the split label, and keep the 'train' rows.
# Each raw record stores its payload under record['_source']['tweet'].
_source = pd.DataFrame(data)['_source'].apply(lambda record: record['tweet'])

# One column per field of interest, pulled out of the per-tweet dicts.
df = pd.DataFrame({
    field: _source.apply(lambda tweet, key=field: tweet[key])
    for field in ('tweet_id', 'hashtags', 'text')
})

# A left join keeps every tweet, including any with no identification entry.
df = pd.merge(df, data_identification, on='tweet_id', how='left')

train_data = df.loc[df['identification'].eq('train')]

In [6]:
# Attach the emotion label to each training tweet, then discard repeated texts.
# keep=False drops *every* copy of a text that occurs more than once, not just the extras.
train_data = (
    train_data
    .merge(emotion, on='tweet_id', how='left')
    .drop_duplicates(subset=['text'], keep=False)
)


In [7]:
# Work on a 20% random sample of the training rows.
# A fixed random_state makes the sample (and everything derived from it) reproducible across runs.
train_data_sample = train_data.sample(frac=0.2, random_state=42)

In [8]:
# Display the sampled training rows as the cell output.
train_data_sample

Unnamed: 0,tweet_id,hashtags,text,identification,emotion
1132646,0x1dd6e6,[],@potus Damn those nasty and lazy people of Pue...,train,sadness
122122,0x270931,[],<LH> destroys your joy and peace.,train,anger
613705,0x25d96e,[],im <LH> go see Groundhog Day!!!!! it deserves ...,train,anger
926163,0x22eafd,[],"Music is what I breath, what I love to do. It ...",train,joy
247701,0x1e1b87,[],".@brookesimpson Looking and sounding great, Br...",train,anticipation
...,...,...,...,...,...
566631,0x2a8475,[],<LH> Christ’s love control us... 2 Cor. 5:14,train,anticipation
229858,0x2609e1,[Stupid],@jazznatural @DraftExpress That’s BS. Read his...,train,disgust
482481,0x1d1c96,[],<LH> tweeting smarter politics dangerous game ...,train,trust
385081,0x2a5213,[],"""Joy is our strength and actually energizes us...",train,joy


In [9]:
# Target: the emotion label of each sampled tweet.
y_train_data = train_data_sample['emotion']

# Input: the tweet text, a single space, then the tweet's hashtags joined by spaces.
# (A tweet without hashtags therefore ends with a trailing space.)
X_train_data = train_data_sample['text'] + ' ' + train_data_sample['hashtags'].map(' '.join)
X_train_data

1132646    @potus Damn those nasty and lazy people of Pue...
122122                    <LH> destroys your joy and peace. 
613705     im <LH> go see Groundhog Day!!!!! it deserves ...
926163     Music is what I breath, what I love to do. It ...
247701     .@brookesimpson Looking and sounding great, Br...
                                 ...                        
566631         <LH> Christ’s love control us... 2 Cor. 5:14 
229858     @jazznatural @DraftExpress That’s BS. Read his...
482481     <LH> tweeting smarter politics dangerous game ...
385081     "Joy is our strength and actually energizes us...
6216                           <LH> daughter <LH> Birthday! 
Length: 289836, dtype: object

In [10]:
# Display the label Series as the cell output.
y_train_data

1132646         sadness
122122            anger
613705            anger
926163              joy
247701     anticipation
               ...     
566631     anticipation
229858          disgust
482481            trust
385081              joy
6216                joy
Name: emotion, Length: 289836, dtype: object

In [11]:
# Keep only samples whose combined text has real content.
# - The combined string always contains the joining space, so a plain truthiness check
#   never rejects anything; strip() is needed to detect blank samples.
# - The isinstance check also rejects missing (NaN) text, which is truthy but not a string.
filtered_X_train_data = []
filtered_y_train_data = []

# The loop variable is `label`, not `emotion`, so the `emotion` DataFrame loaded
# earlier in the notebook is not overwritten by a string.
for text, label in zip(X_train_data, y_train_data):
    if isinstance(text, str) and text.strip():
        filtered_X_train_data.append(text)
        filtered_y_train_data.append(label)

print(pd.DataFrame(filtered_X_train_data))
print(pd.DataFrame(filtered_y_train_data))

# From here on the inputs and labels are plain Python lists with matching positions.
X_train_data = filtered_X_train_data
y_train_data = filtered_y_train_data

                                                        0
0       @potus Damn those nasty and lazy people of Pue...
1                      <LH> destroys your joy and peace. 
2       im <LH> go see Groundhog Day!!!!! it deserves ...
3       Music is what I breath, what I love to do. It ...
4       .@brookesimpson Looking and sounding great, Br...
...                                                   ...
289831      <LH> Christ’s love control us... 2 Cor. 5:14 
289832  @jazznatural @DraftExpress That’s BS. Read his...
289833  <LH> tweeting smarter politics dangerous game ...
289834  "Joy is our strength and actually energizes us...
289835                      <LH> daughter <LH> Birthday! 

[289836 rows x 1 columns]
                   0
0            sadness
1              anger
2              anger
3                joy
4       anticipation
...              ...
289831  anticipation
289832       disgust
289833         trust
289834           joy
289835           joy

[289836 rows x 1 columns]

In [12]:
# Print the (filtered) label list as a one-column frame.
label_frame = pd.DataFrame(y_train_data)
print(label_frame)

                   0
0            sadness
1              anger
2              anger
3                joy
4       anticipation
...              ...
289831  anticipation
289832       disgust
289833         trust
289834           joy
289835           joy

[289836 rows x 1 columns]


In [None]:
# Hold out 20% of the samples for validation.
# stratify keeps the emotion class proportions the same in both splits; random_state fixes the shuffle.
# train_test_split is already imported in the notebook's import cell, so it is not re-imported here.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_data,
    y_train_data,
    test_size=0.2,
    random_state=42,
    stratify=y_train_data,
)

In [None]:
# TF-IDF features limited to a 5000-term vocabulary.
# No stop_words argument is passed, so stop words are NOT removed.
tfidf = TfidfVectorizer(max_features=5000)

# Fit the vocabulary on the training split only, then reuse it for validation.
# Both matrices stay sparse: a dense copy via .toarray() would need several GB
# and the random forest accepts sparse input directly.
vec_X = tfidf.fit_transform(X_train)
vec_X_val = tfidf.transform(X_val)

In [None]:
# Map emotion names to integer class ids; the mapping is learned from the training labels.
le = LabelEncoder().fit(y_train)
vec_y = le.transform(y_train)
vec_y_val = le.transform(y_val)

In [16]:
# Random forest on the TF-IDF features.
# n_jobs=-1 builds trees on all CPU cores; random_state makes the forest reproducible.
clf = RandomForestClassifier(n_jobs=-1, random_state=42)

# Fit exactly once, on the vectorized features and encoded labels.
# (Fitting on the raw text lists X_train / y_train cannot work: the model needs numeric input.)
clf.fit(vec_X, vec_y)
model = clf

# Predict on the *vectorized* validation set and decode the integer class ids back to
# emotion names, so y_pred is directly comparable with the string labels in y_val.
y_pred = le.inverse_transform(model.predict(vec_X_val))

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Validation metrics, printed in this order: accuracy, then macro-averaged
# recall, precision and F1.
print(accuracy_score(y_val, y_pred))
for macro_metric in (recall_score, precision_score, f1_score):
    print(macro_metric(y_val, y_pred, average='macro'))

In [None]:
from sklearn.metrics import accuracy_score

# Validation accuracy, shown as the cell's output value.
accuracy_score(y_true=y_val, y_pred=y_pred)

In [None]:
# Rows whose identification is 'test' are the tweets to predict for the submission.
test_data = df.loc[df['identification'].eq('test')]

In [None]:
# Build the test inputs the same way as the training inputs: tweet text, a space,
# then the space-joined hashtags. The test rows carry no emotion label.
X_test_data = test_data['text'] + ' ' + test_data['hashtags'].map(' '.join)

In [None]:
# Vectorize the test text with the vectorizer fitted on the training split.
# The result gets its own name, so X_test_data still holds the raw text and this cell
# can be re-run; it also stays sparse, since a dense copy of the full test matrix
# would be very large and the model predicts on sparse input directly.
vec_X_test = tfidf.transform(X_test_data)

y_test_pred = model.predict(vec_X_test)
y_pred_labels = le.inverse_transform(y_test_pred)  # integer class ids -> emotion names

# One row per test tweet: its id and the predicted emotion.
submission = pd.DataFrame({
    'id': test_data['tweet_id'],
    'emotion': y_pred_labels
})
submission

In [None]:
# Write the predictions to disk without the DataFrame index column.
submission.to_csv(
    path_or_buf='dm-2024-isa-5810-lab-2-homework/submission4.csv',
    index=False,
)