In [1]:
import json
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
# Read data
# The tweets file is line-delimited JSON: parse each line into a dict and collect
# them in a list. The two CSV files are loaded as DataFrames.
# The encoding is given explicitly because the tweets contain emoji/non-ASCII text and
# the platform default encoding is not guaranteed to be UTF-8.
with open('dm-2024-isa-5810-lab-2-homework/tweets_DM.json', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f]
# No explicit f.close(): the `with` block already closes the file on exit.

emotion = pd.read_csv('dm-2024-isa-5810-lab-2-homework/emotion.csv')
data_identification = pd.read_csv('dm-2024-isa-5810-lab-2-homework/data_identification.csv')

In [None]:
# Flatten the nested `_source` -> `tweet` payload into one column per field, attach the
# train/test marker from data_identification, and keep only the training rows.
df = pd.DataFrame(data)
_source = df['_source'].map(lambda record: record['tweet'])
df = pd.DataFrame({
    # `key=column` binds the current column name at lambda-creation time.
    column: _source.map(lambda tweet, key=column: tweet[key])
    for column in ('tweet_id', 'hashtags', 'text')
})
df = df.merge(data_identification, on='tweet_id', how='left')

train_data = df.loc[df['identification'] == 'train']

In [None]:
# Attach the emotion label to every training tweet, then drop tweets with repeated text.
# keep=False removes *every* row whose text occurs more than once (no copy is retained).
# NOTE(review): the original comment described this step as "removing blank values",
# which is not what drop_duplicates does -- confirm the intended behaviour.
train_data = (
    train_data
    .merge(emotion, on='tweet_id', how='left')
    .drop_duplicates(subset=['text'], keep=False)
)

In [None]:
# Work on a 20% random subsample of the training rows.
# random_state is fixed so the same rows (and therefore the same downstream metrics)
# are obtained on every Restart & Run All; 42 matches the seed used for the train/test split.
train_data_sample = train_data.sample(frac=0.2, random_state=42)

In [6]:
# Display the sampled training rows as the cell output.
train_data_sample

Unnamed: 0,tweet_id,hashtags,text,identification,emotion
426473,0x1f3e97,[],Seriously awesome turn out for a on the fly st...,train,joy
148135,0x1ed774,[],I see god working his ways <LH>,train,joy
965640,0x32f256,[],<LH> is my favourite color 😍 <LH> ♥,train,sadness
575587,0x32e980,[],"Because of your smile, you make life more beau...",train,joy
820293,0x226262,"[DestinysConflict, preorderhardcover]",The twitching has begun. <LH> #DestinysConflic...,train,anticipation
...,...,...,...,...,...
477320,0x37135b,[],Two more from Arthur Lee the magician A house ...,train,joy
274384,0x2de667,[],"Sometimes a good, is equally true that, if kno...",train,joy
544178,0x224f0c,[],When it comes to guy bashing tons of women wil...,train,disgust
1417022,0x200386,[],@TeamTrump Proverbs 16:4 The LORD works out ev...,train,anticipation


In [None]:
# Labels: the emotion column of the sampled rows.
y_train_data = train_data_sample['emotion']
# Features: one string per tweet, made of the tweet text followed by a space and the
# space-joined hashtag list (an empty list contributes an empty string).
hashtag_text = train_data_sample['hashtags'].map(' '.join)
X_train_data = train_data_sample['text'] + ' ' + hashtag_text
X_train_data

426473     Seriously awesome turn out for a on the fly st...
148135                      I see god working his ways <LH> 
965640                  <LH> is my favourite color 😍 <LH> ♥ 
575587     Because of your smile, you make life more beau...
820293     The twitching has begun. <LH> #DestinysConflic...
                                 ...                        
477320     Two more from Arthur Lee the magician A house ...
274384     Sometimes a good, is equally true that, if kno...
544178     When it comes to guy bashing tons of women wil...
1417022    @TeamTrump Proverbs 16:4 The LORD works out ev...
1397254    @kenziebatcho @realDonaldTrump his #heart, “Th...
Length: 289836, dtype: object

In [8]:
# Display the label Series as the cell output.
y_train_data

426473              joy
148135              joy
965640          sadness
575587              joy
820293     anticipation
               ...     
477320              joy
274384              joy
544178          disgust
1417022    anticipation
1397254    anticipation
Name: emotion, Length: 289836, dtype: object

In [None]:
# Keep only the (text, label) pairs whose text actually has content.
filtered_X_train_data = []
filtered_y_train_data = []

# The loop variable is named `label`, not `emotion`, so it does not overwrite the
# `emotion` DataFrame loaded from emotion.csv (re-running the merge cell would break).
for text, label in zip(X_train_data, y_train_data):
    # Each value was built as text + ' ' + hashtags, so it is never the empty string,
    # and a missing value (NaN) is truthy; a bare `if text:` therefore filters nothing.
    # Require a real string with non-whitespace content instead.
    if isinstance(text, str) and text.strip():
        filtered_X_train_data.append(text)
        filtered_y_train_data.append(label)

print(pd.DataFrame(filtered_X_train_data))
print(pd.DataFrame(filtered_y_train_data))

# From here on the features and labels are plain Python lists, aligned by position.
X_train_data = filtered_X_train_data
y_train_data = filtered_y_train_data

                                                        0
0       Seriously awesome turn out for a on the fly st...
1                        I see god working his ways <LH> 
2                    <LH> is my favourite color 😍 <LH> ♥ 
3       Because of your smile, you make life more beau...
4       The twitching has begun. <LH> #DestinysConflic...
...                                                   ...
289831  Two more from Arthur Lee the magician A house ...
289832  Sometimes a good, is equally true that, if kno...
289833  When it comes to guy bashing tons of women wil...
289834  @TeamTrump Proverbs 16:4 The LORD works out ev...
289835  @kenziebatcho @realDonaldTrump his #heart, “Th...

[289836 rows x 1 columns]
                   0
0                joy
1                joy
2            sadness
3                joy
4       anticipation
...              ...
289831           joy
289832           joy
289833       disgust
289834  anticipation
289835  anticipation

[289836 rows x 1 columns]

In [10]:
# Print the filtered label list as a single-column DataFrame.
print(pd.DataFrame(y_train_data))

                   0
0                joy
1                joy
2            sadness
3                joy
4       anticipation
...              ...
289831           joy
289832           joy
289833       disgust
289834  anticipation
289835  anticipation

[289836 rows x 1 columns]


In [None]:
# Split the labelled data into a training part and a held-out test part (test ratio 0.2).
# The split is seeded (random_state=42) and stratified on the labels.
# train_test_split is already imported in the notebook's first cell.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_data,
    y_train_data,
    test_size=0.2,
    random_state=42,
    stratify=y_train_data,
)

In [None]:
# Turn the raw strings into TF-IDF features, capped at 5000 vocabulary terms.
tfidf = TfidfVectorizer(max_features=5000)
# Fit the vocabulary on the training split only and keep the result as a SciPy sparse
# matrix: calling .toarray() here would allocate a dense (n_samples x 5000) float array
# (several GB at this sample size), and the scikit-learn random forest used below
# accepts sparse input directly.
X = tfidf.fit_transform(X_train)
# NOTE: X_test is replaced by its vectorised form, so this cell must not be re-run
# without first re-running the train/test split cell.
X_test = tfidf.transform(X_test)

In [None]:
# Encode the emotion strings as integer class ids. The encoder is fitted on the training
# labels and the same mapping is then applied to the held-out labels.
le = LabelEncoder()
le.fit(y_train)
y = le.transform(y_train)
# NOTE: y_test is replaced by its encoded form, so re-run the split cell before re-running this one.
y_test = le.transform(y_test)

In [None]:
# Train a random-forest classifier on the TF-IDF features.
# random_state pins the bootstrap/feature sampling so the fitted model and the reported
# metrics are reproducible across runs; n_jobs=-1 builds the trees on all available cores.
rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)
# fit() returns the estimator itself, so `model` and `rf_model` refer to the same object.
model = rf_model.fit(X, y)

In [None]:
# Predict encoded class ids for the held-out split (X_test holds TF-IDF features at this point).
y_pred = model.predict(X_test) # predict

In [None]:
# Report held-out metrics, one value per line in this order:
# accuracy, macro recall, macro precision, macro F1.
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

print(accuracy_score(y_test, y_pred))
for macro_metric in (recall_score, precision_score, f1_score):
    print(macro_metric(y_test, y_pred, average='macro'))

0.5178029257521392
0.34494896223037597
0.6594764607895148
0.39697488469335285


In [None]:
# Accuracy on the held-out split, shown as the cell's output value
# (the same figure as the first number printed by the metrics cell).
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.5178029257521392

In [None]:
# Select the rows that data_identification marks as belonging to the test set.
is_test_row = df['identification'] == 'test'
test_data = df.loc[is_test_row]
# Optional debugging dump of the test rows (disabled):
# test_data.to_csv('dm-2024-isa-5810-lab-2-homework/test000.csv', index=False)

In [None]:
# Same preprocessing as for the training set: tweet text, a space, then the
# space-joined hashtag list, giving one string per test tweet.
test_hashtag_text = test_data['hashtags'].map(' '.join)
X_test_data = test_data['text'] + ' ' + test_hashtag_text

In [None]:
# Vectorise the test tweets with the already-fitted TF-IDF vocabulary. The result is kept
# sparse: densifying the full test set with .toarray() would allocate a multi-GB array for
# no benefit, since the random forest's predict() accepts sparse matrices.
# NOTE: X_test_data is replaced by its vectorised form; re-run the preprocessing cell
# before re-running this one.
X_test_data = tfidf.transform(X_test_data)
y_test_pred = model.predict(X_test_data)
# Map the predicted integer class ids back to the original emotion strings.
y_pred_labels = le.inverse_transform(y_test_pred)
# Predictions are positional and in the same row order as test_data, so they line up
# with the tweet ids taken from that frame.
submission = pd.DataFrame({
    'id': test_data['tweet_id'],
    'emotion': y_pred_labels
})
submission.to_csv('dm-2024-isa-5810-lab-2-homework/submission_kaggle_v1.csv', index=False)

In [21]:
# Display the submission table (tweet id and predicted emotion) as the cell output.
submission

Unnamed: 0,id,emotion
2,0x28b412,anticipation
4,0x2de201,anticipation
9,0x218443,joy
30,0x2939d5,joy
33,0x26289a,trust
...,...,...
1867525,0x2913b4,joy
1867529,0x2a980e,joy
1867530,0x316b80,joy
1867531,0x29d0cb,joy
