In [1]:
import pandas as pd
import numpy as np
from pandas import json_normalize
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
import keras
from keras.models import Model
from keras.layers import Input, Dense
from keras.layers import ReLU, Softmax
from keras.callbacks import CSVLogger
import gensim
from gensim.models import KeyedVectors
import warnings
warnings.filterwarnings('ignore')
import re



[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Data

In [2]:
# Load the three raw inputs: the train/test assignment table, the emotion
# labels, and the tweets themselves (one JSON record per line).
data_identification = pd.read_csv(
    "/kaggle/input/dm2023-lab2-data/data_identification.csv"
)
emotion = pd.read_csv(
    "/kaggle/input/dm2023-lab2-data/emotion.csv"
)
tweets_data = pd.read_json(
    "/kaggle/input/dm2023-lab2-data/tweets_DM.json",
    lines=True,
)

In [3]:
# Collect the tweet ids assigned to the train split and to the test split
is_train = data_identification['identification'] == 'train'
is_test = data_identification['identification'] == 'test'
train_id = data_identification.loc[is_train, 'tweet_id'].tolist()
test_id = data_identification.loc[is_test, 'tweet_id'].tolist()

In [4]:
# Flatten the nested `_source` records into their own columns, append them to
# the tweet frame, then discard the `_source`, `_type` and `_index` columns.
source = json_normalize(tweets_data['_source'])
tweets_data = pd.concat([tweets_data, source], axis=1).drop(
    columns=['_source', '_type', '_index']
)

In [5]:
# Split the tweets into a train frame and a test frame by tweet id
tweet_id_col = tweets_data['tweet.tweet_id']
train_data = tweets_data.loc[tweet_id_col.isin(train_id)]
test_data = tweets_data.loc[tweet_id_col.isin(test_id)]

In [6]:
# Join the training tweets with their emotion labels (inner join on tweet id)
merged_train_data = train_data.merge(
    emotion,
    left_on='tweet.tweet_id',
    right_on='tweet_id',
)

In [7]:
# Drop the duplicate id column brought in from `emotion` by the merge;
# 'tweet.tweet_id' remains as the id column.
merged_train_data = merged_train_data.drop(columns='tweet_id')

In [8]:
# Work on independent copies so the columns added later do not touch the
# source frames.
data = merged_train_data.copy(deep=True)  # training data
testdata = test_data.copy(deep=True)  # testing data

In [9]:
# English stop words from NLTK, held in a set for fast membership checks in clean_text
stop_words = set(stopwords.words('english'))

# Text cleaning
def clean_text(text):
    """Normalise a raw tweet into a space-separated string of lowercase tokens.

    Steps, in order: remove every character that is neither a word character
    nor whitespace, lowercase the result, remove runs of digits, tokenize with
    ``word_tokenize``, and drop every token found in ``stop_words``.

    Note: punctuation is removed before the stop-word filter runs, so a
    contraction such as "I'm" reaches the filter as "im".

    Args:
        text: The raw tweet text.

    Returns:
        The cleaned text, or an empty string when ``text`` is not a ``str``.
    """
    # Guard clause: anything that is not a string yields an empty string
    if not isinstance(text, str):
        return ''

    # Strip punctuation and special characters
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    # Lowercase
    lowered = without_punctuation.lower()
    # Strip digits
    without_digits = re.sub(r'\d+', '', lowered)
    # Tokenize and filter out stop words
    kept_tokens = [
        token
        for token in word_tokenize(without_digits)
        if token not in stop_words
    ]
    return ' '.join(kept_tokens)

# Add the cleaned-text column to both the training frame and the testing frame
for frame in (data, testdata):
    frame['clean_text'] = frame['tweet.text'].apply(clean_text)

In [10]:
# Inspect the training frame, which now includes the clean_text column
data

Unnamed: 0,_score,_crawldate,tweet.hashtags,tweet.tweet_id,tweet.text,emotion,clean_text
0,391,2015-05-23 11:42:47,[Snapchat],0x376b20,"People who post ""add me on #Snapchat"" must be ...",anticipation,people post add snapchat must dehydrated cuz m...
1,433,2016-01-28 04:52:09,"[freepress, TrumpLegacy, CNN]",0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",sadness,brianklaas see trump dangerous freepress aroun...
2,376,2016-01-24 23:53:05,[],0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,fear,issa stalking tasha lh
3,120,2015-06-11 04:44:05,"[authentic, LaughOutLoud]",0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,joy,riskshow thekevinallison thx best time tonight...
4,1021,2015-08-18 02:30:07,[],0x2c91a8,Still waiting on those supplies Liscus. <LH>,anticipation,still waiting supplies liscus lh
...,...,...,...,...,...,...,...
1455558,94,2016-12-26 02:44:07,"[NoWonder, Happy]",0x321566,I'm SO HAPPY!!! #NoWonder the name of this sho...,joy,im happy nowonder name show happy happysyfy sy...
1455559,627,2015-04-01 08:14:56,[],0x38959e,In every circumtance I'd like to be thankful t...,joy,every circumtance id like thankful almighty je...
1455560,274,2016-11-17 23:46:22,[blessyou],0x2cbca6,there's currently two girls walking around the...,joy,theres currently two girls walking around libr...
1455561,840,2016-09-02 14:25:06,[],0x24faed,"Ah, corporate life, where you can date <LH> us...",joy,ah corporate life date lh using relative anach...


In [11]:
# Hold out 20% of the labelled tweets for validation; the fixed random_state
# makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    data['clean_text'],
    data['emotion'],
    test_size=0.2,
    random_state=42,
)

In [12]:
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

# Seed Python, NumPy and TensorFlow so that runs are more repeatable (the
# classification head is randomly initialised and fit() shuffles the data).
tf.keras.utils.set_random_seed(42)

# Encode the string labels first so the classification head can be sized from
# the data instead of a hard-coded class count.
label_encoder = LabelEncoder()
train_encoded_labels = label_encoder.fit_transform(y_train)
train_labels = pd.Series(train_encoded_labels)
# transform() raises ValueError if the validation split contains a label that
# never occurs in the training split.
val_labels = pd.Series(label_encoder.transform(y_val))
num_labels = len(label_encoder.classes_)

# Initialise the BERT tokenizer and model
model_name = 'bert-base-uncased'  # pretrained checkpoint to fine-tune
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Encode the text data
max_length = 64  # maximum sequence length; longer inputs are truncated


def encode_texts(texts):
    """Tokenize ``texts`` into a plain dict of TensorFlow tensors for Keras.

    Args:
        texts: An iterable of strings (for example a pandas Series).

    Returns:
        A dict mapping each tokenizer output name to its tensor.
    """
    encodings = tokenizer(
        list(texts),
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='tf',
    )
    # The tokenizer already returns TF tensors; only unwrap them into a dict.
    return dict(encodings.items())


train_encodings = encode_texts(X_train)
val_encodings = encode_texts(X_val)

# Compile the model; the loss takes raw logits, hence from_logits=True
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Train the model, reporting loss and accuracy on the held-out validation
# split after every epoch so that over-fitting is visible.
model.fit(
    train_encodings,
    train_labels,
    validation_data=(val_encodings, val_labels),
    epochs=2,
    batch_size=32,
)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7e8f0851dc00>

In [13]:
X_sub = testdata['clean_text']

# Encode the test tweets with the same tokenizer settings used for training
test_encodings = tokenizer(
    list(X_sub),
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors='tf',
)
test_encodings = {
    name: tf.constant(tensor) for name, tensor in test_encodings.items()
}

# Run inference on the encoded test set
predictions = model.predict(test_encodings)

# The predicted class is the index of the largest logit in each row
predicted_classes = tf.argmax(predictions.logits, axis=1)

# Map the integer class ids back to the original emotion labels
predicted_labels = label_encoder.inverse_transform(predicted_classes.numpy())



In [14]:
# Build submission.csv: one row per test tweet, holding its id and predicted emotion
tweet_ids = testdata['tweet.tweet_id']
submission_columns = {'id': tweet_ids, 'emotion': predicted_labels}
new_df = pd.DataFrame(submission_columns)
new_df.to_csv('submission.csv', index=False)

In [15]:
# Inspect the submission frame that was written to submission.csv
new_df

Unnamed: 0,id,emotion
2,0x28b412,anticipation
4,0x2de201,anticipation
9,0x218443,sadness
30,0x2939d5,joy
33,0x26289a,trust
...,...,...
1867525,0x2913b4,anticipation
1867529,0x2a980e,anticipation
1867530,0x316b80,sadness
1867531,0x29d0cb,anger
