## 前處理統一版本 (from 彥文)

In [3]:
# 載入套件
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer, word_tokenize
import re
import string
import contractions
from collections import Counter
from itertools import product
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import seaborn as sns

# Download the NLTK resources used by this notebook.
# Each call returns a status flag; because the final call is the last bare
# expression in the cell, its return value is what the cell echoes.
nltk.download('punkt')
nltk.download('stopwords')  # backs stopwords.words('english') used when building custom_stopwords
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')  # tagger model (not referenced in the cells visible here)
nltk.download('punkt_tab')  # tokenizer data; presumably required by word_tokenize -- confirm for the installed NLTK version

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\skych\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\skych\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\skych\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\skych\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\skych\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\skych\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [4]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise CPU.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if has_cuda else torch.device('cpu')

# Report the environment so the run can be reproduced / debugged later.
print(f"使用設備：{device}")
print(f"PyTorch 版本：{torch.__version__}")
print(f"GPU 可用：{has_cuda}")
if has_cuda:
    print(f"GPU 名稱：{torch.cuda.get_device_name(0)}")

使用設備：cuda
PyTorch 版本：2.5.1+cu121
GPU 可用：True
GPU 名稱：NVIDIA GeForce RTX 4060 Laptop GPU


In [5]:
# Shared preprocessing tools used by the cells below.
encoder = LabelEncoder()
tokenizer = TweetTokenizer(preserve_case=False)

# English stop words, but keep the negations "not" / "no" / "never" so that
# stop-word filtering does not discard them.
custom_stopwords = {
    word for word in stopwords.words('english')
    if word not in ("not", "no", "never")
}

# Text-cleaning function (merges the newer pipeline while keeping the original requirements)
def clean_text(text, use_stopwords=False, replace_username=True, replace_covid='none'):
    """Normalise a single tweet and split it into word tokens.

    Pipeline: lower-case -> expand contractions -> strip URLs -> optionally
    replace @mentions -> optionally replace COVID terms -> drop every
    non-letter character -> collapse whitespace -> ``word_tokenize`` ->
    optionally remove stop words.

    Args:
        text: Raw tweet. Missing values (``None`` / NaN) yield ``[]``; any
            other non-string value is converted with ``str()`` first.
        use_stopwords: When True, drop tokens found in ``custom_stopwords``.
        replace_username: When True, replace every ``@handle`` with the
            token ``username``.
        replace_covid: ``'virus'`` or ``'pandemic'`` substitutes that word for
            covid / covid19 / covid-19 / covid_19 / covid2019 / coronavirus.
            Any other value (default ``'none'``) leaves those terms as they are.

    Returns:
        list[str]: The cleaned tokens (possibly empty).
    """
    if pd.isna(text):
        return []

    # Coerce first so a numeric / non-string cell cannot crash on .lower().
    text = str(text).lower()

    # Expand contractions (e.g. "don't" -> "do not").
    text = contractions.fix(text)

    # Remove URLs. `http\S+` already covers https links.
    text = re.sub(r"http\S+|www\S+", '', text)

    # Replace @mentions. The replacement is padded with spaces so a handle that
    # touches neighbouring text ("thanks@bob", "@a@b") still becomes a separate
    # `username` token; the extra spaces are collapsed further down.
    if replace_username:
        text = re.sub(r"@\w+", ' username ', text)

    # Replace COVID terms. The text is already lower-case, so no IGNORECASE is
    # needed. Underscore / digit variants are matched explicitly because `_`
    # and digits are word characters and would otherwise defeat the `\b`
    # anchors (leaving a bare "covid" after the non-letter strip below).
    if replace_covid in ('virus', 'pandemic'):
        text = re.sub(
            r"\bcovid(?:[-_]?(?:2019|19))?\b|\bcoronavirus\b",
            replace_covid,
            text,
        )

    # Keep letters and whitespace only.
    text = re.sub(r"[^a-zA-Z\s]", '', text)

    # Collapse runs of whitespace and trim the ends.
    text = re.sub(r"\s+", " ", text).strip()

    # Tokenise with NLTK's word_tokenize.
    tokens = word_tokenize(text)

    # Optional stop-word removal.
    if use_stopwords:
        tokens = [word for word in tokens if word not in custom_stopwords]

    return tokens

In [6]:
# Load the raw data.
df_train = pd.read_csv('Corona_NLP_train.csv', encoding='latin_1')
df_test = pd.read_csv('Corona_NLP_test.csv', encoding='latin_1')

# Hold out 20% of the training file as a validation split, stratified on the label.
df_train, df_val = train_test_split(df_train, test_size=0.2, stratify=df_train["Sentiment"], random_state=42)
# Work on explicit copies so the column assignments below act on independent
# frames instead of triggering pandas' SettingWithCopyWarning.
df_train, df_val = df_train.copy(), df_val.copy()


def _print_split_stats(split_name, frame):
    """Print row count, column names and label distribution of one split."""
    print(f"{split_name}資料筆數：", len(frame))
    print(f"{split_name}欄位：", frame.columns.tolist())
    print(f"{split_name}情緒分布：\n", frame["Sentiment"].value_counts())


def _add_clean_tokens(frame):
    """Return a new frame with a `clean_tokens` column (no stop-word removal)."""
    tokens = frame["OriginalTweet"].apply(lambda tweet: clean_text(tweet, use_stopwords=False))
    return frame.assign(clean_tokens=tokens)


def _drop_empty_sequences(split_name, frame):
    """Report and remove rows whose `clean_tokens` list is empty."""
    is_empty = frame["clean_tokens"].apply(len) == 0
    if not is_empty.any():
        return frame
    empty_rows = frame[is_empty]
    print(f"警告：{split_name}有 {len(empty_rows)} 筆空序列，移除中...")
    print("空序列範例：")
    print(empty_rows[['OriginalTweet', 'clean_tokens']].head())
    # .copy() keeps the filtered frame independent of the one it was cut from.
    return frame[~is_empty].copy()


# Show dataset statistics (before any cleaning).
for split_name, frame in (("訓練集", df_train), ("驗證集", df_val), ("測試集", df_test)):
    _print_split_stats(split_name, frame)

# Apply the preprocessing function (initial clean, stop words kept).
df_train = _add_clean_tokens(df_train)
df_val = _add_clean_tokens(df_val)
df_test = _add_clean_tokens(df_test)

# Detect and remove rows that became empty after cleaning.
df_train = _drop_empty_sequences("訓練集", df_train)
df_val = _drop_empty_sequences("驗證集", df_val)
df_test = _drop_empty_sequences("測試集", df_test)

# Encode the sentiment labels: fit on the training split only, then reuse the
# same mapping for validation and test.
df_train = df_train.assign(SentimentEncoded=encoder.fit_transform(df_train["Sentiment"]))
df_val = df_val.assign(SentimentEncoded=encoder.transform(df_val["Sentiment"]))
df_test = df_test.assign(SentimentEncoded=encoder.transform(df_test["Sentiment"]))

# Show a sample of the processed result for each split.
for split_name, frame in (("訓練集", df_train), ("驗證集", df_val), ("測試集", df_test)):
    print(f"\n{split_name}處理結果：")
    print(frame[["OriginalTweet", "clean_tokens", "Sentiment", "SentimentEncoded"]].head())

訓練集資料筆數： 32925
訓練集欄位： ['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet', 'Sentiment']
訓練集情緒分布：
 Positive              9137
Negative              7934
Neutral               6170
Extremely Positive    5299
Extremely Negative    4385
Name: Sentiment, dtype: int64
驗證集資料筆數： 8232
驗證集欄位： ['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet', 'Sentiment']
驗證集情緒分布：
 Positive              2285
Negative              1983
Neutral               1543
Extremely Positive    1325
Extremely Negative    1096
Name: Sentiment, dtype: int64
測試集資料筆數： 3798
測試集欄位： ['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet', 'Sentiment']
測試集情緒分布：
 Negative              1041
Positive               947
Neutral                619
Extremely Positive     599
Extremely Negative     592
Name: Sentiment, dtype: int64
警告：訓練集有 10 筆空序列，移除中...
空序列範例：
                                           OriginalTweet clean_tokens
29888  ???? ????? \r\r\n????? ??? ? ?? ?? ??\r\r\n\r\...           []
2