# 1. Import

In [None]:
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore')
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import re
from nltk.tokenize import word_tokenize, sent_tokenize


# 2. Data Load

In [None]:
# Load the training set, the test set and the submission template.
def _read_utf8_csv(csv_path):
    """Read one CSV file into a DataFrame, decoding it as UTF-8."""
    return pd.read_csv(csv_path, encoding='utf-8')


train = _read_utf8_csv('./train.csv')
test = _read_utf8_csv('./test_x.csv')
sample_submission = _read_utf8_csv('./sample_submission.csv')

# 3. 전처리

## 3.1 문자 제거

In [None]:
# Matches any character that is not an ASCII letter, an ASCII digit or a space.
_NON_ALNUM_SPACE = re.compile(r'[^A-Za-z0-9 ]')


def alpha_num(text):
    """Return `text` with every character other than A-Z, a-z, 0-9 and space removed."""
    cleaned = _NON_ALNUM_SPACE.sub('', text)
    return cleaned

# Strip everything except ASCII letters, digits and spaces from the training text.
# NOTE(review): the same cleaning is applied again to both train and test after
# lowercasing in the stop-word section, so this pass looks redundant - confirm.
train['text'] = train['text'].map(alpha_num)

## 3.2 불용어 제거

In [None]:
# Stop-word removal
# English stop words that are dropped from the text (all lowercase).
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as",
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could",
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has",
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself",
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself",
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours",
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that",
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll",
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll",
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom",
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

# Set view of the list above, built once so each membership test is a hash
# lookup instead of a linear scan of the list. The list itself is kept for
# any code that reads `stopwords` directly; edits made to the list after this
# line are not reflected in the set.
_STOPWORD_SET = frozenset(stopwords)


def remove_stopwords(text):
    """Return `text` with stop words removed.

    The text is split on whitespace, every token whose lowercase form is a
    stop word is dropped, and the remaining tokens (original casing kept) are
    joined back together with single spaces.

    NOTE(review): entries containing an apostrophe (e.g. "he's") can only
    match while the text still contains apostrophes; this file runs
    `alpha_num` before this function, which strips them - confirm whether
    the apostrophe-free forms should be handled as well.
    """
    # str.split() with no argument already discards surrounding whitespace,
    # so the tokens need no further stripping.
    return " ".join(
        token for token in text.split() if token.lower() not in _STOPWORD_SET
    )

# Normalise both splits the same way: lowercase, drop non-alphanumeric
# characters, then remove stop words.
for frame in (train, test):
    lowered = frame['text'].str.lower()
    frame['text'] = lowered.apply(alpha_num).apply(remove_stopwords)


## 3.3 토큰화

과제 1 : 문장 토큰화, 문자기반 토큰화, 하위단어 토큰화를 진행하는 코드를 각각 작성해보세요.

In [None]:
# Word-level tokenization
def word_based_tokenization(texts):
    """Tokenize each text with NLTK's `word_tokenize` and re-join its tokens with spaces.

    Returns a list holding one space-joined token string per input text,
    in input order.
    """
    tokenized = []
    for text in texts:
        tokens = word_tokenize(text)
        tokenized.append(" ".join(tokens))
    return tokenized

# Replace the text column of both splits with its word-tokenized form.
for frame in (train, test):
    frame['text'] = word_based_tokenization(frame['text'])

## 3.4 벡터화

과제 2 : TF-IDF (Term Frequency-Inverse Document Frequency) 벡터화를 진행하는 코드를 작성해보세요.

In [None]:
# Vectorization: turn each document into a vector of token counts.
from sklearn.feature_extraction.text import CountVectorizer

# Hyperparameters; max_length is used by the padding step and
# vocab_size / embedding_dim by the model further down.
vocab_size, embedding_dim, max_length = 20000, 16, 500

# Fit the vocabulary on the training text only, then reuse it for the test text.
vectorizer = CountVectorizer(max_features=vocab_size)
train_counts = vectorizer.fit_transform(train['text'])
test_counts = vectorizer.transform(test['text'])

# Convert the vectorizer output to dense NumPy arrays.
X_train = train_counts.toarray()
X_test = test_counts.toarray()


## 3.5 패딩

과제 3 : 패딩하기 전과 후의 차이를 비교해보세요.

In [None]:
# Force both feature matrices to exactly max_length columns.
def _fit_to_width(matrix, width):
    """Return `matrix` with exactly `width` columns.

    A matrix with fewer columns is constant-padded on the right; otherwise
    only the first `width` columns are kept.
    """
    n_cols = matrix.shape[1]
    if n_cols >= width:
        return matrix[:, :width]
    return np.pad(matrix, ((0, 0), (0, width - n_cols)), 'constant')


# NOTE(review): the columns come from the count vectorizer, so truncating
# presumably keeps only the first max_length vocabulary features - confirm
# this is intended.
X_train = _fit_to_width(X_train, max_length)
X_test = _fit_to_width(X_test, max_length)


## 3.6 레이블 인코딩

In [None]:
# Map the author labels of the training set to integer class ids.
label_encoder = LabelEncoder()
author_column = train['author']
y_train = label_encoder.fit_transform(author_column)

# 4. 데이터셋 정의

In [None]:
# Dataset wrapper around the feature rows and (optionally) their labels.
class TextDataset(Dataset):
    """Index-addressable dataset of feature rows with optional labels.

    When `labels` is given, each item is a `(features, label)` pair;
    otherwise each item is the feature tensor alone.
    """

    def __init__(self, texts, labels=None):
        """Store the feature rows and, if provided, the matching labels."""
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of feature rows."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return row `idx` as a float32 tensor, paired with its int64 label if labels exist."""
        features = torch.tensor(self.texts[idx], dtype=torch.float32)
        if self.labels is None:
            return features
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return features, target

# Wrap the arrays in datasets; only the training set carries labels.
train_dataset, test_dataset = TextDataset(X_train, y_train), TextDataset(X_test)

# Training batches are shuffled; test batches keep their original order.
loader_kwargs = {'batch_size': 32}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

# 5. 모델 정의

과제 4 : Word2Vec, GloVe를 활용해서 임베딩하는 코드를 각각 모델에 추가 작성해보세요.

In [None]:

# Model: embedding -> mean over dim 1 -> two linear layers.
class SimpleNLPModel(nn.Module):
    """Embedding lookup, mean pooling, then a two-layer classifier head.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        output_dim: number of output logits.
    """

    def __init__(self, vocab_size, embedding_dim, output_dim):
        super().__init__()
        hidden_units = 24
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, output_dim)

    def forward(self, text):
        """Return unnormalised class scores for `text`.

        `text` must hold integer indices into the embedding table;
        presumably shaped (batch, sequence) - confirm against the caller.
        """
        # Average the embedding vectors along dim 1.
        pooled = self.embedding(text).mean(dim=1)
        hidden = self.fc1(pooled).relu()
        return self.fc2(hidden)


# 6. 학습 및 평가

In [None]:

# Build the classifier with 5 output logits (the submission step writes five
# class columns, '0'..'4').
model = SimpleNLPModel(vocab_size, embedding_dim, 5)

# Training setup: cross-entropy loss on the logits, Adam optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20

model.train()
for epoch in range(num_epochs):
    # Accumulate a sample-weighted loss so the value reported for the epoch is
    # the mean over all training samples rather than whatever the last
    # (shuffled, possibly smaller) mini-batch happened to produce.
    running_loss = 0.0
    samples_seen = 0
    for texts, labels in tqdm(train_loader):
        optimizer.zero_grad()
        # The dataset yields float32 features; the embedding layer needs
        # integer indices, hence the cast.
        output = model(texts.long())
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        n_batch = labels.size(0)
        # criterion averages over the batch, so scale back by the batch size.
        running_loss += loss.item() * n_batch
        samples_seen += n_batch
    # Guard against an empty loader, where no loss was ever computed.
    epoch_loss = running_loss / samples_seen if samples_seen else float('nan')
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss}')

# Inference: collect per-batch class probabilities without tracking gradients.
model.eval()
batch_probs = []
with torch.no_grad():
    for batch in test_loader:
        logits = model(batch.long())
        # Softmax over dim 1 turns the logits into per-class probabilities.
        batch_probs.append(torch.softmax(logits, dim=1).numpy())

# Stack the batches back into one array with one row per test sample.
preds = np.concatenate(batch_probs, axis=0)

# Write the probabilities into the class columns and save the submission file.
class_columns = ['0', '1', '2', '3', '4']
sample_submission[class_columns] = preds
sample_submission.to_csv('submission.csv', index=False, encoding='utf-8')