In [None]:
! pip install -U transformers
! pip install -U simpletransformers 
!pip install sentencepiece

In [None]:
# Mount Google Drive at /content/drive; the CSV read and the checkpoint written
# elsewhere in this notebook both use paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import nltk

# NLTK data needed by this notebook:
#   - 'stopwords'  : the English stop-word list used by remove_stopwords()
#   - 'punkt'      : tokenizer models looked up by word_tokenize on older NLTK releases
#   - 'punkt_tab'  : tokenizer tables looked up by word_tokenize on recent NLTK
#                    releases; without it word_tokenize raises LookupError there
#   - 'averaged_perceptron_tagger' : kept from the original cell (no use of it is
#                    visible in this notebook)
# nltk.download() reports an unknown resource id instead of raising, so asking for
# both punkt variants is safe whichever NLTK version is installed.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'averaged_perceptron_tagger'):
    nltk.download(resource)

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import operator 

In [None]:
# Choose the device every tensor and the model will be moved to.
if not torch.cuda.is_available():
    # No CUDA device: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # At least one CUDA device is visible: use it and report what was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
# Load the XLNet tokenizer and model from the same pretrained checkpoint
pretrained_name = "xlnet-base-cased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_name)
model = AutoModel.from_pretrained(pretrained_name)

In [None]:

# Load the CSV and keep the first 8192 entries of the 'Game Description' column
data = pd.read_csv(
    '/content/drive/MyDrive/빅데이터처리 개인 폴더/steam_data(to 40000).csv',
    encoding='UTF-8',
)
text_data = data['Game Description'].iloc[:8192].tolist()


_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, building it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def remove_stopwords(text):
    """Remove English stop words from ``text``.

    The input is converted with ``str()``, split with ``word_tokenize`` and every
    token whose case-folded form is in NLTK's English stop-word list is dropped.

    Args:
        text: The text to filter. ``None`` and float NaN are treated as missing.

    Returns:
        The remaining tokens joined by single spaces; ``''`` for a missing value.
    """
    # A missing value would otherwise be stringified to 'None' / 'nan' and
    # tokenized as if it were a real word.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # The stop-word set is cached: this function runs once per sample, and
    # rebuilding the list on every call is loop-invariant work.
    stop_words = _english_stop_words()
    kept_tokens = [
        token for token in word_tokenize(str(text))
        if token.casefold() not in stop_words
    ]
    # Rebuild a sentence from the surviving tokens
    return ' '.join(kept_tokens)

def preprocess_text(text):
    """Return ``text`` with each special character replaced by a single space.

    A "special character" is anything that is neither a word character (``\\w``)
    nor whitespace (``\\s``). Replacement is one-for-one, so runs of spaces are
    not collapsed.
    """
    special_chars = re.compile(r'[^\w\s]')
    return special_chars.sub(' ', text)

# Dataset class definition
class MyDataset(Dataset):
    """Map-style dataset that cleans and tokenizes one text per item.

    Args:
        data: Indexable collection of raw texts; ``len()`` and integer indexing
            are the only operations used on it.
        tokenizer: Tokenizer invoked as ``tokenizer(text, ...)``. Hugging Face
            tokenizers support this call form, which replaces the deprecated
            ``encode_plus`` method.
        max_length: Length every sequence is padded or truncated to. Defaults to
            128, the value that used to be hard-coded.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Clean and tokenize the text at ``index``.

        Stop words are removed first, then special characters are replaced.

        Returns:
            Whatever the tokenizer returns for the cleaned text. With
            ``return_tensors='pt'`` this is presumably a mapping of tensors with
            a leading axis of size 1 per item - TODO confirm against the
            tokenizer in use.
        """
        text = preprocess_text(remove_stopwords(self.data[index]))
        return self.tokenizer(
            text,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

# Wrap the raw texts in the dataset
dataset = MyDataset(data=text_data, tokenizer=tokenizer)

# Batch the dataset: 16 samples per batch, reshuffled on every pass
dataloader = DataLoader(dataset=dataset, batch_size=16, shuffle=True)

# Training setup
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# NOTE(review): criterion is never used by the loop below; it is kept only so the
# name stays defined for any other cell that refers to it.
criterion = torch.nn.CrossEntropyLoss()

# Training loop
model.train()
num_epochs = 16

for epoch in range(num_epochs):
    for batch in dataloader:
        # Presumably each field arrives as (batch, 1, seq_len), because every
        # dataset item carries its own leading axis of size 1 - TODO confirm.
        # Only that axis is squeezed: a bare .squeeze() would also remove the
        # batch axis whenever a batch holds a single sample, handing the model
        # a 1-D tensor.
        input_ids = batch['input_ids'].squeeze(1).to(device)
        attention_mask = batch['attention_mask'].squeeze(1).to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state

        # NOTE(review): the objective is the mean of the first element of
        # last_hidden_state only; no labels are involved and the other elements
        # of the batch do not contribute. Confirm this is the intended
        # fine-tuning objective.
        loss = last_hidden_state[0].mean()

        # Backpropagate and update the weights
        loss.backward()
        optimizer.step()

In [None]:
# Save the model's weights (its state dict) to Google Drive
checkpoint_path = '/content/drive/MyDrive/빅데이터처리 개인 폴더/fine_tunning_xlnet_model.pt'
model_weights = model.state_dict()
torch.save(model_weights, checkpoint_path)