In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence
import numpy as np

In [2]:
df = pd.read_csv("netflix_reviews.csv")  # 파일 불러오기
df = df.iloc[:,0:5]

In [3]:
df

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount
0,166731e8-4b10-4968-838d-329473357328,Sylviah Chichi,Great App on the move ..... I can watch my mov...,5,0
1,68bab7d0-2afc-4454-970f-159ced93d751,Marilyn Goeda,good,5,0
2,6c2d3e85-b5ca-4228-93bd-abd2236eab51,Nikhil Pk,Need to improve and to update some error durin...,3,0
3,d62b0303-4c87-4c96-9c2c-a3ca6e0b056d,Mmesoma Eberechukwu,"Netflix is a nice app,but not all the movies a...",3,0
4,d6014252-863e-4e06-b440-25e0ece47a31,Keabetswe Monaise,Not much availability considering options on w...,3,0
...,...,...,...,...,...
117129,a760ead9-e7aa-4ed1-a651-5c37c3600dac,A Google user,i really like it! there are so many movies and...,5,0
117130,4957f9e7-d7f4-4a52-9764-031cebcac83f,Captain Jeoy,I love Netflix. I always enjoy my time using it.,5,0
117131,9acf7586-7abf-4b50-8c50-3ede3b2a42c4,Suryansh,Sound quality is very slow of movies,1,0
117132,32870f7f-c461-4256-b602-75244ca60248,A Google user,Rate is very expensive.. bcos we see netflix s...,1,0


In [4]:
# 전처리 함수
import re
import emoji


# 이모티콘만 추출하는 함수 (중복 제거)
def remove_duplicate_emojis(text):
    # 유니코드 이모티콘 범위에 해당하는 모든 이모티콘을 찾음
    emoji_pattern = re.compile("[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F]", flags=re.UNICODE)
    
    # 중복 제거를 위한 세트 (set) 사용
    emojis = set(emoji_pattern.findall(text))
    
    # 텍스트에서 중복된 이모티콘을 제거하고, 하나의 이모티콘만 남김
    for em in emojis:
        text = re.sub(em + '+', em, text)  # 중복된 이모티콘을 하나로 줄임
    
    return text

# 전처리 함수 (이모티콘 중복 제거 후 텍스트로 변환)
def preprocess_text(text):
    if isinstance(text, float):
        return ""
    
    # 이모티콘 중복 제거
    text = remove_duplicate_emojis(text)
    
    # 이모티콘을 텍스트로 변환
    text = emoji.demojize(text, delimiters=(" ", " "))
    
    # 소문자로 변환
    text = text.lower()
    
    # 숫자 및 구두점 제거
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    
    # 앞뒤 공백 제거
    text = text.strip()
    
    return text

df['content'] = df['content'].apply(preprocess_text)

In [5]:
print(df['content'])
print('데이터 타입 : ', type(df['content'])) # 데이터 타입은 pandas 시리즈인걸 확인 할 수 있다.
print('데이터 타입 : ', type(df['score']))

0         great app on the move  i can watch my movies a...
1                                                      good
2         need to improve and to update some error durin...
3         netflix is a nice appbut not all the movies ar...
4         not much availability considering options on w...
                                ...                        
117129    i really like it there are so many movies and ...
117130       i love netflix i always enjoy my time using it
117131                 sound quality is very slow of movies
117132    rate is very expensive bcos we see netflix sun...
117133    this app is awesome for english movies series ...
Name: content, Length: 117134, dtype: object
데이터 타입 :  <class 'pandas.core.series.Series'>
데이터 타입 :  <class 'pandas.core.series.Series'>


In [6]:
# 1. ratings를 바로 텐서로 변환
ratings = torch.tensor(df['score'], dtype=torch.long) 
print("정수형 텐서로 변환된 평점:", ratings, type(ratings)) 

정수형 텐서로 변환된 평점: tensor([5, 5, 3,  ..., 1, 1, 4]) <class 'torch.Tensor'>


In [7]:
# 2. reviews
reviews = df['content'].tolist()  # 'content'를 리스트로 변환

In [8]:
print(reviews[0:5])
print(type(reviews))

['great app on the move  i can watch my movies and series from any place', 'good', 'need to improve and to update some error during movie playing', 'netflix is a nice appbut not all the movies are there especially the one you want to watch they need to work on that is already getting annoying enraged_face', 'not much availability considering options on what to watch']
<class 'list'>


In [9]:
# (1) 토크나이저 정의 (기본 영어 토크나이저)
tokenizer = get_tokenizer('basic_english')

# (2) 어휘 사전 생성 함수
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

# (3) 어휘 사전 생성
vocab = build_vocab_from_iterator(yield_tokens(reviews))


# (4) 파이프라인 정의
# 텍스트를 토큰화 후 정수 인덱스로 변환하는 함수
def text_pipeline(text):
    return [vocab[token] for token in tokenizer(text)]

# 레이블을 그대로 정수로 반환하는 함수
def label_pipeline(label):
    return label

# 데이터셋 클래스 정의
class ReviewDataset(Dataset):
    def __init__(self, reviews, ratings, text_pipeline, label_pipeline):
        self.reviews = reviews
        self.ratings = ratings
        self.text_pipeline = text_pipeline
        self.label_pipeline = label_pipeline

    def __len__(self):
        return len(self.reviews) 

    def __getitem__(self, idx):
        review = self.text_pipeline(self.reviews[idx])
        rating = self.label_pipeline(self.ratings[idx])
        return torch.tensor(review, dtype=torch.long), torch.tensor(rating, dtype=torch.long) # 정수형 데이터로 변환


117134lines [00:01, 82253.88lines/s]


In [10]:
# 데이터를 학습용(train)과 테스트용(test)으로 분리
train_reviews, test_reviews, train_ratings, test_ratings = train_test_split(reviews, ratings, test_size=0.2, random_state=42)

# 데이터셋 정의
train_dataset = ReviewDataset(train_reviews, train_ratings, text_pipeline, label_pipeline)
test_dataset = ReviewDataset(test_reviews, test_ratings, text_pipeline, label_pipeline)

# 패딩을 적용하는 함수 정의

# collate_fn 함수 수정
def collate_fn(batch):
    # reviews와 ratings를 분리
    reviews, ratings = zip(*batch)
    
    # 리뷰를 텐서로 변환하고 패딩 적용 : 리뷰마다 텍스트 길이가 다름.
    reviews = pad_sequence([torch.tensor(r, dtype=torch.long) for r in reviews], batch_first=True, padding_value=0)
    
    # 평점은 정수형 텐서로 변환 (패딩 필요 없음)
    ratings = torch.tensor(ratings, dtype=torch.long)
    
    return reviews, ratings




In [11]:
# 데이터 로더 정의
BATCH_SIZE = 64

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

In [12]:
# LSTM 모델 정의
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)  # Embedding으로 변경
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.lstm(embedded)
        return self.fc(hidden[-1])

# 하이퍼파라미터 정의
VOCAB_SIZE = len(vocab)
EMBED_DIM = 64
HIDDEN_DIM = 128
OUTPUT_DIM = len(set(ratings))  # 예측할 점수 개수 5개임 (평점이 정수형)

In [19]:
# 모델 초기화
model = LSTMModel(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM, OUTPUT_DIM)

# 손실 함수와 옵티마이저 정의
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)  # SGD 에서 Adam으로 변경 lr : 0.01 - > 0.001 / Accuracy: 63% -> 61.59% 다시 

# 모델을 CUDA로 이동 (가능한 경우)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 모델 학습 함수 정의
def train_model(model, train_dataloader, criterion, optimizer, num_epochs=20):
    model.train()  # 학습 모드로 설정
    for epoch in range(num_epochs):
        total_loss = 0  # 에포크마다 손실을 추적
        for i, (reviews, ratings) in enumerate(train_dataloader):
            reviews, ratings = reviews.to(device), ratings.to(device)  # 데이터를 GPU로 이동
            
            optimizer.zero_grad()
            outputs = model(reviews)  # 모델에 입력하여 예측값 계산
            loss = criterion(outputs, ratings)  # 손실 계산
            loss.backward()  # 역전파
            optimizer.step()  # 가중치 업데이트

            total_loss += loss.item()
        
        print(f'Epoch [{epoch+1}/{num_epochs}], Average Loss: {total_loss/len(train_dataloader):.4f}')
    
    print("Finished Training")

# 모델 학습 실행
train_model(model, train_dataloader, criterion, optimizer, num_epochs=20)

# 모델 평가

correct = 0
total = 0
with torch.no_grad():  # 평가 시에는 기울기 계산을 하지 않음
    for reviews, ratings in test_dataloader:
        reviews, ratings = reviews.to(device), ratings.to(device)
        outputs = model(reviews)
        _, predicted = torch.max(outputs, 1)
        total += ratings.size(0)
        correct += (predicted == ratings).sum().item()

print(f'Accuracy: {100 * correct / total}%')

  return torch.tensor(review, dtype=torch.long), torch.tensor(rating, dtype=torch.long) # 정수형 데이터로 변환
  reviews = pad_sequence([torch.tensor(r, dtype=torch.long) for r in reviews], batch_first=True, padding_value=0)


Epoch [1/25], Average Loss: 1.2241
Epoch [2/25], Average Loss: 1.0057
Epoch [3/25], Average Loss: 0.9454
Epoch [4/25], Average Loss: 0.9139
Epoch [5/25], Average Loss: 0.8877
Epoch [6/25], Average Loss: 0.8728
Epoch [7/25], Average Loss: 0.8644
Epoch [8/25], Average Loss: 0.8510
Epoch [9/25], Average Loss: 0.8400
Epoch [10/25], Average Loss: 0.8345
Epoch [11/25], Average Loss: 0.8278
Epoch [12/25], Average Loss: 0.8186
Epoch [13/25], Average Loss: 0.8113
Epoch [14/25], Average Loss: 0.8229
Epoch [15/25], Average Loss: 0.8095
Epoch [16/25], Average Loss: 0.7998
Epoch [17/25], Average Loss: 0.7928
Epoch [18/25], Average Loss: 0.7869
Epoch [19/25], Average Loss: 0.7851
Epoch [20/25], Average Loss: 0.7844
Epoch [21/25], Average Loss: 0.7838
Epoch [22/25], Average Loss: 0.7768
Epoch [23/25], Average Loss: 0.7777
Epoch [24/25], Average Loss: 0.7775
Epoch [25/25], Average Loss: 0.7745
Finished Training
Accuracy: 62.67554531096598%


In [21]:
# 예측 함수
def predict_review(model, review, vocab, tokenizer, device):
    # 리뷰를 텐서로 변환
    tokens = [vocab[token] for token in tokenizer(review)]
    review_tensor = torch.tensor(tokens).unsqueeze(0)  # (1, seq_length) 형태로 만듦
    
    # 텐서를 GPU로 이동
    review_tensor = review_tensor.to(device)
    
    # 모델에 입력하여 예측값 계산
    model.eval()  # 평가 모드로 변경
    with torch.no_grad():  # 평가 시에는 기울기 계산을 하지 않음
        output = model(review_tensor)
        _, predicted = torch.max(output, 1)
    
    return predicted.item()  # 예측된 평점 반환
# 새로운 리뷰에 대한 예측
new_review = "This app is great but has some bugs."
predicted_score = predict_review(model, new_review, vocab, tokenizer,  device)
print(f'Predicted Score: {predicted_score}')

Predicted Score: 5


In [42]:
# 새로운 리뷰에 대한 예측2
new_review = "Good app for streaming occasionally, but this is the only app I have that completely malfunctions my user interface, and forces me to restart my phone. No idea why it happens, but it is profoundly annoying."
predicted_score = predict_review(model, new_review, vocab, tokenizer,  device)
print(f'Predicted Score: {predicted_score}')

Predicted Score: 3


Predicted Score: 4
