In [2]:
import pandas as pd
IMDB=pd.read_csv("IMDB Dataset.csv")
IMDB

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
print(IMDB.describe())
print(IMDB.info())

                                                   review sentiment
count                                               50000     50000
unique                                              49582         2
top     Loved today's show!!! It was a variety and not...  positive
freq                                                    5     25000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None


EDA 진행 결과 IMDB 데이터 셋의 열은 review와 sentiment로 구성,
각 열에 해당되는 행의 개수는 총 50000개이며 고유값들은 각각 49582개, 2개임.
이 리뷰를 lstm모델을 활용하여 긍정 혹은 부정으로 예측하는 모델을 구축할것임.

In [4]:
# 데이터 정제
duplicates = IMDB[IMDB.duplicated()]  
print("Duplicated Rows:\n", duplicates)

Duplicated Rows:
                                                   review sentiment
3537   Quite what the producers of this appalling ada...  negative
3769   My favourite police series of all time turns t...  positive
4391   Beautiful film, pure Cassavetes style. Gena Ro...  positive
6352   If you liked the Grinch movie... go watch that...  negative
6479   I want very much to believe that the above quo...  negative
...                                                  ...       ...
49912  This is an incredible piece of drama and power...  positive
49950  This was a very brief episode that appeared in...  negative
49984  Hello it is I Derrick Cannon and I welcome you...  negative
49986  This movie is a disgrace to the Major League F...  negative
49991  Les Visiteurs, the first movie about the medie...  negative

[418 rows x 2 columns]


In [5]:
IMDB = IMDB.drop_duplicates(subset='review') # 중복되는 행 제거
IMDB.describe()

Unnamed: 0,review,sentiment
count,49582,49582
unique,49582,2
top,One of the other reviewers has mentioned that ...,positive
freq,1,24884


In [6]:
IMDB.loc[:, 'sentiment']

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 49582, dtype: object

In [7]:
IMDB.loc[:, 'sentiment'] = IMDB['sentiment'].map({'positive': 1, 'negative': 0})
IMDB.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [8]:
# 데이터 전처리
# 1. 텍스트데이터 정제
import re
from nltk.corpus import stopwords
import nltk
stop_words = set(stopwords.words('english'))
def clean_text(text):
    text = text.lower() # 소문자화
    text = re.sub(r"<.*?>", "", text) # HTML태그 제거 
    text = re.sub(r"[^a-z\s!?']", "", text) # 특수 문자 및 숫자 제거
    text = " ".join(word for word in text.split() if word not in stop_words) # 불용어 제거
    text = re.sub(r'\s+', ' ', text).strip() # 불필요한 공백 제거
    return text
IMDB['review_cleaned'] = IMDB['review'].apply(clean_text)
IMDB.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  IMDB['review_cleaned'] = IMDB['review'].apply(clean_text)


Unnamed: 0,review,sentiment,review_cleaned
0,One of the other reviewers has mentioned that ...,1,one reviewers mentioned watching oz episode ho...


In [9]:
# 2. 토큰화
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
IMDB.loc[:,'review_tokenized'] = IMDB.loc[:,'review_cleaned'].apply(tokenizer.tokenize)
print(IMDB.head(3))

                                              review sentiment  \
0  One of the other reviewers has mentioned that ...         1   
1  A wonderful little production. <br /><br />The...         1   
2  I thought this was a wonderful way to spend ti...         1   

                                      review_cleaned  \
0  one reviewers mentioned watching oz episode ho...   
1  wonderful little production filming technique ...   
2  thought wonderful way spend time hot summer we...   

                                    review_tokenized  
0  [one, reviewers, mentioned, watching, oz, epis...  
1  [wonderful, little, production, filming, techn...  
2  [thought, wonderful, way, spend, time, hot, su...  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  IMDB.loc[:,'review_tokenized'] = IMDB.loc[:,'review_cleaned'].apply(tokenizer.tokenize)


In [10]:
# 3. 단어집합 생성
import numpy as np
from nltk import FreqDist
all_tokens = np.hstack(IMDB['review_tokenized'])
# 단어 집합 생성 및 빈도 계산
vocab = FreqDist(all_tokens)
# 단어와 인덱스 할당
# 단어 인덱스를 2부터 시작하여 word2idx 생성 -> 레이블 데이터를 1과 0으로 설정 했기 때문
word2idx = {word: idx + 2 for idx, (word, _) in enumerate(vocab.items())}
# 특수 토큰 추가 (옵션)
word2idx['<pad>'] = 0  # 패딩을 위한 인덱스 0 예약
word2idx['<unk>'] = 1  # 알 수 없는 단어를 위한 인덱스 1 예약
word2idx

{'one': 2,
 'reviewers': 3,
 'mentioned': 4,
 'watching': 5,
 'oz': 6,
 'episode': 7,
 'hooked': 8,
 'right': 9,
 'exactly': 10,
 'happened': 11,
 'methe': 12,
 'first': 13,
 'thing': 14,
 'struck': 15,
 'brutality': 16,
 'unflinching': 17,
 'scenes': 18,
 'violence': 19,
 'set': 20,
 'word': 21,
 'go': 22,
 'trust': 23,
 'show': 24,
 'faint': 25,
 'hearted': 26,
 'timid': 27,
 'pulls': 28,
 'punches': 29,
 'regards': 30,
 'drugs': 31,
 'sex': 32,
 'hardcore': 33,
 'classic': 34,
 'use': 35,
 'wordit': 36,
 'called': 37,
 'nickname': 38,
 'given': 39,
 'oswald': 40,
 'maximum': 41,
 'security': 42,
 'state': 43,
 'penitentary': 44,
 'focuses': 45,
 'mainly': 46,
 'emerald': 47,
 'city': 48,
 'experimental': 49,
 'section': 50,
 'prison': 51,
 'cells': 52,
 'glass': 53,
 'fronts': 54,
 'face': 55,
 'inwards': 56,
 'privacy': 57,
 'high': 58,
 'agenda': 59,
 'em': 60,
 'home': 61,
 'manyaryans': 62,
 'muslims': 63,
 'gangstas': 64,
 'latinos': 65,
 'christians': 66,
 'italians': 67,
 'ir

In [11]:
# 4. 맵핑(단어 집합을 데이터에 적용)
encoded = []
for line in IMDB['review_tokenized']: #입력 데이터에서 1줄씩 문장을 읽음
    temp = []
    for w in line: #각 줄에서 1개씩 글자를 읽음
      try:
        temp.append(word2idx[w]) # 글자를 해당되는 정수로 변환
      except KeyError: # 단어 집합에 없는 단어일 경우 unk로 대체된다.
        temp.append(word2idx['unk']) # unk의 인덱스로 변환

    encoded.append(temp)

if len(encoded[0])==len(encoded[1]) :
   print("True")
else : print("False")  # -> 패딩 작업 필요

False


In [12]:
# 5. 패딩 작업
import torch
from torch.nn.utils.rnn import pad_sequence

# 정수 인코딩된 시퀀스를 PyTorch 텐서로 변환
encoded_tensors = [torch.tensor(seq) for seq in encoded]

# 패딩 적용 (최대 길이에 맞춰 0으로 패딩)
padded_sequences = pad_sequence(encoded_tensors, batch_first=True, padding_value=0)

print("Padded Sequences:\n", padded_sequences)
print("Shape:", padded_sequences.shape)  # (배치 크기, 최대 시퀀스 길이)

Padded Sequences:
 tensor([[    2,     3,     4,  ...,     0,     0,     0],
        [  141,   142,   143,  ...,     0,     0,     0],
        [  214,   141,   215,  ...,     0,     0,     0],
        ...,
        [11913, 17368, 71887,  ...,     0,     0,     0],
        [  254,   839,   303,  ...,     0,     0,     0],
        [    2,  9132,   674,  ...,     0,     0,     0]])
Shape: torch.Size([49582, 1469])


In [14]:
# 6. 패딩이 적용된 시퀀스와 레이블 합치기
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader

labels=IMDB['sentiment']
labels_tensor = torch.tensor(labels, dtype=torch.long) 
dataset = TensorDataset(padded_sequences, labels_tensor)

# DataLoader를 사용해 데이터를 배치 단위로 가져오기
dataloader = DataLoader(dataset, batch_size=2, shuffle=False)

# 배치를 확인하며 첫 번째 배치 출력
for batch in dataloader:
    inputs, targets = batch
    print("Padded Sequences:\n", inputs)
    print("Labels:\n", targets)
    break  # 첫 번째 배치만 출력

Padded Sequences:
 tensor([[  2,   3,   4,  ...,   0,   0,   0],
        [141, 142, 143,  ...,   0,   0,   0]])
Labels:
 tensor([1, 1])
