In [None]:
# Mount Google Drive at /content/drive; the data paths used by this notebook
# live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

# Folder on the mounted Drive that holds the project's CSV files.
path = "/content/drive/MyDrive/ML/TeamProject"

# Full paths of the two input files, both built from the shared folder above.
movies_metadataset, rating_dataset = (
    os.path.join(path, csv_name)
    for csv_name in ("movies_metadata.csv", "ratings_small.csv")
)


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, MinMaxScaler


In [None]:
# Movie metadata: read the whole CSV, then narrow it to a fixed subset of columns
movies_df = pd.read_csv(movies_metadataset, low_memory=False)
movies_df = movies_df[['id', 'original_title', 'release_date', 'vote_average', 'vote_count']]

# User ratings: read the CSV and keep the (user, movie, rating) triple
ratings_df = pd.read_csv(rating_dataset)
ratings_df = ratings_df[['userId', 'movieId', 'rating']]


In [None]:
# Print the movie metadata frame to inspect its columns and a sample of rows
print(movies_df)

           id               original_title release_date  vote_average  \
0         862                    Toy Story   1995-10-30           7.7   
1        8844                      Jumanji   1995-12-15           6.9   
2       15602             Grumpier Old Men   1995-12-22           6.5   
3       31357            Waiting to Exhale   1995-12-22           6.1   
4       11862  Father of the Bride Part II   1995-02-10           5.7   
...       ...                          ...          ...           ...   
45461  439050                      رگ خواب          NaN           4.0   
45462  111109          Siglo ng Pagluluwal   2011-11-17           9.0   
45463   67758                     Betrayal   2003-08-01           3.8   
45464  227506          Satana likuyushchiy   1917-10-21           0.0   
45465  461257                     Queerama   2017-06-09           0.0   

       vote_count  
0          5415.0  
1          2413.0  
2            92.0  
3            34.0  
4           173.0  
...

In [None]:
def _coerce_id_to_int(frame, column):
    """Return a copy of `frame` whose `column` holds clean integer ids.

    Values that cannot be parsed as numbers are turned into NaN by
    `pd.to_numeric(..., errors='coerce')`, rows with such values are dropped,
    and the remaining ids are cast to int.

    Everything is done in a single chain on new objects, so nothing is ever
    assigned into a frame derived from another one. That assignment pattern is
    what triggered pandas' SettingWithCopyWarning in the step-by-step version.

    Args:
        frame: DataFrame containing `column`.
        column: name of the id column to clean.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    return (
        frame
        .assign(**{column: pd.to_numeric(frame[column], errors='coerce')})
        .dropna(subset=[column])
        .astype({column: int})
    )


# Both join keys must be plain integers so the ratings can be merged with the movies
movies_df = _coerce_id_to_int(movies_df, 'id')
ratings_df = _coerce_id_to_int(ratings_df, 'movieId')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['id'] = movies_df['id'].astype(int)


In [None]:
# Join each rating with its movie's metadata (ratings.movieId == movies.id;
# pandas' default inner join), then keep the feature columns with `rating` last
dataset = ratings_df.merge(movies_df, left_on='movieId', right_on='id')
dataset = dataset[['userId', 'movieId', 'original_title', 'vote_average', 'vote_count', 'rating']]


In [None]:
# One LabelEncoder per column, so each column's class <-> integer mapping stays
# available after this cell. A single shared encoder only remembers the column
# it was fitted on last, which made the movieId mapping unrecoverable.
movie_encoder = LabelEncoder()
title_encoder = LabelEncoder()

# Label-encode movieId and original_title
dataset['movieId'] = movie_encoder.fit_transform(dataset['movieId'])
dataset['original_title'] = title_encoder.fit_transform(dataset['original_title'])

# Backward-compatible alias: `le` previously ended up fitted on original_title,
# and later cells read `le.classes_`.
le = title_encoder

# Min-max scale the two continuous vote features
scaler = MinMaxScaler()
dataset[['vote_average', 'vote_count']] = scaler.fit_transform(dataset[['vote_average', 'vote_count']])

In [None]:
# Print the merged frame after label encoding and min-max scaling
print(dataset)

       userId  movieId  original_title  vote_average  vote_count  rating
0           1      704            1759          0.66    0.072867     2.5
1           4      704            1759          0.66    0.072867     4.0
2           7      704            1759          0.66    0.072867     3.0
3          19      704            1759          0.66    0.072867     4.0
4          21      704            1759          0.66    0.072867     3.0
...       ...      ...             ...           ...         ...     ...
44989     652     2813            1338          0.40    0.000082     4.0
44990     653     1071            1893          0.59    0.030809     3.0
44991     659      106            1103          0.71    0.057869     4.0
44992     659      378            1944          0.67    0.129106     3.0
44993     665       78            2736          0.83    0.323417     3.0

[44994 rows x 6 columns]


In [None]:
# Print the label-encoded movieId column
print(dataset['movieId'])

0         704
1         704
2         704
3         704
4         704
         ... 
44989    2813
44990    1071
44991     106
44992     378
44993      78
Name: movieId, Length: 44994, dtype: int64


In [None]:
# Split into train / validation / test sets (80% / 10% / 10%)
SPLIT_SEED = 42  # fixed seed so every run produces the same split

train_data = dataset.sample(frac=0.8, random_state=SPLIT_SEED)

# Rows not used for training are divided evenly between test and validation
holdout_data = dataset.drop(train_data.index)
test_data = holdout_data.sample(frac=0.5, random_state=SPLIT_SEED)
val_data = holdout_data.drop(test_data.index)

In [None]:
# Wrap a DataFrame as a PyTorch Dataset
class RecommenderDataset(Dataset):
    """Dataset yielding `(x, y)` float32 arrays from the rows of a DataFrame.

    `x` holds every column except the last one and `y` holds the last column
    as a length-1 array.

    The frame is converted to a float32 array once at construction, so each
    item lookup is a plain array index instead of a per-row pandas `.iloc`
    access and dtype cast. The arrays are a snapshot: changes made to `data`
    after construction are not reflected in the items.

    Args:
        data: DataFrame whose columns can all be converted to float32; the
            last column is the target.
    """

    def __init__(self, data):
        # Kept for backward compatibility with code that reads `.data`
        self.data = data

        values = data.to_numpy(dtype=np.float32)
        self.features = values[:, :-1]  # inputs: all columns but the last
        self.targets = values[:, -1:]   # target: last column, kept 2-D so y has shape (1,)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        # Positional lookup, matching the original `.iloc[index]` semantics
        return self.features[index], self.targets[index]

In [None]:
# Wrap each split in a RecommenderDataset (train, test, validation, in that order)
train_dataset, test_dataset, val_dataset = (
    RecommenderDataset(split_frame)
    for split_frame in (train_data, test_data, val_data)
)

In [None]:
# Build the DataLoaders
BATCH_SIZE = 32

# Only the training loader is shuffled. The test/validation loaders keep a
# fixed order so evaluation runs iterate the data identically every time.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class WideAndDeep(nn.Module):
    """Two-branch regressor whose output is the sum of a wide and a deep branch.

    Args:
        num_categories: width (number of columns) of the `x_cat` input.
        num_continuous: width (number of columns) of the `x_cont` input.
    """

    def __init__(self, num_categories, num_continuous):
        super().__init__()

        # Wide branch: one linear layer applied to x_cat alone
        self.wide = nn.Linear(num_categories, 1)

        # Deep branch: a small MLP applied to x_cat and x_cont side by side
        deep_in_features = num_categories + num_continuous
        deep_layers = [
            nn.Linear(deep_in_features, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
        ]
        self.deep = nn.Sequential(*deep_layers)

    def forward(self, x_cat, x_cont):
        """Return `wide(x_cat) + deep([x_cat, x_cont])`.

        Args:
            x_cat: tensor with `num_categories` columns along dim 1.
            x_cont: tensor with `num_continuous` columns along dim 1.

        Returns:
            Tensor with one output column per input row.
        """
        deep_input = torch.cat([x_cat, x_cont], dim=1)
        return self.wide(x_cat) + self.deep(deep_input)

In [None]:
# Number of classes held by `le`. `le` was last fitted on original_title, so
# this counts distinct titles, not distinct movieIds.
print(len(le.classes_))

2798


In [None]:
# Pick the device first so the model can be placed on it before the optimizer
# is created from its parameters.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Instantiate the model.
# `num_categories` is the width of x_cat. The training loop passes the first
# three feature columns (userId, movieId, original_title) as x_cat, so the
# model needs 3 categorical inputs. Passing len(le.classes_) (2798) built
# nn.Linear(2798, 1), which cannot be applied to a (batch, 3) tensor and
# raised the RuntimeError seen during training.
# NOTE(review): the ids are fed to the linear layers as raw floats; consider
# embeddings or one-hot inputs if the ids should be treated as true categories.
NUM_CATEGORICAL_FEATURES = 3
NUM_CONTINUOUS_FEATURES = 2  # vote_average, vote_count
model = WideAndDeep(
    num_categories=NUM_CATEGORICAL_FEATURES,
    num_continuous=NUM_CONTINUOUS_FEATURES,
).to(device)

# Loss function: MSE
loss_fn = nn.MSELoss()

# Optimizer: Adam
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)



In [None]:
def train(model, dataloader, loss_fn, optimizer, device, num_cat_features=3):
    """Run one training epoch and print the mean batch loss.

    Args:
        model: module called as `model(x_cat, x_cont)`.
        dataloader: iterable yielding `(x, y_true)` batches, where `x` is a
            2-D tensor whose first `num_cat_features` columns are the
            categorical features and whose remaining columns are continuous.
        loss_fn: callable `loss_fn(y_pred, y_true)` returning a scalar tensor.
        optimizer: optimizer over the model's parameters.
        device: device the model and each batch are moved to.
        num_cat_features: number of leading columns of `x` passed as `x_cat`
            (defaults to 3, the previously hard-coded split).

    Returns:
        The same `model` object, after the epoch's updates.
    """
    # Make sure the parameters live on the same device as the batches.
    # nn.Module.to moves the module in place, so this is a no-op when the
    # model is already there.
    model.to(device)

    # Put the model in training mode
    model.train()

    total_loss = 0.0  # sum of per-batch losses
    n_batches = 0     # number of batches processed

    for x, y_true in dataloader:
        # Split the feature matrix into categorical and continuous parts and
        # move everything to the target device
        x_cat = x[:, :num_cat_features].to(device)
        x_cont = x[:, num_cat_features:].to(device)
        y_true = y_true.to(device)

        # Reset gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass and loss
        y_pred = model(x_cat, x_cont)
        loss = loss_fn(y_pred, y_true)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        n_batches += 1

    # Mean loss over the epoch; an empty dataloader reports NaN instead of
    # raising ZeroDivisionError
    avg_loss = total_loss / n_batches if n_batches else float("nan")

    print(f"Train Loss: {avg_loss:.4f}")

    return model

In [None]:
# Train for a fixed number of epochs, printing a 1-based epoch header before each
NUM_EPOCHS = 10
for epoch in range(NUM_EPOCHS):
    print(f"Epoch {epoch+1}")
    model = train(
        model,
        train_dataloader,
        loss_fn,
        optimizer,
        device,
    )

Epoch 1
tensor([[4.7800e+02, 1.7060e+03, 3.4200e+02, 7.5000e-01, 6.6183e-02],
        [1.9000e+01, 2.5100e+02, 2.7280e+03, 8.2000e-01, 7.2704e-02],
        [4.8000e+01, 3.3500e+02, 8.9000e+02, 7.3000e-01, 1.7361e-02],
        [2.4100e+02, 2.2900e+02, 2.4890e+03, 7.5000e-01, 6.2108e-02],
        [1.9900e+02, 2.6790e+03, 2.0720e+03, 6.1000e-01, 1.6301e-03],
        [5.8000e+02, 3.9600e+02, 2.0770e+03, 7.5000e-01, 3.0728e-02],
        [1.0200e+02, 1.3570e+03, 2.6360e+03, 6.0000e-01, 1.3041e-03],
        [1.1900e+02, 9.6200e+02, 2.1150e+03, 7.9000e-01, 6.0315e-02],
        [5.3300e+02, 9.5400e+02, 1.5210e+03, 6.1000e-01, 1.0433e-02],
        [6.0800e+02, 5.4700e+02, 7.8100e+02, 7.2000e-01, 1.0343e-01],
        [1.0200e+02, 1.2020e+03, 2.0640e+03, 6.3000e-01, 1.3595e-01],
        [4.1200e+02, 1.3640e+03, 2.1300e+03, 6.9000e-01, 1.1166e-02],
        [2.8500e+02, 3.1100e+02, 1.1290e+03, 6.5000e-01, 7.1725e-03],
        [6.4100e+02, 0.0000e+00, 1.8800e+02, 7.1000e-01, 3.5863e-03],
        [2.1

RuntimeError: ignored