<a href="https://colab.research.google.com/github/OzannStack/Personal_assist_project/blob/main/Fase_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import nltk


In [3]:
df = pd.read_csv("/content/drive/MyDrive/Food-ReviewsLLM/Reviews.csv/Reviews.csv", engine='python', on_bad_lines='skip')
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
df.dropna(subset='Text', inplace=True)
df = df[['Score', 'Text']]
df = df[df['Score'] != 3]
print(df.head())

   Score                                               Text
0      5  I have bought several of the Vitality canned d...
1      1  Product arrived labeled as Jumbo Salted Peanut...
2      4  This is a confection that has been around a fe...
3      2  If you are looking for the secret ingredient i...
4      5  Great taffy at a great price.  There was a wid...


In [5]:
df['Sentiment'] = np.where(df['Score'] > 3,1 ,0)
print(df['Sentiment'].value_counts())

Sentiment
1    443777
0     82037
Name: count, dtype: int64


In [6]:
all_text = ' '.join(df['Text']).lower()
words = all_text.split()

In [7]:
from collections import Counter
vocab = Counter(words)
vocab_size = 500
conv_vocab = {word : i + 2 for i, word in enumerate (vocab.most_common(vocab_size))}

In [8]:
conv_vocab['<PAD>'] = 0
conv_vocab['<UNK>'] = 1

In [9]:
reviews_word = []
for review in df['Text'] :
  r = [conv_vocab.get(word, conv_vocab['<UNK>']) for word in review.lower().split()]
  reviews_word.append(r)

In [10]:
SEQ = 100
def make_pad(review_int,seq_line) :
  features = np.zeros((len(review_int), seq_line), dtype=int)

  for i,row in enumerate(review_int):
    # Corrected line: assign the padded row to the corresponding row in features
    features[i, :len(row[:seq_line])] = np.array(row[:seq_line])

  return features
features = make_pad(reviews_word,SEQ)
labels = df['Sentiment'].values

In [11]:
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(features, labels, test_size=0.2, random_state=42)

In [12]:
import torch
train_data = torch.from_numpy(train_x)
test_data = torch.from_numpy(test_x)
train_labels = torch.from_numpy(train_y)
test_labels = torch.from_numpy(test_y)

In [13]:
from torch.utils.data import TensorDataset, DataLoader

# Gabungkan fitur dan label
train_data = TensorDataset(train_data, train_labels)
test_data = TensorDataset(test_data, test_labels)

# Buat DataLoader
BATCH_SIZE = 64
train_loader = DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_data, shuffle=False, batch_size=BATCH_SIZE)

In [14]:
import torch.nn as nn

class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size, num_layers):
        super().__init__()

        # 1. Embedding Layer: Mengubah token ID menjadi vektor padat
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # 2. LSTM Layer: Inti dari model sekuens Anda
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            # batch_first=True adalah praktik umum untuk NLP
            batch_first=True,
            dropout=0.5 # Mencegah overfitting
        )

        # 3. Fully Connected (Linear) Layer: Untuk klasifikasi
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid() # Untuk klasifikasi biner

        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        # x adalah batch of token IDs (Tensor berdimensi: [batch_size, SEQ_LEN])

        # 1. Masuk ke Embedding
        embedded = self.dropout(self.embedding(x))
        # embedded: [batch_size, SEQ_LEN, embedding_dim]

        # 2. Masuk ke LSTM
        lstm_out, (hidden, cell) = self.lstm(embedded)

        # 3. Ambil output Hidden State terakhir (yang mewakili keseluruhan ulasan)
        # Kami menggunakan output dari langkah terakhir (hidden[-1])
        out = self.dropout(hidden[-1])

        # 4. Masuk ke Fully Connected Layer
        out = self.fc(out)
        return self.sigmoid(out.squeeze())

In [15]:
import torch.optim as optim
import torch

# Inisialisasi Model, Loss, dan Optimizer
VOCAB_SIZE = 10002 # Sesuaikan dengan ukuran kamus Anda (+2 untuk <PAD> dan <UNK>)
EMBEDDING_DIM = 400
HIDDEN_DIM = 256
OUTPUT_SIZE = 1 # Karena klasifikasi biner (0 atau 1)
NUM_LAYERS = 2
N_EPOCHS = 7 # Jumlah iterasi pelatihan

model = SentimentLSTM(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_SIZE, NUM_LAYERS)

# Loss Function untuk klasifikasi biner
criterion = nn.BCELoss() # Binary Cross-Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.0005)

# Pindahkan model ke GPU jika tersedia
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

model.train() # Set mode ke training
for epoch in range(N_EPOCHS):
    for inputs, labels in train_loader:
        # 1. Pindahkan data ke perangkat (CPU/GPU)
        inputs, labels = inputs.to(device), labels.to(device).float()

        # 2. Nolkan gradien dari iterasi sebelumnya (WAJIB)
        optimizer.zero_grad()

        # 3. Forward Pass: Hitung output dan loss
        output = model(inputs)
        loss = criterion(output, labels)

        # 4. Backward Pass: Hitung gradien
        loss.backward()

        # 5. Update Weights: Lakukan langkah optimasi
        optimizer.step()

    print(f'Epoch {epoch+1}/{N_EPOCHS}, Loss: {loss.item():.4f}')

Epoch 1/7, Loss: 0.4326
Epoch 2/7, Loss: 0.4334
Epoch 3/7, Loss: 0.5870
Epoch 4/7, Loss: 0.3206
Epoch 5/7, Loss: 0.5190
Epoch 6/7, Loss: 0.6221
Epoch 7/7, Loss: 0.6839


In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
model.eval() # Set mode ke evaluasi (menonaktifkan dropout)
total_correct = 0
total_samples = 0

with torch.no_grad(): # Tidak perlu menghitung gradien saat evaluasi
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device).float()

        outputs = model(inputs)

        # Konversi output Sigmoid menjadi prediksi biner (0 atau 1)
        predictions = torch.round(outputs)

        total_samples += labels.size(0)
        total_correct += (predictions == labels).sum().item()

accuracy = total_correct / total_samples
print(f'\nAccuracy pada Test Set: {accuracy*100:.2f}%')


Accuracy pada Test Set: 84.43%
