## 4. Load Dataset & Informasi Awal

In [None]:
# Import the main data-handling library
import pandas as pd

# Load the dataset from a hard-coded absolute path under /content/drive.
# NOTE(review): presumably requires Google Drive to be mounted in Colab first — confirm.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data science/UAS/Recipe Reviews and User Feedback Dataset.csv')

# Show the first 5 rows
df.head()


In [None]:
# Structural overview of the dataset, as reported by DataFrame.info()
df.info()

In [None]:
# Report the dataset dimensions: row count first, then column count
print("Jumlah baris:", len(df.index))
print("Jumlah kolom:", len(df.columns))

In [None]:
# Count missing values per column
df.isnull().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row
df.duplicated().sum()

In [None]:
# Number of reviews per star rating, ordered by rating value
df['stars'].value_counts().sort_index()

In [None]:
# Share of each star rating expressed as a percentage, ordered by rating value
df['stars'].value_counts(normalize=True).sort_index() * 100

### Exploratory Data Analysis (EDA)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how many reviews fall into each star rating; labels are set
# directly on the Axes object that seaborn returns.
sns.countplot(x='stars', data=df).set(
    title='Distribusi Rating Bintang',
    xlabel='Rating',
    ylabel='Jumlah Komentar',
)
plt.show()


In [None]:
# Character length of each review. Missing reviews are left as NaN instead of
# being stringified to "nan" (which would count them as 3-character texts and
# distort the left edge of the histogram); seaborn skips NaN when plotting.
df['text_length'] = df['text'].dropna().astype(str).str.len()

# Histogram (50 bins) of review lengths with a kernel-density overlay
plt.figure(figsize=(8,5))
sns.histplot(df['text_length'], bins=50, kde=True)
plt.title('Distribusi Panjang Teks Ulasan')
plt.xlabel('Jumlah Karakter')
plt.ylabel('Frekuensi')
plt.show()


In [None]:
from wordcloud import WordCloud

# Concatenate every non-missing review into one space-separated string
text_all = ' '.join(df['text'].dropna())

# Build an 800x400 word cloud on a white background from the combined text
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white'
).generate(text_all)

# Render the word cloud as an image with the axes hidden
plt.figure(figsize=(12,6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('WordCloud Ulasan Resep Masakan')
plt.show()


### 5. Data Preparation

In [None]:
#@title Data Cleaning

# Keep only the rows whose review text is present
df = df[df['text'].notna()]

print("Jumlah data setelah cleaning:", len(df.index))


In [None]:
# Drop the leftover CSV index column. errors='ignore' keeps this cell
# idempotent: re-running it (or loading a file without that column) no longer
# raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')


In [None]:
#@title Labeling Sentimen dari Rating (stars)

# The dataset has no explicit sentiment label, so the star rating is converted
# into a binary sentiment label.

def label_sentiment(star):
    """Map a star rating to a binary sentiment label.

    Returns 1 (positive) when ``star`` is 4 or higher, otherwise 0 (negative).
    Any value that does not compare ``>= 4`` (including NaN) falls into 0.

    NOTE(review): in some versions of this dataset a rating of 0 may mean
    "no rating given" rather than a negative review — confirm before relying
    on the 0 class.
    """
    return 1 if star >= 4 else 0

# Derive the binary sentiment column from the star rating of every row
df['sentiment'] = df['stars'].apply(label_sentiment)


In [None]:
#@title Text Preprocessing

import re
import string

def clean_text(text):
    """Normalize a review string for modeling.

    Steps, in order: lowercase, delete digit runs, delete ASCII punctuation
    (characters are removed, not replaced by spaces), then collapse every run
    of whitespace into a single space and trim both ends.
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punct = without_digits.translate(punctuation_table)
    return re.sub(r'\s+', ' ', without_punct).strip()

# Cast every review to str, then store its normalized form in a new column
df['clean_text'] = df['text'].astype(str).apply(clean_text)



In [None]:
#@title Data Splitting
from sklearn.model_selection import train_test_split

# Features are the cleaned review texts; the target is the binary sentiment label
X = df['clean_text']
y = df['sentiment']

# 80/20 train/test split, stratified on the label so both parts keep the same
# class ratio; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


In [None]:
#@title Feature Extraction – TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF limited to 10,000 features; terms appearing in fewer
# than 5 documents or in more than 90% of documents are discarded.
tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1,2),
    min_df=5,
    max_df=0.9
)

# Fit the vocabulary on the training texts only, then reuse it for the test
# texts so no information from the test set leaks into the features.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [None]:
#@title Tokenization & Padding (Untuk LSTM)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Vocabulary cap for the tokenizer and fixed sequence length for the LSTM input
max_words = 8000
max_len = 80

# Word index is learned from the training texts only; words outside the
# vocabulary are mapped to the "<OOV>" token.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convert texts to integer sequences and pad them at the end to max_len
X_train_seq = pad_sequences(
    tokenizer.texts_to_sequences(X_train),
    maxlen=max_len,
    padding='post'
)

X_test_seq = pad_sequences(
    tokenizer.texts_to_sequences(X_test),
    maxlen=max_len,
    padding='post'
)

# Vocabulary size: the cap, or the number of known words + 1 if that is smaller
vocab_size = min(max_words, len(tokenizer.word_index) + 1)


In [None]:
#@title Class Weighting (Imbalanced Data)

from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# 'balanced' weights are computed from the label frequencies in the training
# split, so the less frequent class receives the larger weight.
classes = np.unique(y_train)
weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train
)

# Map each class label to its weight and display the mapping as the cell output
class_weights = dict(zip(classes, weights))
class_weights


### 6. Modeling

#### **6.1 Model 1 – Baseline Model (Logistic Regression)**

**Hyperparameter**

- C: 2.0
- solver: lbfgs (default)
- max_iter: 2000
- class_weight: balanced
- random_state: 42

In [None]:
#@title Implementasi
from sklearn.linear_model import LogisticRegression

# Baseline classifier on the TF-IDF features; class_weight='balanced'
# re-weights the classes to compensate for the label imbalance.
logreg_model = LogisticRegression(
    C=2.0,
    max_iter=2000,
    class_weight='balanced',
    random_state=42
)

# Train on the training features, then predict labels for the held-out test set
logreg_model.fit(X_train_tfidf, y_train)
y_pred_logreg = logreg_model.predict(X_test_tfidf)


#### **6.2 Model 2 – Advanced Machine Learning Model (Random Forest)**

**Hyperparameter**
- n_estimators: 300
- max_depth: 25
- min_samples_split: 10
- class_weight: balanced
- random_state: 42
- n_jobs: -1


In [None]:
#@title Implementasi

from sklearn.ensemble import RandomForestClassifier

# Random forest on the same TF-IDF features as the baseline; n_jobs=-1 uses
# all available CPU cores for fitting and prediction.
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=25,
    min_samples_split=10,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

# Train on the training features, then predict labels for the held-out test set
rf_model.fit(X_train_tfidf, y_train)
y_pred_rf = rf_model.predict(X_test_tfidf)


#### **6.3 Model 3 – Deep Learning Model (LSTM)**

**Arsitektur Model**
1. Embedding Layer (vocab_size, 128)
2. Bidirectional LSTM Layer (64 units per arah, return_sequences=False)
3. Dropout (0.4)
4. Dense Layer (32 units, ReLU)
5. Dropout (0.3)
6. Output Layer (1 unit, Sigmoid)


**Input & Preprocessing**
- Input shape: (sequence_length,)
- Preprocessing khusus:
  - Tokenization
  - Padding sequences
  - Label encoding target

In [None]:
#@title Implementasi

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam


# The tokenizer only emits indices below its own `num_words` cap, so the
# embedding table is sized from that cap instead of an unrelated hard-coded
# limit (which would allocate rows that can never be looked up).
vocab_size = min(tokenizer.num_words, len(tokenizer.word_index) + 1)
# Sequence length is taken from the already padded training matrix
max_len = X_train_seq.shape[1]

# Embedding -> bidirectional LSTM -> small dense head with a single sigmoid
# unit for the binary sentiment label. The input shape is declared with an
# explicit Input layer; the `input_length` argument of Embedding is deprecated
# in current Keras releases.
lstm_model = Sequential([
    Input(shape=(max_len,)),
    Embedding(
        input_dim=vocab_size,
        output_dim=128
    ),
    Bidirectional(LSTM(64)),
    Dropout(0.4),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])


lstm_model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy']
)


# Stop once validation loss has not improved for 3 epochs and roll back to the
# best weights seen so far.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

# 20% of the training data is held out for validation; class_weight applies
# the balanced class weights computed earlier to the loss.
history = lstm_model.fit(
    X_train_seq,
    y_train,
    validation_split=0.2,
    epochs=15,
    batch_size=64,
    class_weight=class_weights,
    callbacks=[early_stopping],
    verbose=1
)



In [None]:
#@title Model Summary

# Print the layer-by-layer structure and parameter counts of the LSTM model
lstm_model.summary()


### 7. Evaluation

In [None]:
#@title Import
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)

### Model 1 – Logistic Regression

In [None]:
#@title Metrik Evaluasi

# Test-set scores for the logistic regression baseline; precision, recall and
# F1 are averaged over classes weighted by class support.
acc_lr = accuracy_score(y_test, y_pred_logreg)
prec_lr = precision_score(y_test, y_pred_logreg, average='weighted')
rec_lr = recall_score(y_test, y_pred_logreg, average='weighted')
f1_lr = f1_score(y_test, y_pred_logreg, average='weighted')

print(
    "=== Logistic Regression ===",
    f"Accuracy  : {acc_lr:.4f}",
    f"Precision : {prec_lr:.4f}",
    f"Recall    : {rec_lr:.4f}",
    f"F1-Score  : {f1_lr:.4f}",
    sep="\n",
)


In [None]:
#@title Confusion Matrix

# Rows are the actual labels, columns the predicted labels
cm_lr = confusion_matrix(y_test, y_pred_logreg)

plt.figure(figsize=(5,4))
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Blues').set(
    title="Confusion Matrix - Logistic Regression",
    xlabel="Predicted",
    ylabel="Actual",
)
plt.show()


In [None]:
#@title Classification Report

# Per-class precision, recall, F1 and support for the logistic regression model
print(classification_report(y_test, y_pred_logreg))


## Model 2 – Random Forest

In [None]:
#@title Metrik Evaluasi

# Test-set scores for the random forest; precision, recall and F1 are averaged
# over classes weighted by class support.
acc_rf = accuracy_score(y_test, y_pred_rf)
prec_rf = precision_score(y_test, y_pred_rf, average='weighted')
rec_rf = recall_score(y_test, y_pred_rf, average='weighted')
f1_rf = f1_score(y_test, y_pred_rf, average='weighted')

print(
    "=== Random Forest ===",
    f"Accuracy  : {acc_rf:.4f}",
    f"Precision : {prec_rf:.4f}",
    f"Recall    : {rec_rf:.4f}",
    f"F1-Score  : {f1_rf:.4f}",
    sep="\n",
)


In [None]:
#@title Confusion Matrix
# Rows are the actual labels, columns the predicted labels
cm_rf = confusion_matrix(y_test, y_pred_rf)

plt.figure(figsize=(5,4))
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Greens').set(
    title="Confusion Matrix - Random Forest",
    xlabel="Predicted",
    ylabel="Actual",
)
plt.show()


In [None]:
#@title Classification Report
# Per-class precision, recall, F1 and support for the random forest model
print(classification_report(y_test, y_pred_rf))


## Model 3 – LSTM

In [None]:
# Predicted probabilities from the sigmoid output of the LSTM model
y_pred_lstm_prob = lstm_model.predict(X_test_seq)
# Flatten to 1-D, then threshold at 0.5 to obtain hard 0/1 labels
y_pred_lstm = (y_pred_lstm_prob.ravel() > 0.5).astype(int)

In [None]:
#@title Metrik Evaluasi

# Test-set scores for the LSTM; precision, recall and F1 are averaged over
# classes weighted by class support.
acc_lstm = accuracy_score(y_test, y_pred_lstm)
prec_lstm = precision_score(y_test, y_pred_lstm, average='weighted')
rec_lstm = recall_score(y_test, y_pred_lstm, average='weighted')
f1_lstm = f1_score(y_test, y_pred_lstm, average='weighted')

print(
    "=== LSTM ===",
    f"Accuracy  : {acc_lstm:.4f}",
    f"Precision : {prec_lstm:.4f}",
    f"Recall    : {rec_lstm:.4f}",
    f"F1-Score  : {f1_lstm:.4f}",
    sep="\n",
)

In [None]:
#@title Confusion Matrix
# Rows are the actual labels, columns the predicted labels
cm_lstm = confusion_matrix(y_test, y_pred_lstm)

plt.figure(figsize=(5,4))
sns.heatmap(cm_lstm, annot=True, fmt='d', cmap='Oranges').set(
    title="Confusion Matrix - LSTM",
    xlabel="Predicted",
    ylabel="Actual",
)
plt.show()


In [None]:
#@title Classification Report
# Per-class precision, recall, F1 and support for the LSTM model
print(classification_report(y_test, y_pred_lstm))


# Plot Training History LSTM

In [None]:
#@title Loss
# Training and validation loss per epoch, taken from the Keras fit history
plt.figure(figsize=(6,4))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.gca().set(
    title='Training vs Validation Loss (LSTM)',
    xlabel='Epoch',
    ylabel='Loss',
)
plt.legend()
plt.show()


In [None]:
#@title Accuracy
# Training and validation accuracy per epoch, taken from the Keras fit history
plt.figure(figsize=(6,4))
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.gca().set(
    title='Training vs Validation Accuracy (LSTM)',
    xlabel='Epoch',
    ylabel='Accuracy',
)
plt.legend()
plt.show()


In [None]:
#@title Tabel Perbandingan Model (Output Gambar)

import matplotlib.pyplot as plt

# Table data: these metric variables must already exist from the evaluation
# cells above. Each list is ordered the same way as `models`.
models = ["Logistic Regression", "Random Forest", "LSTM"]
accuracy = [acc_lr, acc_rf, acc_lstm]
precision = [prec_lr, prec_rf, prec_lstm]
# The LSTM entry must be its recall; it previously repeated the accuracy value
recall = [rec_lr, rec_rf, rec_lstm]
f1 = [f1_lr, f1_rf, f1_lstm]

# Create a figure with the axes hidden, used only as a canvas for the table
fig, ax = plt.subplots(figsize=(9, 3))
ax.axis('tight')
ax.axis('off')

# One table row per model, each metric formatted to three decimals
table_data = [
    [models[i],
     f"{accuracy[i]:.3f}",
     f"{precision[i]:.3f}",
     f"{recall[i]:.3f}",
     f"{f1[i]:.3f}"]
    for i in range(len(models))
]

columns = ["Model", "Accuracy", "Precision", "Recall", "F1-Score"]

table = ax.table(
    cellText=table_data,
    colLabels=columns,
    loc='center',
    cellLoc='center'
)

# Table styling: taller rows and a fixed font size
table.scale(1, 1.6)
table.auto_set_font_size(False)
table.set_fontsize(11)

plt.title("Perbandingan Performa Model", fontsize=13, pad=10)
plt.show()


# Menyimpan Model

In [None]:
#@title Simpan Model Logistic Regression

import joblib

# Serialize the fitted logistic regression model to the working directory
joblib.dump(logreg_model, "model_baseline.pkl")

print("Logistic Regression berhasil disimpan model_baseline.pkl")


In [None]:
#@title Simpan Model Random Forest

# Serialize the fitted random forest model; relies on `joblib` having been
# imported by the previous cell.
joblib.dump(rf_model, "model_rf.pkl")

print("Random Forest berhasil disimpan model_rf.pkl")


In [None]:
#@title Simpan Model LSTM

# Save the trained LSTM model to a single .keras file in the working directory
lstm_model.save("model_lstm.keras")

print("LSTM berhasil disimpan model_lstm.keras")
