## Install Packages

In [None]:
!wget https://raw.githubusercontent.com/Nuvantim/Maxim_sentiment_model/refs/heads/main/requirements.txt

In [None]:
!pip install --upgrade -r requirements.txt

In [None]:
import pandas as pd
import os
import json
import numpy as np
import emoji
import nltk
import matplotlib.pyplot as plt
import logging
import seaborn as sns
import torch
import torch.nn as nn
import joblib

from sklearn.metrics import confusion_matrix, classification_report
from nltk.tokenize import word_tokenize
from collections import Counter
from gensim.models import FastText
from tqdm import tqdm
from google_play_scraper import Sort,reviews_all
from collections import Counter
from wordcloud import WordCloud
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): no other cell visible in this notebook reads from or writes to
# the mount — confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive')

## Scraping Maxim app reviews from Google Play Store

In [None]:

from google_play_scraper import Sort, reviews_all
import pandas as pd

# Fetch all reviews of the app (requested with lang='id'); this is a network
# call and can take a while.
result = reviews_all(
    'com.taxsee.taxsee',
    lang='id'
)

# Keep only the columns used downstream and give them project-specific names.
df = pd.DataFrame(result)
df = df[['userName','content','score','at']]
df = df.rename(columns={'content':'review','score':'rating'})

df['at'] = pd.to_datetime(df['at'])
start_date = pd.to_datetime('2020-01-01')
end_date = pd.to_datetime('2024-12-31')
# `end_date` parses to 2024-12-31 00:00:00, so `at <= end_date` would drop
# every review posted later on that day. Compare against the start of the
# following day instead so the whole of 31 December is included.
end_exclusive = end_date + pd.Timedelta(days=1)
filtered_df = df[(df['at'] >= start_date) & (df['at'] < end_exclusive)]

filtered_df.to_csv('maxim_gplay.csv',index=False,escapechar='\\')

## Download Data

In [None]:
!wget https://github.com/Nuvantim/Maxim_sentiment_model/raw/refs/heads/main/sentiment_maxim.7z
!7z x sentiment_maxim.7z
!rm *.7z

## Import data

In [None]:
# Load the labelled review CSV into `data`.
# NOTE(review): presumably this file comes out of the sentiment_maxim.7z
# archive extracted in the download step — confirm.
data = pd.read_csv('sentiment_maxim_gplay.csv')

In [None]:
data.head()

In [None]:
data.shape

## Filtering data

### Remove emoji & special characters

In [None]:
# Strip emoji, turn every non-alphanumeric character into a space, then
# normalise whitespace. Without the collapse + strip, a review that consisted
# only of emoji/punctuation ends up as "   " rather than "", is NOT matched by
# `.replace('', np.nan)`, and therefore survives the dropna step that follows.
# Rows that were already NaN stay NaN because the result is re-aligned on the
# original index when assigned back.
data['review'] = (
    data['review']
    .dropna()
    .apply(lambda s: emoji.replace_emoji(str(s), ''))
    .str.replace('[^a-zA-Z0-9]', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
    .replace('', np.nan)
)

### Remove missing values

In [None]:
# Drop rows whose `review` is NaN: either missing in the raw data or turned
# into NaN by the cleaning step because nothing was left of the text.
data = data.dropna(subset=['review'])

In [None]:
data.shape

### Remove entries labeled as "NETRAL"

In [None]:
# Keep every row except those labelled "NETRAL". The explicit copy detaches the
# result from the original frame so that the column assignments in the
# following cells operate on an independent DataFrame instead of a slice
# (which can raise SettingWithCopyWarning).
data = data[data['label'] != 'NETRAL'].copy()

In [None]:
data.shape

### Word tokenizer

In [None]:
# Download the tokenizer resources one by one, then split each review into
# word tokens and store the resulting list in a new `token` column.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

data['token'] = data['review'].map(word_tokenize)

In [None]:
data.head()

### Lowercase tokens

In [None]:

# Narrow the frame to the columns used from here on and lower-case every
# token in each review's token list.
data = data[['review', 'token', 'label','at']].assign(
    token=lambda frame: frame['token'].map(
        lambda words: [word.lower() for word in words]
    )
)

### Backup Data

In [None]:
# Checkpoint the cleaned frame to disk; it is read back in the
# "Split features and labels" cell.
# Note: `token` holds Python lists, so to_csv writes each one as its string
# representation rather than as a list.
data.to_csv("clean_data.csv", index=False)

## Data Exploration

### Number of reviews per year

In [None]:
# Build an exploration frame: parse the timestamp, derive the review year
# ("tahun") and keep only the two sentiment classes of interest.
df = (
    data
    .assign(at=pd.to_datetime(data['at']))
    .assign(tahun=lambda frame: frame['at'].dt.year)
    .loc[lambda frame: frame['label'].isin(['POSITIF', 'NEGATIF'])]
)

# Rows = year, columns = label, values = number of reviews.
count_per_year = (
    df.groupby(['tahun', 'label'])
    .size()
    .unstack(fill_value=0)
)
print(count_per_year)

ax = count_per_year.plot(kind='bar', figsize=(10,6), rot=0)
ax.set_title("Number of reviews per year")
ax.set_xlabel("Year")
ax.set_ylabel("Amount")
ax.legend(title='Label')
plt.show()

### WordCloud

In [None]:
# Flatten the per-review token lists into one list and count occurrences.
all_tokens = [
    token
    for tokens in tqdm(df['token'], desc="Merging tokens")
    for token in tokens
]
word_freq = Counter(all_tokens)

# Render the frequency table as a word cloud.
wc = WordCloud(width=800, height=400, background_color='white', colormap='viridis')
wc = wc.generate_from_frequencies(word_freq)

fig, ax = plt.subplots(figsize=(12,6))
ax.imshow(wc, interpolation="bilinear")
ax.axis("off")
ax.set_title("WordCloud of Reviews", fontsize=16)
plt.show()

## Build FastText Model

In [None]:
!mkdir models

In [None]:
# Surface gensim's training progress through the standard logging module.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Hyper-parameters of the FastText run, grouped so they are easy to tweak.
fasttext_params = {
    'vector_size': 500,
    'window': 5,
    'min_count': 5,
    'sg': 1,
    'epochs': 20,
}

# Train on the lower-cased token lists produced by the filtering section.
model = FastText(sentences=data['token'], **fasttext_params)

In [None]:
# Persist the trained FastText model; the "Convert reviews to embeddings"
# step loads it back from this path.
model.save("models/maxim_fasttext.model")

## Preprocessing Data

### Split features and labels

In [None]:

# Reload the cleaned checkpoint. keep_default_na=False stops pandas from
# parsing review texts such as "NA", "null" or "nan" as missing values; with
# the default behaviour they become NaN and `astype(str)` below would turn
# them into the literal string "nan".
data = pd.read_csv('clean_data.csv', keep_default_na=False)
data['review'] = data['review'].str.lower()

# Features are the raw (lower-cased) review strings, targets the sentiment labels.
X = data['review'].astype(str)
y = data['label']
# Stratified 80/20 split with a fixed seed so class proportions match in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

### Convert reviews to embeddings

In [None]:
# Reload the FastText model saved earlier under models/.
ft_model = FastText.load("models/maxim_fasttext.model")

In [None]:

def review_to_vec(tokens, model):
    """Average the embedding vectors of a review's tokens.

    Args:
        tokens: iterable of word strings.
        model: object exposing `wv` (supports `word in wv` and `wv[word]`)
            and `vector_size`.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        `model.wv`, or a zero vector of length `model.vector_size` when none
        of the tokens is found (including when `tokens` is empty).
    """
    embeddings = model.wv
    found = []
    for word in tokens:
        if word in embeddings:
            found.append(embeddings[word])

    if len(found) == 0:
        return np.zeros(model.vector_size)
    return np.mean(found, axis=0)

def _embed_reviews(texts):
    """Lower-case and whitespace-split each review, then average-embed it with `ft_model`."""
    return np.array([review_to_vec(text.lower().split(), ft_model) for text in texts])


# One row per review, one column per embedding dimension.
X_train_vec = _embed_reviews(X_train)
X_test_vec = _embed_reviews(X_test)

### Balance training data

In [None]:

# Step 1: randomly under-sample the training embeddings (sampling_strategy=0.7).
under_sampler = RandomUnderSampler(sampling_strategy=0.7, random_state=42)
X_under, y_under = under_sampler.fit_resample(X_train_vec, y_train)

# Step 2: use SMOTE to bring every class up to the size of the largest
# class that remains after under-sampling.
class_counts = Counter(y_under)
target_n = class_counts.most_common(1)[0][1]
smote = SMOTE(
    sampling_strategy={label: target_n for label in class_counts},
    random_state=42,
)
X_res, y_res = smote.fit_resample(X_under, y_under)

### Encode labels to integers

In [None]:
# Learn the label -> integer mapping from the resampled training labels,
# then apply that same mapping to both the training and the test labels.
le = LabelEncoder().fit(y_res)
y_res_enc = le.transform(y_res)
y_test_enc = le.transform(y_test)

## Build GRU Model

### Convert Data to PyTorch Tensors

In [None]:
# Features become float32 tensors, encoded labels become int64 ("long") tensors.
X_train_tensor, y_train_tensor = (
    torch.tensor(X_res, dtype=torch.float32),
    torch.tensor(y_res_enc, dtype=torch.long),
)
X_test_tensor, y_test_tensor = (
    torch.tensor(X_test_vec, dtype=torch.float32),
    torch.tensor(y_test_enc, dtype=torch.long),
)

# Mini-batches of 32 resampled training examples, reshuffled on every pass.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

### Define GRU classifier

In [None]:
class GRUClassifier(nn.Module):
    """Bidirectional multi-layer GRU followed by a two-layer classification head.

    Args:
        input_size: number of features per time step.
        hidden_size: hidden units per GRU direction.
        num_layers: number of stacked GRU layers.
        num_classes: number of output logits.
        dropout: dropout between GRU layers (only applied when
            ``num_layers > 1``, since nn.GRU has no inter-layer dropout
            for a single layer).
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout=0.2):
        super().__init__()
        self.gru = nn.GRU(
            input_size, hidden_size, num_layers,
            batch_first=True, dropout=dropout if num_layers > 1 else 0.0,
            bidirectional=True
        )
        # Forward and backward states are concatenated, hence hidden_size * 2.
        self.fc1 = nn.Linear(hidden_size * 2, 64)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(64, num_classes)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes).

        Args:
            x: either (batch, input_size), treated as a length-1 sequence,
               or (batch, seq_len, input_size).
        """
        if x.dim() == 2:
            x = x.unsqueeze(1)
        _, h_n = self.gru(x)
        # h_n is (num_layers * 2, batch, hidden_size); its last two entries are
        # the top layer's final forward and final backward states. Slicing the
        # output sequence at the last time step instead would pick the backward
        # direction after it has processed only ONE token. For length-1
        # sequences both formulations give the same tensor.
        out = torch.cat((h_n[-2], h_n[-1]), dim=1)
        out = self.relu(self.fc1(out))
        out = self.dropout(out)
        out = self.fc2(out)
        return out



# Model hyper-parameters (the same names are written to config.json on save).
input_size = X_train_vec.shape[1]   # width of one review embedding
hidden_size, num_layers = 64, 3
num_classes = len(le.classes_)

# Build the classifier and place it on the CPU.
device = torch.device('cpu')
model = GRUClassifier(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    num_classes=num_classes,
    dropout=0.2,
).to(device)

### Define Loss Function and Optimizer

In [None]:

# Cross-entropy over the class logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Multiply the learning rate by 0.5 when the value passed to
# scheduler.step() stops decreasing (patience of 2).
scheduler_options = {'mode': 'min', 'factor': 0.5, 'patience': 2}
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, **scheduler_options)

### Train GRU model

In [None]:
def _run_training_epoch():
    """Do one optimisation pass over `train_loader` and return the summed batch loss."""
    model.train()
    running_loss = 0
    for features, targets in train_loader:
        features, targets = features.to(device), targets.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(features), targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss


num_epochs = 20
train_losses = []
for epoch in range(num_epochs):
    total_loss = _run_training_epoch()

    # Mean loss per batch; recorded for the loss plot and fed to the scheduler.
    avg_loss = total_loss / len(train_loader)
    train_losses.append(avg_loss)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}")
    scheduler.step(avg_loss)

## Evaluation

### Confusion Matrix

In [None]:
# Predict the test set in a single forward pass with gradients disabled.
model.eval()
with torch.no_grad():
    X_test_tensor = X_test_tensor.to(device)
    outputs = model(X_test_tensor)
    y_pred = outputs.argmax(dim=1).cpu().numpy()

class_names = le.classes_

# Confusion matrix (rows = true label, columns = predicted label).
cm = confusion_matrix(y_test_enc, y_pred)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm, annot=True, fmt="d", cmap="Blues",
    xticklabels=class_names, yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
plt.show()

# Per-class precision / recall / F1.
print(classification_report(y_test_enc, y_pred, target_names=class_names))

### Plot Training Loss

In [None]:

# Derive the x-axis from the losses actually recorded rather than from
# `num_epochs`: if training was interrupted, the two lengths differ and
# plt.plot would raise a shape-mismatch error.
epochs_run = range(1, len(train_losses) + 1)

plt.figure(figsize=(6, 5))
plt.plot(epochs_run, train_losses, marker='o', label="Training Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training Loss per Epoch")
plt.legend()
plt.grid(True)
plt.show()

### Save Models

In [None]:
# Everything needed to reload the classifier goes into one directory.
save_dir = "./models"
os.makedirs(save_dir, exist_ok=True)

# 1) Network weights.
torch.save(model.state_dict(), f"{save_dir}/pytorch_model.bin")

# 2) Hyper-parameters required to rebuild the architecture.
config_dict = dict(
    model_type="maxim-sentiment-models",
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    num_classes=num_classes,
)
with open(f"{save_dir}/config.json", "w") as config_file:
    config_file.write(json.dumps(config_dict, indent=2))

# 3) Fitted label encoder (maps class ids back to label strings).
joblib.dump(le, f"{save_dir}/label_encoder.pkl")

print(f"Model saved in {save_dir}")

In [None]:
!ls -lh models