# Pelatihan Model

## Import library

In [56]:
import pandas as pd
import re
import string
import time
import gensim
import numpy as np

from gensim.models import Word2Vec
from tqdm import tqdm
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from googletrans import Translator
from textblob import TextBlob

# Data Pre-Processing

## df Loading

In [2]:
import pandas as pd

df = pd.read_csv("female_daily_wardah_reviews.csv")

## df Cleaning

In [3]:
# Hapus kolom tidak diperlukan
cols_to_drop = ['product_url', 'brand_name', 'username', 'user_profile_url', 'overall_rating', 'total_reviews', 'page_number']
df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4980 entries, 0 to 4979
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   product_name    4980 non-null   object
 1   user_age        4974 non-null   object
 2   review_date     4980 non-null   object
 3   rating          4980 non-null   int64 
 4   is_recommended  4980 non-null   bool  
 5   review_text     4976 non-null   object
 6   usage_period    4980 non-null   object
 7   purchase_point  4979 non-null   object
dtypes: bool(1), int64(1), object(6)
memory usage: 277.3+ KB


In [4]:
# Cek Null
df.isna().sum()

product_name      0
user_age          6
review_date       0
rating            0
is_recommended    0
review_text       4
usage_period      0
purchase_point    1
dtype: int64

In [5]:
# Cek duplicate
df.duplicated().sum()

4

In [6]:
# hapus null
df = df.dropna()
df.isna().sum()

product_name      0
user_age          0
review_date       0
rating            0
is_recommended    0
review_text       0
usage_period      0
purchase_point    0
dtype: int64

In [7]:
# hapus duplicate
df = df.drop_duplicates()
df.duplicated().sum()

0

In [8]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [9]:
df['cleaned_review'] = df['review_text'].apply(clean_text)

## Translating and Labeling

In [10]:
translator = Translator()
translations = {}

# Fungsi dengan retry
def translate_with_retry(text, retries=3, delay=2):
    for i in range(retries):
        try:
            return translator.translate(text, src='id', dest='en').text
        except Exception as e:
            print(f"[Retry {i+1}] Error translating '{text[:30]}...': {e}")
            time.sleep(delay)
    return text  # fallback jika semua percobaan gagal

# Hilangkan duplikasi untuk mempercepat proses
unique_texts = df['cleaned_review'].unique()

# Progress bar untuk proses translasi
for text in tqdm(unique_texts, desc="Translating"):
    if text not in translations:
        translations[text] = translate_with_retry(text)

# Mapping hasil translate ke DataFrame
df['translated_text'] = df['cleaned_review'].map(translations)
df['translated_text'] = df['translated_text'].str.replace(r'([^0-9A-Za-z \t])|(\w+:\/\/\S+)', ' ', regex=True)
df['translated_text'] = df['translated_text'].str.lower()

df.head()

Translating:  68%|██████▊   | 3345/4931 [1:02:34<37:10,  1.41s/it]

[Retry 1] Error translating 'beli ini garagara pengen cari ...': The read operation timed out


Translating: 100%|██████████| 4931/4931 [1:35:16<00:00,  1.16s/it]  


Unnamed: 0,product_name,user_age,review_date,rating,is_recommended,review_text,usage_period,purchase_point,cleaned_review,translated_text
0,UV Shield Essential Sunscreen Gel SPF 35 PA +++,25 - 29,an hour ago,5,True,"Sunscreen yang punya tekstur dingin, ringan da...",6 months - 1 year,Shopee,sunscreen yang punya tekstur dingin ringan dan...,sunscreen which has a lightweight cold texture...
1,UV Shield Essential Sunscreen Gel SPF 35 PA +++,19 - 24,a day ago,5,True,"Sudah lama pakai sunscreen wardah inii, dari p...",More than 1 year,Shopee,sudah lama pakai sunscreen wardah inii dari pa...,has long used this wardah sunscreen from the o...
2,UV Shield Essential Sunscreen Gel SPF 35 PA +++,19 - 24,a day ago,4,True,"teksturnya tidak lengket, cepat meresap di kul...",3 months - 6 months,Shopee,teksturnya tidak lengket cepat meresap di kuli...,the texture is not sticky quickly absorbing on...
3,UV Shield Essential Sunscreen Gel SPF 35 PA +++,19 - 24,a day ago,5,True,Ga mau berpaling deh dari sunscreen yang satu ...,More than 1 year,Shopee,ga mau berpaling deh dari sunscreen yang satu ...,i don t want to turn away from this sunscreen ...
4,UV Shield Essential Sunscreen Gel SPF 35 PA +++,40 - 44,3 days ago,5,True,Wardah UV Shield Essential Gel Sunscreen SPF 3...,6 months - 1 year,Shopee,wardah uv shield essential gel sunscreen spf a...,wardah uv shield essential gel sunscreen spf i...


In [11]:
def subjektivitas(tr_text):
  return TextBlob(tr_text).sentiment.subjectivity

def polaritas(tr_text):
  return TextBlob(tr_text).sentiment.polarity

def hasilSentimen(nilai):
  if nilai < 0:
    return 'negatif'
  elif nilai == 0:
    return 'netral'
  else:
    return 'positif'

df['subjektivitas'] = df['translated_text'].apply(subjektivitas)
df['polaritas'] = df['translated_text'].apply(polaritas)
df['sentimen'] = df['polaritas'].apply(hasilSentimen)

df.head()

Unnamed: 0,product_name,user_age,review_date,rating,is_recommended,review_text,usage_period,purchase_point,cleaned_review,translated_text,subjektivitas,polaritas,sentimen
0,UV Shield Essential Sunscreen Gel SPF 35 PA +++,25 - 29,an hour ago,5,True,"Sunscreen yang punya tekstur dingin, ringan da...",6 months - 1 year,Shopee,sunscreen yang punya tekstur dingin ringan dan...,sunscreen which has a lightweight cold texture...,0.877778,0.177778,positif
1,UV Shield Essential Sunscreen Gel SPF 35 PA +++,19 - 24,a day ago,5,True,"Sudah lama pakai sunscreen wardah inii, dari p...",More than 1 year,Shopee,sudah lama pakai sunscreen wardah inii dari pa...,has long used this wardah sunscreen from the o...,0.385714,0.178571,positif
2,UV Shield Essential Sunscreen Gel SPF 35 PA +++,19 - 24,a day ago,4,True,"teksturnya tidak lengket, cepat meresap di kul...",3 months - 6 months,Shopee,teksturnya tidak lengket cepat meresap di kuli...,the texture is not sticky quickly absorbing on...,0.727083,0.085417,positif
3,UV Shield Essential Sunscreen Gel SPF 35 PA +++,19 - 24,a day ago,5,True,Ga mau berpaling deh dari sunscreen yang satu ...,More than 1 year,Shopee,ga mau berpaling deh dari sunscreen yang satu ...,i don t want to turn away from this sunscreen ...,0.54,0.18,positif
4,UV Shield Essential Sunscreen Gel SPF 35 PA +++,40 - 44,3 days ago,5,True,Wardah UV Shield Essential Gel Sunscreen SPF 3...,6 months - 1 year,Shopee,wardah uv shield essential gel sunscreen spf a...,wardah uv shield essential gel sunscreen spf i...,0.65,0.25,positif


In [12]:
df['sentimen'].value_counts()

sentimen
positif    4338
negatif     547
netral       80
Name: count, dtype: int64

In [13]:
df.to_csv("data_labeled.csv")

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4965 entries, 0 to 4979
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   product_name     4965 non-null   object 
 1   user_age         4965 non-null   object 
 2   review_date      4965 non-null   object 
 3   rating           4965 non-null   int64  
 4   is_recommended   4965 non-null   bool   
 5   review_text      4965 non-null   object 
 6   usage_period     4965 non-null   object 
 7   purchase_point   4965 non-null   object 
 8   cleaned_review   4965 non-null   object 
 9   translated_text  4965 non-null   object 
 10  subjektivitas    4965 non-null   float64
 11  polaritas        4965 non-null   float64
 12  sentimen         4965 non-null   object 
dtypes: bool(1), float64(2), int64(1), object(9)
memory usage: 509.1+ KB


## Data Extraction

### Dengan TF-IDF

In [16]:
X = df['translated_text']
y = df['sentimen']

In [17]:
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

In [18]:
print(X_tfidf.shape)
print(X_tfidf[0])

(4965, 5000)
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 20 stored elements and shape (1, 5000)>
  Coords	Values
  (0, 4135)	0.18240422568933085
  (0, 4780)	0.1746025891863025
  (0, 2014)	0.1651746306435334
  (0, 2397)	0.2527930028182228
  (0, 1077)	0.2704927345557083
  (0, 4303)	0.11251472705542259
  (0, 303)	0.12866888807156715
  (0, 1548)	0.20420869394832555
  (0, 37)	0.257443105144833
  (0, 2228)	0.24722917954495077
  (0, 4319)	0.050577901998890784
  (0, 3703)	0.08942617037952708
  (0, 1611)	0.34545763138040675
  (0, 4823)	0.13383850490455618
  (0, 1482)	0.3973537689347707
  (0, 3706)	0.41650708921064933
  (0, 1803)	0.07749784059520891
  (0, 669)	0.18257084984838107
  (0, 1692)	0.103256380470363
  (0, 1928)	0.2107053128366566


In [20]:
vectorizer.vocabulary_

{'sunscreen': 4135,
 'which': 4780,
 'has': 2014,
 'lightweight': 2397,
 'cold': 1077,
 'texture': 4303,
 'and': 303,
 'easily': 1548,
 'absorbs': 37,
 'into': 2228,
 'the': 4319,
 'skin': 3703,
 'equipped': 1611,
 'with': 4823,
 'dna': 1482,
 'skinboost': 3706,
 'for': 1803,
 'bright': 669,
 'face': 1692,
 'glowing': 1928,
 'long': 2435,
 'used': 4543,
 'this': 4343,
 'wardah': 4676,
 'from': 1845,
 'old': 2771,
 'packaging': 2843,
 'to': 4378,
 'white': 4786,
 'color': 1086,
 'until': 4519,
 'initial': 2205,
 'upgraded': 4528,
 'cobacobaa': 1058,
 'finally': 1754,
 'settling': 3556,
 'now': 2737,
 'kalii': 2273,
 'is': 2236,
 'good': 1946,
 'hopefully': 2101,
 'better': 521,
 'quality': 3100,
 'not': 2728,
 'sticky': 3981,
 'quickly': 3101,
 'absorbing': 36,
 'on': 2777,
 'does': 1485,
 'cause': 854,
 'whitecasts': 4789,
 'be': 484,
 'flattened': 1778,
 'entire': 1607,
 'surface': 4158,
 'of': 2752,
 'it': 2238,
 'doesn': 1486,
 'make': 2492,
 'too': 4392,
 'dull': 1535,
 'oily': 276

In [21]:
tfidf_df = pd.DataFrame(
    X_tfidf.toarray(), 
    columns=vectorizer.get_feature_names_out()
)

In [22]:
df_full_tfidf = pd.concat([df.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

In [25]:
df_full_tfidf.head()

Unnamed: 0,product_name,user_age,review_date,rating,is_recommended,review_text,usage_period,purchase_point,cleaned_review,translated_text,...,zag,zebra,zig,zit,zits,zombies,zone,zoneini,zonk,zuzur
0,UV Shield Essential Sunscreen Gel SPF 35 PA +++,25 - 29,an hour ago,5,True,"Sunscreen yang punya tekstur dingin, ringan da...",6 months - 1 year,Shopee,sunscreen yang punya tekstur dingin ringan dan...,sunscreen which has a lightweight cold texture...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,UV Shield Essential Sunscreen Gel SPF 35 PA +++,19 - 24,a day ago,5,True,"Sudah lama pakai sunscreen wardah inii, dari p...",More than 1 year,Shopee,sudah lama pakai sunscreen wardah inii dari pa...,has long used this wardah sunscreen from the o...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,UV Shield Essential Sunscreen Gel SPF 35 PA +++,19 - 24,a day ago,4,True,"teksturnya tidak lengket, cepat meresap di kul...",3 months - 6 months,Shopee,teksturnya tidak lengket cepat meresap di kuli...,the texture is not sticky quickly absorbing on...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,UV Shield Essential Sunscreen Gel SPF 35 PA +++,19 - 24,a day ago,5,True,Ga mau berpaling deh dari sunscreen yang satu ...,More than 1 year,Shopee,ga mau berpaling deh dari sunscreen yang satu ...,i don t want to turn away from this sunscreen ...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,UV Shield Essential Sunscreen Gel SPF 35 PA +++,40 - 44,3 days ago,5,True,Wardah UV Shield Essential Gel Sunscreen SPF 3...,6 months - 1 year,Shopee,wardah uv shield essential gel sunscreen spf a...,wardah uv shield essential gel sunscreen spf i...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Using Word2Vec

In [40]:
df['tokens'] = df['translated_text'].apply(lambda x: x.split())

In [41]:
w2v_model = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=2, workers=4)

In [44]:
def document_vector(doc):
    doc = [word for word in doc if word in w2v_model.wv.index_to_key]
    if len(doc) == 0:
        return np.zeros(100)
    return np.mean(w2v_model.wv[doc], axis=0)

X_w2v = df['tokens'].apply(document_vector).to_list()
X_w2v = np.array(X_w2v)

y = df['sentimen']

## Training Model

### Using TF-IDF and SVM

In [28]:
# Split data train-test 80/20
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42, stratify=y)

In [31]:
model = SVC(kernel='linear', class_weight='balanced')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [32]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8539778449144008
              precision    recall  f1-score   support

     negatif       0.42      0.66      0.51       109
      netral       0.64      0.56      0.60        16
     positif       0.95      0.88      0.92       868

    accuracy                           0.85       993
   macro avg       0.67      0.70      0.68       993
weighted avg       0.89      0.85      0.87       993



### Using TF-IDF and Random Forest

In [48]:
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.3, random_state=42, stratify=y
)

In [49]:
model_rf_tf = RandomForestClassifier(random_state=42, class_weight='balanced')
model_rf_tf.fit(X_train, y_train)

In [50]:
y_pred = model_rf_tf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.8785234899328859
              precision    recall  f1-score   support

     negatif       1.00      0.01      0.01       164
      netral       0.80      0.33      0.47        24
     positif       0.88      1.00      0.93      1302

    accuracy                           0.88      1490
   macro avg       0.89      0.45      0.47      1490
weighted avg       0.89      0.88      0.83      1490



### Using Word2Vec and Random Forest

In [51]:
X_train, X_test, y_train, y_test = train_test_split(
    X_w2v, y, test_size=0.2, random_state=42, stratify=y
)

In [52]:
model_rf_w2v = RandomForestClassifier(random_state=42, class_weight='balanced')
model_rf_w2v.fit(X_train, y_train)

In [53]:
y_pred = model_rf_w2v.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.8761329305135952
              precision    recall  f1-score   support

     negatif       0.67      0.02      0.04       109
      netral       1.00      0.06      0.12        16
     positif       0.88      1.00      0.93       868

    accuracy                           0.88       993
   macro avg       0.85      0.36      0.36       993
weighted avg       0.86      0.88      0.82       993



## Inference Model

In [67]:
def translate_text(text, src='id', dest='en'):
    try:
        translated = translator.translate(text, src=src, dest=dest)
        return translated.text
    except Exception as e:
        print(f"Translate error: {e}")
        return text

def preprocess_text(text):
    return text.lower().strip()

def predict_svm_tfidf_translated(text):
    translated = translate_text(text)
    processed = preprocess_text(translated)
    vector = vectorizer.transform([processed])
    return model.predict(vector)[0]

def predict_rf_tfidf_translated(text):
    translated = translate_text(text)
    processed = preprocess_text(translated)
    vector = vectorizer.transform([processed])
    return model_rf_tf.predict(vector)[0]

def predict_rf_w2v_translated(text):
    translated = translate_text(text)
    processed = preprocess_text(translated)
    tokens = processed.split()
    vector = document_vector(tokens).reshape(1, -1)
    return model_rf_w2v.predict(vector)[0]

In [68]:
if __name__ == "__main__":
    test_texts = [
        "Produk ini sangat bagus dan saya suka memakainya.",
        "Tidak sesuai dengan ekspektasi saya, kualitasnya buruk.",
        "Biasa saja, tidak terlalu mengecewakan tapi juga tidak istimewa."
    ]

    print("\n=== Prediksi Sentimen ===")
    for i, text in enumerate(test_texts, 1):
        print(f"\n[Sample {i}] {text}")
        print("SVM + TF-IDF     :", predict_svm_tfidf_translated(text))
        print("RF  + TF-IDF     :", predict_rf_tfidf_translated(text))
        print("RF  + Word2Vec   :", predict_rf_w2v_translated(text))


=== Prediksi Sentimen ===

[Sample 1] Produk ini sangat bagus dan saya suka memakainya.
SVM + TF-IDF     : positif
RF  + TF-IDF     : positif
RF  + Word2Vec   : positif

[Sample 2] Tidak sesuai dengan ekspektasi saya, kualitasnya buruk.
SVM + TF-IDF     : negatif
RF  + TF-IDF     : positif
RF  + Word2Vec   : positif

[Sample 3] Biasa saja, tidak terlalu mengecewakan tapi juga tidak istimewa.
SVM + TF-IDF     : netral
RF  + TF-IDF     : positif
RF  + Word2Vec   : positif
