In [2]:
!pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [9]:
import re
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from sklearn.preprocessing import FunctionTransformer
import pandas as pd
import requests

# ML Pipeline

## Data loader

In [10]:
DATA_PATH = '/content/df_labeled_cleaned_final.csv'
GET_DATABASE_URL = "https://api.fadilfauzan.com/api/scraper"

def load_data_from_csv(path):
  df = pd.read_csv(path)

  return df

def load_data_from_db(url, key, idx_range):
    full_df = pd.DataFrame()
    for idx in range(0, idx_range):
        json_data = {
            "key": f'{key}_{idx}'
        }
        response = requests.get(url, json=json_data)

        if response.status_code == 200:
            data = response.json()
            df = pd.DataFrame(data)
            full_df = full_df.append(df, ignore_index=True)
        else:
            print(f"Failed to retrieve data for index {idx}")

    return full_df

## Pre-processor

In [28]:
stop_words = set(stopwords.words('indonesian'))
stemmer = StemmerFactory().create_stemmer()
non_alpha_regex = re.compile(r'[^a-zA-Z\s]')
pattern = {'gram', 'ml', 'kg', 'gr', 'pcs', 'ltr', 'liter', 'oz', 'lb', 'cc', 'pack', 'box', 'bottle', 'jar', 'can', 'piece', 'slice'}

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens if not non_alpha_regex.search(token)]
    tokens = [stemmer.stem(token) for token in tokens if token not in stop_words and token not in pattern and not token.isdigit()]
    return ' '.join(tokens)

def preprocess_text_series(series):
    return series.apply(preprocess_text)

In [29]:
TEXT = "title"
LABEL = "label"

In [12]:
df = load_data_from_csv(DATA_PATH)

In [13]:
df

Unnamed: 0,title,label,link,preprocessed_text
0,️ LACTOGEN PREMATUR BBLR 400 GRAM,Legal,https://shopee.co.id/️-LACTOGEN-PREMATUR-BBLR-...,lactogen prematur bblr gram
1,‼️CUCI GUDANG‼️ BATITA 1+ 900GRAM | TANPA DUS ...,Legal,https://shopee.co.id/‼️CUCI-GUDANG‼️-BATITA-1-...,cuci gudang batita gram dus madu susu formula ...
2,‼️CUCI GUDANG‼️ BATITA 1+ 900GRAM | VANILA | 1...,Legal,https://shopee.co.id/‼️CUCI-GUDANG‼️-BATITA-1-...,cuci gudang batita gram vanila susu formula tu...
3,‼️CUCI GUDANG‼️ DATITA 3+ 900GRAM | TANPA DUS ...,Legal,https://shopee.co.id/‼️CUCI-GUDANG‼️-DATITA-3-...,cuci gudang datita gram dus madu susu formula ...
4,‼️CUCI GUDANG‼️ DATITA 3+ 900GRAM | VANILA | 3...,Legal,https://shopee.co.id/‼️CUCI-GUDANG‼️-DATITA-3-...,cuci gudang datita gram vanila susu formula tu...
...,...,...,...,...
2754,Ternak Syams - Premium Susu Kambing Etawa Kolo...,Ilegal,https://shopee.co.id/Ternak-Syams-Premium-Susu...,ternak syams premium susu kambing etawa kolost...
2755,Totole kaldu jamur 200 gram,Legal,https://shopee.co.id/Totole-kaldu-jamur-200-gr...,totole kaldu jamur gram
2756,Trieste Italian Syrup 650mL - Coffee Syrup : C...,Ilegal,https://shopee.co.id/Trieste-Italian-Syrup-650...,trieste italian syrup ml coffee syrup caramel ...
2757,Wilmond Mandarin Orange in Syrup 312gr - Buah ...,Legal,https://shopee.co.id/Wilmond-Mandarin-Orange-i...,wilmond mandarin orange in syrup gr buah jeruk...


In [30]:
X_train, X_valid, train_y, valid_y = train_test_split(df[TEXT], df[LABEL], random_state=42, stratify=df[LABEL], test_size=0.2)

In [31]:
label_mapping = {'Legal': 0, 'Ilegal': 1}
y_train = [label_mapping[label] for label in train_y]
y_valid = [label_mapping[label] for label in valid_y]

In [32]:
X_train.shape

(2207,)

In [33]:
X_train

315      Arak Masak Merah AT Cap Lonceng/Angciu AT 600 ml
2236            Snack Ori Branded TERMURAH!!! Jetz 250 gr
992                             GROOVYROOTBEER CAN 330 ML
874     Enfamil HMF Human Milk Fortifier BOX (Suplemen...
637     Cereal Nutriflakes - Original Sereal Umbi Garu...
                              ...                        
1443    Minuman Tradisional Kahs Bali kemasan 600ml Ca...
390                                  Batavia Whisky 350ml
678     Coco Macaron Sandwich biscuit impor coco macar...
907     FLIMTY BOX / ALL VARIAN /  READY BOX & SACHET ...
1924    S-26 HMF GOLD (S26 Human Milk Fortifier) Pelen...
Name: title, Length: 2207, dtype: object

## Training Pipeline

In [54]:
preprocess_transformer = FunctionTransformer(preprocess_text_series, validate=False)

In [55]:
pipeline = Pipeline([
    ('preprocess', preprocess_transformer),
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])

In [56]:
pipeline.fit(X_train, y_train)

predictions = pipeline.predict(X_valid)
print(classification_report(y_valid, predictions))

              precision    recall  f1-score   support

           0       0.86      0.81      0.83       261
           1       0.84      0.88      0.86       291

    accuracy                           0.85       552
   macro avg       0.85      0.84      0.84       552
weighted avg       0.85      0.85      0.85       552



# Use pipeline for prediction

In [57]:
pipeline.named_steps['preprocess']

In [58]:
pr = pipeline.named_steps['preprocess'].transform(df['title'])

In [59]:
pr

0                                  lactogen prematur bblr
1          batita dus madu susu formula tumbuh anak bubuk
2            batita vanila susu formula tumbuh anak bubuk
3          datita dus madu susu formula tumbuh anak bubuk
4            datita vanila susu formula tumbuh anak bubuk
                              ...                        
2754    ternak syams premium susu kambing etawa kolost...
2755                                   totole kaldu jamur
2756    trieste italian syrup coffee syrup caramel van...
2757    wilmond mandarin orange in syrup buah jeruk ma...
2758                                yomas kornet ayam isi
Name: title, Length: 2759, dtype: object

In [60]:
food_titles_indonesia = [
    "Beras Organik",
    "Minyak Kelapa Murni",
    "Ikan Tongkol Segar",
    "Dendeng Sapi Kualitas Premium",
    "Pilihan Keju Lokal",
    "Truffle Cokelat Handcrafted",
    "Telur Ayam Kampung",
    "Sayuran Organik",
    "Bumbu Nasi Goreng Instan",
    "Garam Laut Himalaya",
    "Madu Murni",
    "Jamur Organik Pilihan",
    "Roti Gandum Organik",
    "Telur Ayam Organik",
    "Jus Hijau Segar",
    "Protein Nabati",
    "Granola Berbiji",
    "Madu Lokal Pilihan",
    "Sirup Mapel Kualitas Terbaik",
    "Pasta Artisanal"
]

# Creating the DataFrame
df_dummy = pd.DataFrame({"title": food_titles_indonesia})

In [61]:
df_dummy.head()

Unnamed: 0,title
0,Beras Organik
1,Minyak Kelapa Murni
2,Ikan Tongkol Segar
3,Dendeng Sapi Kualitas Premium
4,Pilihan Keju Lokal


In [62]:
pr = pipeline.named_steps['preprocess'].transform(df_dummy['title'])

In [63]:
pr

0                     beras organik
1               minyak kelapa murni
2                ikan tongkol segar
3     dendeng sapi kualitas premium
4                  pilih keju lokal
5       truffle cokelat handcrafted
6                telur ayam kampung
7                     sayur organik
8          bumbu nasi goreng instan
9               garam laut himalaya
10                       madu murni
11              jamur organik pilih
12              roti gandum organik
13               telur ayam organik
14                  jus hijau segar
15                   protein nabati
16                     granola biji
17                 madu lokal pilih
18        sirup mapel kualitas baik
19                  pasta artisanal
Name: title, dtype: object

## Main predict funtion

In [64]:
def prediction(df):
  df['label_pred'] = pipeline.predict(df['title'])
  return df

In [65]:
df = load_data_from_csv(DATA_PATH)
df = prediction(df)
df

Unnamed: 0,title,label,link,preprocessed_text,label_pred
0,️ LACTOGEN PREMATUR BBLR 400 GRAM,Legal,https://shopee.co.id/️-LACTOGEN-PREMATUR-BBLR-...,lactogen prematur bblr gram,0
1,‼️CUCI GUDANG‼️ BATITA 1+ 900GRAM | TANPA DUS ...,Legal,https://shopee.co.id/‼️CUCI-GUDANG‼️-BATITA-1-...,cuci gudang batita gram dus madu susu formula ...,0
2,‼️CUCI GUDANG‼️ BATITA 1+ 900GRAM | VANILA | 1...,Legal,https://shopee.co.id/‼️CUCI-GUDANG‼️-BATITA-1-...,cuci gudang batita gram vanila susu formula tu...,0
3,‼️CUCI GUDANG‼️ DATITA 3+ 900GRAM | TANPA DUS ...,Legal,https://shopee.co.id/‼️CUCI-GUDANG‼️-DATITA-3-...,cuci gudang datita gram dus madu susu formula ...,0
4,‼️CUCI GUDANG‼️ DATITA 3+ 900GRAM | VANILA | 3...,Legal,https://shopee.co.id/‼️CUCI-GUDANG‼️-DATITA-3-...,cuci gudang datita gram vanila susu formula tu...,0
...,...,...,...,...,...
2754,Ternak Syams - Premium Susu Kambing Etawa Kolo...,Ilegal,https://shopee.co.id/Ternak-Syams-Premium-Susu...,ternak syams premium susu kambing etawa kolost...,1
2755,Totole kaldu jamur 200 gram,Legal,https://shopee.co.id/Totole-kaldu-jamur-200-gr...,totole kaldu jamur gram,0
2756,Trieste Italian Syrup 650mL - Coffee Syrup : C...,Ilegal,https://shopee.co.id/Trieste-Italian-Syrup-650...,trieste italian syrup ml coffee syrup caramel ...,1
2757,Wilmond Mandarin Orange in Syrup 312gr - Buah ...,Legal,https://shopee.co.id/Wilmond-Mandarin-Orange-i...,wilmond mandarin orange in syrup gr buah jeruk...,0


In [66]:
df['label'].value_counts()

label
Ilegal    1452
Legal     1307
Name: count, dtype: int64

In [67]:
pipeline.steps

[('preprocess',
  FunctionTransformer(func=<function preprocess_text_series at 0x79083ab58f70>)),
 ('tfidf', TfidfVectorizer()),
 ('rf', RandomForestClassifier())]

In [68]:
import pickle

In [69]:
for step_name, step_obj in pipeline.steps:
    with open(f'{step_name}.pkl', 'wb') as f:
        pickle.dump(step_obj, f)

In [70]:
from joblib import dump, load
dump(pipeline, 'pipeline.joblib')
loaded_pipeline = load('pipeline.joblib')

In [71]:
loaded_pipeline

In [72]:
pr = loaded_pipeline.named_steps['preprocess'].transform(df_dummy['title'])

In [73]:
pr

0                     beras organik
1               minyak kelapa murni
2                ikan tongkol segar
3     dendeng sapi kualitas premium
4                  pilih keju lokal
5       truffle cokelat handcrafted
6                telur ayam kampung
7                     sayur organik
8          bumbu nasi goreng instan
9               garam laut himalaya
10                       madu murni
11              jamur organik pilih
12              roti gandum organik
13               telur ayam organik
14                  jus hijau segar
15                   protein nabati
16                     granola biji
17                 madu lokal pilih
18        sirup mapel kualitas baik
19                  pasta artisanal
Name: title, dtype: object