# data collection

In [1]:
import pandas as pd

In [213]:
train = pd.read_csv('Data_latih.csv')
test = pd.read_csv('Data_uji.csv')

In [214]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4231 entries, 0 to 4230
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ID                4231 non-null   int64 
 1   label             4231 non-null   int64 
 2   tanggal           4231 non-null   object
 3   judul             4231 non-null   object
 4   narasi            4231 non-null   object
 5   nama file gambar  4231 non-null   object
dtypes: int64(2), object(4)
memory usage: 198.5+ KB


# data preprocessing

In [215]:
train = train.drop(['ID', 'tanggal', 'nama file gambar', 'judul'], axis = 1)

In [216]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4231 entries, 0 to 4230
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   4231 non-null   int64 
 1   narasi  4231 non-null   object
dtypes: int64(1), object(1)
memory usage: 66.2+ KB


In [217]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470 entries, 0 to 469
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ID                470 non-null    int64 
 1   tanggal           470 non-null    object
 2   judul             470 non-null    object
 3   narasi            470 non-null    object
 4   nama file gambar  470 non-null    object
dtypes: int64(1), object(4)
memory usage: 18.5+ KB


In [218]:
test = test.drop(['ID', 'tanggal', 'nama file gambar', 'judul'], axis = 1)

In [219]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470 entries, 0 to 469
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   narasi  470 non-null    object
dtypes: object(1)
memory usage: 3.8+ KB


## punct stopwords symbols lowercase stemming (data cleaning)

In [None]:
train.head(4)

Unnamed: 0,label,narasi
0,1,A caller to a radio talk show recently shared ...
1,1,Yth.Seluruh Anggota Grup Sesuai Instruksi Gube...
2,1,Jokowi adalah presiden terbaik dlm sejarah ban...
3,1,"Maaf Mas2 dan Mbak2, ini bukan politik, tapi k..."


In [None]:
# Tokenize

from nltk import word_tokenize

train["narasi"] = train["narasi"].apply(word_tokenize)
train.head(4)

Unnamed: 0,label,narasi
0,1,"[A, caller, to, a, radio, talk, show, recently..."
1,1,"[Yth.Seluruh, Anggota, Grup, Sesuai, Instruksi..."
2,1,"[Jokowi, adalah, presiden, terbaik, dlm, sejar..."
3,1,"[Maaf, Mas2, dan, Mbak2, ,, ini, bukan, politi..."


In [222]:
# punctuation -> nltk punkt

import string

def remove_punctuation(tokens):
    return [token for token in tokens if token not in string.punctuation]

train["narasi"] = train["narasi"].apply(remove_punctuation)
train.head(4)


Unnamed: 0,label,narasi
0,1,"[A, caller, to, a, radio, talk, show, recently..."
1,1,"[Yth.Seluruh, Anggota, Grup, Sesuai, Instruksi..."
2,1,"[Jokowi, adalah, presiden, terbaik, dlm, sejar..."
3,1,"[Maaf, Mas2, dan, Mbak2, ini, bukan, politik, ..."


In [223]:
!pip install Sastrawi



In [224]:
# stopwords -> https://rahmadya.com/2019/04/24/stopword-berbahasa-indonesia/
import nltk
from nltk.corpus import stopwords
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Download NLTK English stopwords (if not already downloaded)
nltk.download('stopwords')

# Load English stopwords
stopwords_english = set(stopwords.words('english'))

# Load Indonesian stopwords from Sastrawi
factory = StopWordRemoverFactory()
stopwords_indonesia = set(factory.get_stop_words())

# Combine both stopword lists
combined_stopwords = stopwords_english.union(stopwords_indonesia)

# Function to remove stopwords
def remove_stopwords(tokens):
    return [word for word in tokens if word.lower() not in combined_stopwords]

# Apply to DataFrame
train["narasi"] = train["narasi"].apply(remove_stopwords)
train.head(4)  # Check results

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,label,narasi
0,1,"[caller, radio, talk, show, recently, shared, ..."
1,1,"[Yth.Seluruh, Anggota, Grup, Sesuai, Instruksi..."
2,1,"[Jokowi, presiden, terbaik, dlm, sejarah, bang..."
3,1,"[Maaf, Mas2, Mbak2, bukan, politik, kenyataan,..."


In [225]:
# symbols -> isalpha()
def remove_symbols(tokens):
    return [word for word in tokens if word.isalpha()]

train["narasi"] = train["narasi"].apply(remove_symbols)
train.head(4)

Unnamed: 0,label,narasi
0,1,"[caller, radio, talk, show, recently, shared, ..."
1,1,"[Anggota, Grup, Sesuai, Instruksi, Gubernur, J..."
2,1,"[Jokowi, presiden, terbaik, dlm, sejarah, bang..."
3,1,"[Maaf, bukan, politik, kenyataan, Pak, Jokowi,..."


In [226]:
# case folding -> lower()
train["narasi"] = train["narasi"].apply(lambda tokens: [word.lower() for word in tokens])
train.head(4)

Unnamed: 0,label,narasi
0,1,"[caller, radio, talk, show, recently, shared, ..."
1,1,"[anggota, grup, sesuai, instruksi, gubernur, j..."
2,1,"[jokowi, presiden, terbaik, dlm, sejarah, bang..."
3,1,"[maaf, bukan, politik, kenyataan, pak, jokowi,..."


In [227]:
# stemming -> https://pypi.org/project/Sastrawi/
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Initialize the stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()
def stem_tokens(tokens):
    # Join tokens into a sentence
    sentence = ' '.join(tokens)
    # Stem the sentence
    stemmed_sentence = stemmer.stem(sentence)
    # Split back into tokens
    return stemmed_sentence.split()

# Apply to the DataFrame
train["narasi"] = train["narasi"].apply(stem_tokens)
train.head(4)

Unnamed: 0,label,narasi
0,1,"[caller, radio, talk, show, recently, shared, ..."
1,1,"[anggota, grup, sesuai, instruksi, gubernur, j..."
2,1,"[jokowi, presiden, baik, dlm, sejarah, bangsa,..."
3,1,"[maaf, bukan, politik, nyata, pak, jokowi, has..."


In [233]:
train.to_csv('processed_data.csv', index=False)  # No row numbers

In [240]:
#if processed data is lost

train = pd.read_csv('processed_data.csv')


# more data processing

In [241]:
train.head(5)

Unnamed: 0,label,narasi
0,1,"['caller', 'radio', 'talk', 'show', 'recently'..."
1,1,"['anggota', 'grup', 'sesuai', 'instruksi', 'gu..."
2,1,"['jokowi', 'presiden', 'baik', 'dlm', 'sejarah..."
3,1,"['maaf', 'bukan', 'politik', 'nyata', 'pak', '..."
4,1,"['kadrun', 'kalo', 'lihat', 'foto', 'panas', '..."


In [243]:
train["text_processed"] = train["narasi"].apply(lambda x: ' '.join(x))

In [245]:
# learn embedding/word vectorizing + feature extraction, see if one of them isnt needed n stuff karena my paper didnt use "embedding" -> *just learn how to use tf-idf*

from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF
tfidf = TfidfVectorizer(
    max_features=5000,  # Keep top 5000 words (adjust as needed)
    min_df=2,           # Ignore words appearing in <2 docs
    max_df=0.95,        # Ignore words in >95% of docs (remove common words)
    ngram_range=(1, 2)  # Include 1-word and 2-word phrases (e.g., "kota besar")
)

# Fit and transform the text
X_tfidf = tfidf.fit_transform(train["narasi"])



# Convert to DataFrame (optional)
# tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())
# print(tfidf_df.head())

   aamiin  aamiin moga  aamiin yaa  aamiin yra  abadi  abai  abang  abdul  \
0     0.0          0.0         0.0         0.0    0.0   0.0    0.0    0.0   
1     0.0          0.0         0.0         0.0    0.0   0.0    0.0    0.0   
2     0.0          0.0         0.0         0.0    0.0   0.0    0.0    0.0   
3     0.0          0.0         0.0         0.0    0.0   0.0    0.0    0.0   
4     0.0          0.0         0.0         0.0    0.0   0.0    0.0    0.0   

   abdul somad  abdulaziz  ...  zat  zealand  zikir  zina  zionis  \
0          0.0        0.0  ...  0.0      0.0    0.0   0.0     0.0   
1          0.0        0.0  ...  0.0      0.0    0.0   0.0     0.0   
2          0.0        0.0  ...  0.0      0.0    0.0   0.0     0.0   
3          0.0        0.0  ...  0.0      0.0    0.0   0.0     0.0   
4          0.0        0.0  ...  0.0      0.0    0.0   0.0     0.0   

   zionis israel  zon  zona  zona merah  zuckerberg  
0            0.0  0.0   0.0         0.0         0.0  
1            0

In [None]:
# split data for TRAIN
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, train["label"], test_size=0.2, random_state=42
)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 65220 stored elements and shape (4231, 5000)>
  Coords	Values
  (0, 4188)	0.18110264326823733
  (0, 3663)	0.18632314816635256
  (0, 4912)	0.18110264326823733
  (0, 714)	0.2467215486717805
  (0, 795)	0.34646156507093767
  (0, 2426)	0.17323078253546884
  (0, 2481)	0.17683718018553057
  (0, 913)	0.18110264326823733
  (0, 889)	0.35367436037106115
  (0, 2618)	0.5196923476064065
  (0, 792)	0.37264629633270513
  (0, 2496)	0.18110264326823733
  (0, 3659)	0.170106778793739
  (0, 952)	0.170106778793739
  (1, 714)	0.17580537867337556
  (1, 163)	0.17257627429908348
  (1, 1269)	0.20312664834648894
  (1, 4164)	0.18876376246098153
  (1, 1641)	0.23180692360679436
  (1, 1273)	0.17361827642872452
  (1, 1836)	0.17814608374391763
  (1, 4536)	0.1696324791911662
  (1, 1381)	0.15503561792844636
  (1, 3630)	0.19418236369007574
  (1, 4602)	0.18558599197585182
  :	:
  (4230, 2141)	0.10484564671343419
  (4230, 4448)	0.08808045788441254
  (4230, 4521)	

# modelling

https://www.kaggle.com/discussions/questions-and-answers/410405

if you want an easy way out, just compare how multiple models function based on this one dataset

---


also, kan harus pake metrics. bisa pake accuracy recall precision f-1 score yada yada. tapi di kaggle kan ada data_test or whatever, nah itu bisa dipake buat kayak manual testing. kayak use the models to guess if a certain row in the csv itu fake news or not. (if you do this, dont forget to make a function (or smth) to preprocess the single query you put in)

## naive bayes

In [256]:
# Naive_Bayes MultinomialNB
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


from sklearn.metrics import classification_report

# Generate classification report
report = classification_report(y_test, y_pred)

# Print the report
print(report)

              precision    recall  f1-score   support

           0       0.67      0.01      0.03       139
           1       0.84      1.00      0.91       708

    accuracy                           0.84       847
   macro avg       0.75      0.51      0.47       847
weighted avg       0.81      0.84      0.77       847



In [275]:
# Naive_Bayes BernoulliNB
from sklearn.naive_bayes import BernoulliNB

model = BernoulliNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


from sklearn.metrics import classification_report

# Generate classification report
report = classification_report(y_test, y_pred)

# Print the report
print(report)

              precision    recall  f1-score   support

           0       0.31      0.26      0.28       139
           1       0.86      0.89      0.87       708

    accuracy                           0.79       847
   macro avg       0.59      0.57      0.58       847
weighted avg       0.77      0.79      0.78       847



In [None]:
# === Saving Phase (run once) ===
import joblib
joblib.dump(model, 'bernoulli_nb_model.joblib')

# Save TF-IDF vectorizer
joblib.dump(tfidf, 'tfidf_vectorizer.joblib')

# Save preprocessing tools
factory = StemmerFactory()
stemmer = factory.create_stemmer()
joblib.dump(stemmer, 'stemmer.joblib')
joblib.dump(combined_stopwords, 'stopwords.joblib')



['stopwords.joblib']

In [258]:
# Naive_Bayes GaussianNB
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model.fit(X_train.toarray(), y_train)
y_pred = model.predict(X_test.toarray())


from sklearn.metrics import classification_report

# Generate classification report
report = classification_report(y_test, y_pred)

# Print the report
print(report)

              precision    recall  f1-score   support

           0       0.22      0.27      0.25       139
           1       0.85      0.81      0.83       708

    accuracy                           0.72       847
   macro avg       0.54      0.54      0.54       847
weighted avg       0.75      0.72      0.73       847



## svm

In [254]:
# SVM
from sklearn.svm import SVC
from sklearn.metrics import classification_report

svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

print("SVM Classification Report:")
print(classification_report(y_test, y_pred_svm))

SVM Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.04      0.07       139
           1       0.84      1.00      0.91       708

    accuracy                           0.84       847
   macro avg       0.84      0.52      0.49       847
weighted avg       0.84      0.84      0.77       847



## random forest

In [259]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.09      0.15       139
           1       0.85      0.99      0.91       708

    accuracy                           0.84       847
   macro avg       0.70      0.54      0.53       847
weighted avg       0.80      0.84      0.79       847



In [262]:
train['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,3465
0,766
