In [78]:
import pandas as pd

In [79]:
df = pd.read_csv("data.csv")

In [80]:
df = df[['summary','label']]

In [81]:
df.head()

Unnamed: 0,summary,label
0,Anggota Komisi VII DPR RI Rofik Hananto menyay...,1
1,Presiden Joko Widodo telah memerintahkan Wakil...,0
2,Wakil Ketua MPR RI Dr. H. M. Hidayat Nur Wahid...,0
3,Tim Kedokteran dan Kesehatan (Dokkes) Polri te...,0
4,Ketua MPR RI Bambang Soesatyo telah diangkat s...,0


In [82]:
df = df.sample(n=8000, replace=True)
df.shape

(8000, 2)

In [83]:
df.isnull().sum()

summary    5
label      0
dtype: int64

In [84]:
df.dropna(inplace=True)

In [85]:
import re
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

def clean_summary(summary):
    # Convert text to lowercase
    summary = summary.lower()
    
    # Remove special characters and digits
    summary = re.sub(r'[^a-zA-Z\s]', '', summary)
    
    # Remove links
    summary = re.sub(r'http\S+', '', summary)
    
    # Tokenize the text
    words = word_tokenize(summary)
    
    # Remove stopwords
    stop_words = set(stopwords.words('indonesian'))
    filtered_words = [word for word in words if word not in stop_words]
    
    # Initialize Porter Stemmer
    stemmer = PorterStemmer()
    
    # Perform stemming
    stemmed_words = [stemmer.stem(word) for word in filtered_words]
    
    # Join the stemmed words back into a single string
    cleaned_summary = ' '.join(stemmed_words)
    
    return cleaned_summary

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fikri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [86]:
df['clean_summary'] = df['summary'].apply(lambda x: clean_summary(x))

In [87]:
df['label'].value_counts()

0    4147
1    3848
Name: label, dtype: int64

In [88]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report


# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['clean_summary'], df['label'], test_size=0.2, random_state=42)

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [89]:
# Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_tfidf, y_train)

# Predictions
y_pred = rf_classifier.predict(X_test_tfidf)

# Confusion matrix and classification report
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[614 217]
 [309 459]]

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.74      0.70       831
           1       0.68      0.60      0.64       768

    accuracy                           0.67      1599
   macro avg       0.67      0.67      0.67      1599
weighted avg       0.67      0.67      0.67      1599



In [90]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression classifier
lr_classifier = LogisticRegression(random_state=42)
lr_classifier.fit(X_train_tfidf, y_train)

# Predictions
y_pred_lr = lr_classifier.predict(X_test_tfidf)

# Confusion matrix and classification report for logistic regression
conf_matrix_lr = confusion_matrix(y_test, y_pred_lr)
class_report_lr = classification_report(y_test, y_pred_lr)

print("Logistic Regression - Confusion Matrix:")
print(conf_matrix_lr)
print("\nLogistic Regression - Classification Report:")
print(class_report_lr)


Logistic Regression - Confusion Matrix:
[[570 261]
 [350 418]]

Logistic Regression - Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.69      0.65       831
           1       0.62      0.54      0.58       768

    accuracy                           0.62      1599
   macro avg       0.62      0.62      0.61      1599
weighted avg       0.62      0.62      0.62      1599



In [91]:
from sklearn.svm import SVC

# SVM classifier
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train_tfidf, y_train)

# Predictions
y_pred_svm = svm_classifier.predict(X_test_tfidf)

# Confusion matrix and classification report for SVM
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)
class_report_svm = classification_report(y_test, y_pred_svm)

print("SVM - Confusion Matrix:")
print(conf_matrix_svm)
print("\nSVM - Classification Report:")
print(class_report_svm)

SVM - Confusion Matrix:
[[549 282]
 [327 441]]

SVM - Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.66      0.64       831
           1       0.61      0.57      0.59       768

    accuracy                           0.62      1599
   macro avg       0.62      0.62      0.62      1599
weighted avg       0.62      0.62      0.62      1599



In [92]:
def predict_fake_or_real(summary):
    # Clean the input text
    cleaned_summary = clean_summary(summary)
    
    # Transform the cleaned text using the TF-IDF vectorizer
    text_tfidf = tfidf_vectorizer.transform([cleaned_summary])
    
    # Use the trained classifier to predict
    prediction = svm_classifier.predict(text_tfidf)
    
    # Map prediction to label
    if(prediction[0] == 1):
        label = 'fake'
    
    elif(prediction[0] == 0):
        label = 'rill'

    
    return label

# Example usage:
input_text = 'Tim Kedokteran dan Kesehatan (Dokkes) Polri telah menerima 14 kantong jenazah korban kebakaran Depo Plumpang, Jakarta Utara. Tim Dokkes Polri mengirimkan korban luka ke beberapa rumah sakit terdekat guna mendapatkan perawatan lebih lanjut. Tim Labfor dan Inafis'
prediction = predict_fake_or_real(input_text)
print("Prediction:", prediction)

Prediction: rill


In [93]:
# Example usage:
input_text = 'Kim jong un jadi presiden Indonesia'
prediction = predict_fake_or_real(input_text)
print("Prediction:", prediction)    

Prediction: fake
