In [2]:
!pip install ktrain

Collecting ktrain
  Downloading ktrain-0.41.3.tar.gz (25.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.3/25.3 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting langdetect (from ktrain)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m73.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting syntok>1.3.3 (from ktrain)
  Downloading syntok-1.4.4-py3-none-any.whl (24 kB)
Collecting tika (from ktrain)
  Downloading tika-2.6.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting keras_bert>=0.86.0 (from ktrain)
  Downloading keras-bert-0.89.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting whoosh (from ktrain)
  Downloading Whoosh-2.7.4-py2.py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from google.colab import drive
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import ktrain
import joblib
import torch
from sklearn.model_selection import train_test_split
from ktrain import text
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score
from sklearn.utils import resample
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding,BertTokenizer, BertForSequenceClassification

In [3]:
# Mount Google Drive inside the Colab runtime so files under it can be opened.
mount_point = '/content/drive'
drive.mount(mount_point)

# Location of the CSV that the next cell reads into `df`.
file_path = '/content/drive/My Drive/SatriaData/dataset_prediksi_cleaned.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Read the CSV at `file_path` into a DataFrame and show it as the cell output.
df = pd.read_csv(filepath_or_buffer=file_path)
df

Unnamed: 0.1,Unnamed: 0,IDText,text
0,0,TXT0001,lu mau org2 prodemokrasi negara punya sempat b...
1,1,TXT0002,prabowo tanya soal hutang luar negeri jawab hu...
2,2,TXT0003,kiki daliyo ganjar pranowo beliau sosok mengag...
3,3,TXT0004,prabowo gibran laku semua sejahtera rakyat
4,4,TXT0005,justru nyambung junjung elu aomkmkmkmk ngomong...
...,...,...,...
995,995,TXT0996,bikin bangga deh ganjarmahfud mau alokasi teng...
996,996,TXT0997,pak jokowi pilpres 2024 besar hati rangkul pak...
997,997,TXT0998,sbaiknya got nga usah ikut debat dehnga jelas ...
998,998,TXT0999,biasa rembuk musyawarah gaya pimpin ganjar sej...


In [5]:
# Build the working frame without the 'Unnamed: 0' column; `drop` returns a new
# DataFrame, so `df` itself is left unchanged.
df_pred = df.drop(labels=['Unnamed: 0'], axis='columns')

# Per-column count of missing values, shown as the cell output.
df_pred.isna().sum()

IDText    0
text      0
dtype: int64

In [6]:
# Show the working frame (df with the 'Unnamed: 0' column dropped) as the cell output.
df_pred

Unnamed: 0,IDText,text
0,TXT0001,lu mau org2 prodemokrasi negara punya sempat b...
1,TXT0002,prabowo tanya soal hutang luar negeri jawab hu...
2,TXT0003,kiki daliyo ganjar pranowo beliau sosok mengag...
3,TXT0004,prabowo gibran laku semua sejahtera rakyat
4,TXT0005,justru nyambung junjung elu aomkmkmkmk ngomong...
...,...,...
995,TXT0996,bikin bangga deh ganjarmahfud mau alokasi teng...
996,TXT0997,pak jokowi pilpres 2024 besar hati rangkul pak...
997,TXT0998,sbaiknya got nga usah ikut debat dehnga jelas ...
998,TXT0999,biasa rembuk musyawarah gaya pimpin ganjar sej...


In [7]:
# Restore the two scikit-learn classifiers and the fitted vectorizer from Drive.
# NOTE: joblib.load unpickles the files, which can execute arbitrary code, so
# only load artefacts that come from a trusted source.
NBM_loaded, knn_loaded, vectorizer = (
    joblib.load(artifact_path)
    for artifact_path in (
        '/content/drive/My Drive/SatriaData/NB_Multinomial_model.pkl',
        '/content/drive/My Drive/SatriaData/knn_model.pkl',
        '/content/drive/My Drive/SatriaData/tfidf_vectorizer.pkl',
    )
)

# Directory holding the saved tokenizer and sequence-classification weights.
output_dir = '/content/drive/My Drive/SatriaData/IndoBERT_Model_Text_Classification'

model = BertForSequenceClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)

In [8]:
# --- Classical models ---------------------------------------------------------
# Vectorise the texts once and reuse the same feature matrix for both classifiers.
X_pred = vectorizer.transform(df_pred['text'])

df_pred['NB_Multinomial'] = NBM_loaded.predict(X_pred)
df_pred['KNN'] = knn_loaded.predict(X_pred)

# --- IndoBERT -----------------------------------------------------------------
# Run inference in fixed-size batches instead of a single forward pass over the
# whole dataset, so peak memory depends on BATCH_SIZE rather than on the number
# of rows. Padding is applied per batch; padded positions are masked out, so the
# logits can differ from a single-batch run only by floating-point noise.
BATCH_SIZE = 32
texts = df_pred['text'].tolist()

# from_pretrained already returns the model in eval mode; calling it again is a
# cheap guard in case the model object was switched to train mode elsewhere.
model.eval()
# Send inputs to wherever the model's weights live (CPU or an accelerator).
device = next(model.parameters()).device

batch_predictions = []
with torch.no_grad():
    for start in range(0, len(texts), BATCH_SIZE):
        inputs = tokenizer(
            texts[start:start + BATCH_SIZE],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=128,
        )
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        outputs = model(**inputs)
        # .cpu() makes the later .numpy() conversion valid on any device.
        batch_predictions.append(torch.argmax(outputs.logits, dim=-1).cpu())

indobert_predictions = torch.cat(batch_predictions).numpy()

# NOTE(review): argmax yields class indices (0..num_labels-1). Confirm these use
# the same encoding as the labels returned by the scikit-learn models before the
# three columns are compared with each other.
df_pred['IndoBERT'] = indobert_predictions
print(df_pred)

      IDText                                               text  \
0    TXT0001  lu mau org2 prodemokrasi negara punya sempat b...   
1    TXT0002  prabowo tanya soal hutang luar negeri jawab hu...   
2    TXT0003  kiki daliyo ganjar pranowo beliau sosok mengag...   
3    TXT0004         prabowo gibran laku semua sejahtera rakyat   
4    TXT0005  justru nyambung junjung elu aomkmkmkmk ngomong...   
..       ...                                                ...   
995  TXT0996  bikin bangga deh ganjarmahfud mau alokasi teng...   
996  TXT0997  pak jokowi pilpres 2024 besar hati rangkul pak...   
997  TXT0998  sbaiknya got nga usah ikut debat dehnga jelas ...   
998  TXT0999  biasa rembuk musyawarah gaya pimpin ganjar sej...   
999  TXT1000  mirage tolak juwono beli prabowo jubir timnas ...   

     NB_Multinomial  KNN  IndoBERT  
0                 1    1         1  
1                 1    1         1  
2                 4    4         4  
3                 1    1         1  
4         

In [10]:
# Cast every prediction column to int so the three models are compared on the
# same dtype.
for column in ('NB_Multinomial', 'KNN', 'IndoBERT'):
    df_pred[column] = df_pred[column].astype(int)

# A row is "identical" when NB agrees with KNN and KNN agrees with IndoBERT.
nb_matches_knn = df_pred['NB_Multinomial'].eq(df_pred['KNN'])
knn_matches_bert = df_pred['KNN'].eq(df_pred['IndoBERT'])
identical_predictions = df_pred[nb_matches_knn & knn_matches_bert]

print(f"Number of identical predictions across all models: {len(identical_predictions)}")

Number of identical predictions across all models: 655


In [11]:
# A row counts as "different" as soon as any pair of models disagrees on it.
nb_pred, knn_pred, bert_pred = (
    df_pred[name] for name in ('NB_Multinomial', 'KNN', 'IndoBERT')
)
any_disagreement = nb_pred.ne(knn_pred) | knn_pred.ne(bert_pred) | nb_pred.ne(bert_pred)
different_predictions = df_pred[any_disagreement]

print(f"Number of different predictions across models: {len(different_predictions)}")

Number of different predictions across models: 345


In [12]:
# Save the texts together with the three models' predictions.
# index=False keeps the positional RangeIndex out of the file: written as-is it
# becomes an unnamed first column, which pandas names 'Unnamed: 0' when the CSV
# is read back (the same column name this notebook has to drop from its input).
df_pred.to_csv('/content/drive/My Drive/SatriaData/pred_final.csv', index=False)
