In [1]:
import json
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
stop = stopwords.words('indonesian')

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
factory = StemmerFactory()
stemmer = factory.create_stemmer()

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
file_path_test = "D:/ResearchMetodology/SNLI_Indo_test.jsonl"
file_path_train = "D:\ResearchMetodology\SNLI_Indo_train.jsonl"
file_path_val = "D:\ResearchMetodology\SNLI_Indo_val.jsonl"

In [3]:
data_list_train = []

with open(file_path_train, "r") as f:
    for line in f:
        try:
            data_train = json.loads(line)
            data_list_train.append(data_train)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")

# Create a DataFrame from the list of dictionaries
df_train = pd.DataFrame(data_list_train[:5000])
# df_train = pd.DataFrame(df_train[:10000])

# Display the DataFrame
print(df_train)


       annotator_labels    label emas  \
0            ['netral']        netral   
1       ['kontradiksi']   kontradiksi   
2      ['keterlibatan']  keterlibatan   
3            ['netral']        netral   
4      ['keterlibatan']  keterlibatan   
...                 ...           ...   
4995    ['kontradiksi']   kontradiksi   
4996    ['kontradiksi']   kontradiksi   
4997   ['keterlibatan']  keterlibatan   
4998         ['netral']        netral   
4999         ['netral']        netral   

                                               kalimat1  \
0     Seseorang di atas kuda melompati pesawat yang ...   
1     Seseorang di atas kuda melompati pesawat yang ...   
2     Seseorang di atas kuda melompati pesawat yang ...   
3            Anak-anak tersenyum dan melambai ke kamera   
4            Anak-anak tersenyum dan melambai ke kamera   
...                                                 ...   
4995     Seseorang mengendarai sepeda motor ke samping.   
4996     Seseorang mengendarai sepe

In [4]:
# drop annotator_labels
df_train = df_train.drop(columns=["annotator_labels"])
df_train.head()

Unnamed: 0,label emas,kalimat1,kalimat2
0,netral,Seseorang di atas kuda melompati pesawat yang ...,Seseorang sedang melatih kudanya untuk sebuah ...
1,kontradiksi,Seseorang di atas kuda melompati pesawat yang ...,"Seseorang sedang makan malam, memesan telur da..."
2,keterlibatan,Seseorang di atas kuda melompati pesawat yang ...,"Seseorang berada di luar ruangan, di atas kuda."
3,netral,Anak-anak tersenyum dan melambai ke kamera,Mereka tersenyum pada orang tua mereka
4,keterlibatan,Anak-anak tersenyum dan melambai ke kamera,Ada anak-anak yang hadir


In [5]:
df_train['kalimat1'] = df_train['kalimat1'].str.replace('[^\w\s]','')
df_train['kalimat2'] = df_train['kalimat2'].str.replace('[^\w\s]','')
df_train.head()

  df_train['kalimat1'] = df_train['kalimat1'].str.replace('[^\w\s]','')
  df_train['kalimat2'] = df_train['kalimat2'].str.replace('[^\w\s]','')


Unnamed: 0,label emas,kalimat1,kalimat2
0,netral,Seseorang di atas kuda melompati pesawat yang ...,Seseorang sedang melatih kudanya untuk sebuah ...
1,kontradiksi,Seseorang di atas kuda melompati pesawat yang ...,Seseorang sedang makan malam memesan telur dadar
2,keterlibatan,Seseorang di atas kuda melompati pesawat yang ...,Seseorang berada di luar ruangan di atas kuda
3,netral,Anakanak tersenyum dan melambai ke kamera,Mereka tersenyum pada orang tua mereka
4,keterlibatan,Anakanak tersenyum dan melambai ke kamera,Ada anakanak yang hadir


In [6]:
df_train['kalimat1'] = df_train['kalimat1'].apply(lambda x: x.split())
df_train['kalimat2'] = df_train['kalimat2'].apply(lambda x: x.split())
df_train.head()

Unnamed: 0,label emas,kalimat1,kalimat2
0,netral,"[Seseorang, di, atas, kuda, melompati, pesawat...","[Seseorang, sedang, melatih, kudanya, untuk, s..."
1,kontradiksi,"[Seseorang, di, atas, kuda, melompati, pesawat...","[Seseorang, sedang, makan, malam, memesan, tel..."
2,keterlibatan,"[Seseorang, di, atas, kuda, melompati, pesawat...","[Seseorang, berada, di, luar, ruangan, di, ata..."
3,netral,"[Anakanak, tersenyum, dan, melambai, ke, kamera]","[Mereka, tersenyum, pada, orang, tua, mereka]"
4,keterlibatan,"[Anakanak, tersenyum, dan, melambai, ke, kamera]","[Ada, anakanak, yang, hadir]"


In [7]:
df_train['kalimat1'] = df_train['kalimat1'].apply(lambda x: [item for item in x if item not in stop])
df_train['kalimat2'] = df_train['kalimat2'].apply(lambda x: [item for item in x if item not in stop])
df_train.head()

Unnamed: 0,label emas,kalimat1,kalimat2
0,netral,"[Seseorang, kuda, melompati, pesawat, rusak]","[Seseorang, melatih, kudanya, kompetisi]"
1,kontradiksi,"[Seseorang, kuda, melompati, pesawat, rusak]","[Seseorang, makan, malam, memesan, telur, dadar]"
2,keterlibatan,"[Seseorang, kuda, melompati, pesawat, rusak]","[Seseorang, ruangan, kuda]"
3,netral,"[Anakanak, tersenyum, melambai, kamera]","[Mereka, tersenyum, orang, tua]"
4,keterlibatan,"[Anakanak, tersenyum, melambai, kamera]","[Ada, anakanak, hadir]"


In [8]:
df_train['kalimat1'] = df_train['kalimat1'].apply(lambda x: [stemmer.stem(item) for item in x])
df_train['kalimat2'] = df_train['kalimat2'].apply(lambda x: [stemmer.stem(item) for item in x])
df_train.head()

Unnamed: 0,label emas,kalimat1,kalimat2
0,netral,"[orang, kuda, lompat, pesawat, rusak]","[orang, latih, kuda, kompetisi]"
1,kontradiksi,"[orang, kuda, lompat, pesawat, rusak]","[orang, makan, malam, mes, telur, dadar]"
2,keterlibatan,"[orang, kuda, lompat, pesawat, rusak]","[orang, ruang, kuda]"
3,netral,"[anakanak, senyum, lambai, kamera]","[mereka, senyum, orang, tua]"
4,keterlibatan,"[anakanak, senyum, lambai, kamera]","[ada, anakanak, hadir]"


In [9]:
from gensim.models import FastText

# Membuat model FastText
model_ft = FastText(df_train['kalimat1'] + df_train['kalimat2'], vector_size=100, window=5, min_count=1, workers=4)

# Mendapatkan vektor untuk setiap kalimat
def get_sentence_vector(sentence):
    words = sentence
    vectors = [model_ft.wv[word] for word in words if word in model_ft.wv.key_to_index]
    return np.mean(vectors, axis=0)

In [10]:
X = df_train['kalimat1'] + df_train['kalimat2']
y = df_train['label emas']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

X_train_ft = np.array([get_sentence_vector(sentence) for sentence in X_train])
X_test_ft = np.array([get_sentence_vector(sentence) for sentence in X_test])

In [11]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

In [12]:
# import logistic regression
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train_ft, y_train)

# Evaluasi model
accuracy = clf.score(X_test_ft, y_test)
print(f'Accuracy Logistic Regression: {accuracy}')

Accuracy Logistic Regression: 0.388


In [13]:
# import knn
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_ft, y_train)

# Evaluasi model
accuracy = knn.score(X_test_ft, y_test)
print(f'Accuracy KNN: {accuracy}')

[WinError 2] The system cannot find the file specified
  File "c:\Users\ASUS\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\Users\ASUS\AppData\Local\Programs\Python\Python39\lib\subprocess.py", line 501, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\ASUS\AppData\Local\Programs\Python\Python39\lib\subprocess.py", line 947, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\ASUS\AppData\Local\Programs\Python\Python39\lib\subprocess.py", line 1416, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


Accuracy KNN: 0.33


In [14]:
# import nb
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train_ft, y_train)

# Evaluasi model
accuracy = nb.score(X_test_ft, y_test)
print(f'Accuracy Naive Bayes: {accuracy}')


Accuracy Naive Bayes: 0.393


In [15]:
# import rf
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train_ft, y_train)

# Evaluasi model
accuracy = rf.score(X_test_ft, y_test)
print(f'Accuracy Random Forest: {accuracy}')

Accuracy Random Forest: 0.336


In [16]:
# import qda
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train_ft, y_train)

# Evaluasi model
accuracy = qda.score(X_test_ft, y_test)
print(f'Accuracy QDA: {accuracy}')


Accuracy QDA: 0.258


In [17]:
# import lda
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis()
lda.fit(X_train_ft, y_train)

# Evaluasi model
accuracy = lda.score(X_test_ft, y_test)
print(f'Accuracy LDA: {accuracy}')

Accuracy LDA: 0.394


In [18]:
# import dummy
from sklearn.dummy import DummyClassifier

dummy = DummyClassifier()
dummy.fit(X_train_ft, y_train)

# Evaluasi model
accuracy = dummy.score(X_test_ft, y_test)
print(f'Accuracy Dummy: {accuracy}')

Accuracy Dummy: 0.316


In [19]:
# import svm
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_ft, y_train)

# Evaluasi model
accuracy = svm.score(X_test_ft, y_test)
print(f'Accuracy SVM: {accuracy}')

Accuracy SVM: 0.399


In [20]:
# import ada
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier()
ada.fit(X_train_ft, y_train)

# Evaluasi model
accuracy = ada.score(X_test_ft, y_test)
print(f'Accuracy Ada: {accuracy}')



Accuracy Ada: 0.365


In [21]:
# import et
from sklearn.ensemble import ExtraTreesClassifier

et = ExtraTreesClassifier()
et.fit(X_train_ft, y_train)

# Evaluasi model
accuracy = et.score(X_test_ft, y_test)
print(f'Accuracy ET: {accuracy}')

Accuracy ET: 0.317


In [22]:
# import gbc
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier()
gbc.fit(X_train_ft, y_train)

# Evaluasi model
accuracy = gbc.score(X_test_ft, y_test)
print(f'Accuracy GBC: {accuracy}')

Accuracy GBC: 0.362


In [23]:
# import lightgbm
import lightgbm as lgb

lgb = lgb.LGBMClassifier()
lgb.fit(X_train_ft, y_train)

# Evaluasi model
accuracy = lgb.score(X_test_ft, y_test)
print(f'Accuracy LGB: {accuracy}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035741 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 100
[LightGBM] [Info] Start training from score -1.096614
[LightGBM] [Info] Start training from score -1.107149
[LightGBM] [Info] Start training from score -1.092133
Accuracy LGB: 0.337


In [24]:
# import dt
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train_ft, y_train)

# Evaluasi model
accuracy = dt.score(X_test_ft, y_test)
print(f'Accuracy DT: {accuracy}')

Accuracy DT: 0.319


In [25]:
# import ridge
from sklearn.linear_model import RidgeClassifier

ridge = RidgeClassifier()
ridge.fit(X_train_ft, y_train)

# Evaluasi model
accuracy = ridge.score(X_test_ft, y_test)
print(f'Accuracy Ridge: {accuracy}')

Accuracy Ridge: 0.38
