In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from sklearn import model_selection, preprocessing, utils

from pandas.plotting import scatter_matrix

In [32]:
# Membaca file CSV ke dalam DataFrame
df = pd.read_csv('Dataset2.csv', encoding='utf-8')

# Menggabungkan isi dari beberapa kolom menjadi satu teks
df['teks'] = df[['satu', 'dua', 'tiga', 'empat', 'lima']].astype(str).apply(lambda x: ' '.join(x), axis=1)

# Membuat objek CountVectorizer dan TfidfTransformer
cv = CountVectorizer()
tfidf = TfidfTransformer(sublinear_tf=True, use_idf=True, smooth_idf=True)

In [34]:
#tf-idf
# tfidf = TfidfTransformer(sublinear_tf=True, use_idf=True, smooth_idf=True)
X = tfidf.fit_transform(cv.fit_transform(df["teks"])).toarray()
y = df['label'].values
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size= 0.1, random_state= 0)
print('X_train: ', X_train.shape)
print('X_test: ', X_test.shape)
print('Y_train: ', Y_train.shape)
print('Y_test: ', Y_test.shape)

X_train:  (2700, 6236)
X_test:  (300, 6236)
Y_train:  (2700,)
Y_test:  (300,)


In [None]:
#Import svm model
from sklearn import svm

svm = SVC(decision_function_shape='ovo')
hyperparameters = {'kernel':['rbf','linear','poly','sigmoid'], 'C':[0.5,0.75,1,10], 'gamma':[0.001,0.01,0.5,1,'scale','auto']}
svm_tuned = GridSearchCV(svm,hyperparameters,cv=10)
svm_tuned.fit(X_train, Y_train)

#Train the model using the training sets
svm.fit(X_train, Y_train)

#mendapatkan nilai bias
bias = svm.intercept_

#Predict the response for test dataset
y_pred = svm.predict(X_test)

#Evaluasi Hasil Prediksi
print(accuracy_score(Y_test, y_pred))
print(confusion_matrix(Y_test, y_pred))
print(classification_report(Y_test, y_pred))
print("Nilai b:", bias)
print(" ")
print('Kernel Using  :',svm_tuned.best_estimator_.kernel)
print('Best C       :',svm_tuned.best_estimator_.C)
print('Best Gamma   :',svm_tuned.best_estimator_.gamma)
print('Best Score   :',svm_tuned.best_score_)

In [35]:
# Import svm model
from sklearn import svm

# Definisikan hyperparameter space untuk RandomizedSearchCV
param_dist = {'kernel': ['rbf', 'linear', 'poly', 'sigmoid'], 'C': np.linspace(0.0001, 100, 1000), 'gamma': np.linspace(0.0001, 100, 1000)}
# np.linspace(0.0001, 100, 1000) untuk membuat array dengan 1000 nilai yang berada dalam rentang dari 0.0001 hingga 100

# Membuat objek model SVM
svm = SVC(decision_function_shape='ovo')

# Buat objek Randomized Search dengan model SVM dan hyperparameter space
random_search = RandomizedSearchCV(svm, param_distributions=param_dist, n_iter=10, cv=10)


# Latih model dengan Randomized Search untuk mencari hyperparameter terbaik
random_search.fit(X_train, Y_train)

# Mendapatkan model terbaik yang telah dilatih dengan hyperparameter terbaik
best_svm_model = random_search.best_estimator_

# Predict the response for test dataset using the best model
y_pred = best_svm_model.predict(X_test)

#mendapatkan nilai bias
bias = best_svm_model.intercept_

# print nilai bias model
print("Nilai b:", bias)
print(" ")

# Evaluasi Hasil Prediksi
print("Accuracy:", accuracy_score(Y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(Y_test, y_pred))
print("Classification Report:\n", classification_report(Y_test, y_pred))
print(" ")

# Print the best hyperparameters
print('Best Kernel:', random_search.best_estimator_.kernel)
print('Best C:', random_search.best_estimator_.C)
print('Best Gamma:', random_search.best_estimator_.gamma)
print('Best Score:', random_search.best_score_)

Nilai b: [-0.164045   -0.12630359  0.03283582]
 
Accuracy: 0.82
Confusion Matrix:
 [[80 30  0]
 [ 0 95  0]
 [ 0 24 71]]
Classification Report:
               precision    recall  f1-score   support

           1       1.00      0.73      0.84       110
           2       0.64      1.00      0.78        95
           3       1.00      0.75      0.86        95

    accuracy                           0.82       300
   macro avg       0.88      0.82      0.83       300
weighted avg       0.89      0.82      0.83       300

 
Best Kernel: rbf
Best C: 94.5946
Best Gamma: 69.0691
Best Score: 0.7837037037037037


In [36]:
df_result = pd.DataFrame({'Actual': Y_test, 'Predicted': y_pred})
df_result.to_csv('hasil_prediksi.csv', index=False)