In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import nltk
os.makedirs(r'C:\\nltk_data', exist_ok=True)
nltk.data.path.append(r'C:\\nltk_data')  
nltk.download('punkt_tab', download_dir=r'C:\\nltk_data')
nltk.download('stopwords', download_dir=r'C:\\nltk_data')
nltk.download('wordnet', download_dir=r'C:\\nltk_data')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn.functional as F
from tqdm import tqdm

In [None]:
import pandas as pd

def read_database_result_csv(file_path):
    try:
        # Mencoba membaca dengan encoding default (utf-8) dan delimiter koma
        df = pd.read_csv(file_path, sep=';', encoding='utf-8')
    except UnicodeDecodeError as e1:
        try:
            # Jika gagal, mencoba dengan encoding latin1
            df = pd.read_csv(file_path, sep=';', encoding='latin1')
        except UnicodeDecodeError as e2:
            try:
                # Jika gagal, mencoba dengan encoding cp1252
                df = pd.read_csv(file_path, sep=';', encoding='cp1252')
            except Exception as e3:
                print(f"Gagal membaca file {file_path}: {e1} | {e2} | {e3}")
                return pd.DataFrame()  # Mengembalikan DataFrame kosong jika gagal
    return df

# Path ke file DATABASE_RESULT_OJK.csv
file_path = 'data/DATABASE_RESULT_OJK.csv'
database_result_df = read_database_result_csv(file_path)

print(database_result_df)


                                ID BIDANG SATKER (AKRONIM)  JENIS SURVEI  \
0              DATABASE INTERNAL_1     MS             DPEA      INTERNAL   
1              DATABASE INTERNAL_6     MS             DPEA      INTERNAL   
2             DATABASE INTERNAL_10     MS             DPEA      INTERNAL   
3             DATABASE INTERNAL_12     MS             DPEA      INTERNAL   
4             DATABASE INTERNAL_14     MS             DPEA      INTERNAL   
...                            ...    ...              ...           ...   
1526  DATABASE TELESURVEY DLIK_267   PEPK             DLIK  NON INTERNAL   
1527  DATABASE TELESURVEY DLIK_269   PEPK             DLIK  NON INTERNAL   
1528  DATABASE TELESURVEY DLIK_270   PEPK             DLIK  NON INTERNAL   
1529  DATABASE TELESURVEY DLIK_276   PEPK             DLIK  NON INTERNAL   
1530  DATABASE TELESURVEY DLIK_277   PEPK             DLIK  NON INTERNAL   

                                     TIPE QUESTION  \
0                                

In [None]:
#FILTERING SATKER

# Menentukan filter SATKER yang diinginkan
filter_option = input("Apakah Anda ingin memilih SATKER berdasarkan pilihan tertentu (y/n)? ")

if filter_option.lower() == 'y':
    # Jika pengguna memilih untuk memasukkan SATKER
    selected_satker = input("Masukkan SATKER yang ingin difilter (misal: 'DPEA'): ")
    
    # Memfilter data berdasarkan SATKER yang dipilih
    filtered_df = database_result_df[database_result_df['SATKER (AKRONIM)'] == selected_satker]
    print(f"Data difilter berdasarkan SATKER: {selected_satker}")
    
elif filter_option.lower() == 'n':
    # Jika pengguna memilih untuk memfilter berdasarkan beberapa SATKER tertentu
    selected_satker_list = ['DPEA', 'DOSB', 'DPSI']  # Contoh daftar SATKER yang bisa dipilih
    filtered_df = database_result_df[database_result_df['SATKER (AKRONIM)'].isin(selected_satker_list)]
    print(f"Data difilter berdasarkan beberapa SATKER: {', '.join(selected_satker_list)}")
    
else:
    # Jika input selain 'y' atau 'n', tampilkan pesan
    print("Pilihan tidak valid. Menggunakan data tanpa filter.")
    filtered_df = database_result_df

# Menampilkan data yang sudah difilter
print(filtered_df)

Data difilter berdasarkan SATKER: DPEA
                     ID BIDANG SATKER (AKRONIM) JENIS SURVEI  TIPE QUESTION  \
0   DATABASE INTERNAL_1     MS             DPEA     INTERNAL    DIRECT DPEA   
1   DATABASE INTERNAL_6     MS             DPEA     INTERNAL    DIRECT DPEA   
2  DATABASE INTERNAL_10     MS             DPEA     INTERNAL    DIRECT DPEA   
3  DATABASE INTERNAL_12     MS             DPEA     INTERNAL    DIRECT DPEA   
4  DATABASE INTERNAL_14     MS             DPEA     INTERNAL    DIRECT DPEA   
5  DATABASE INTERNAL_20     MS             DPEA     INTERNAL  INDIRECT DPEA   

  INSTITUSI / PERSEORANGAN/ASAL SATKER  SENT       RESPOND  \
0                                 DIMB  SENT  SUDAH DI ISI   
1                                 DPLK  SENT  SUDAH DI ISI   
2                                 DINP  SENT  SUDAH DI ISI   
3                                 DPDS  SENT  SUDAH DI ISI   
4                                 DPSU  SENT  SUDAH DI ISI   
5                       Bank Indone

In [None]:
print(database_result_df.isnull().sum())

In [81]:
columns_order = [
    'ID', 'BIDANG', 'SATKER (AKRONIM)', 'JENIS SURVEI', 'TIPE QUESTION', 
    'INSTITUSI / PERSEORANGAN/ASAL SATKER', 'SENT', 'RESPOND', 
    'LINK SURVEYMONKEY', 'TOKEN', 'NAMA PIC/RESPONDEN', 
    'JABATAN/PROFESI/LVEL DI OJK', 'EMAIL', 'KONTAK', 'EMAIL CADANGAN', 
    'KOTAK CADANGAN', 'FUNGSI YANG DINILAI', 'DIRECT / INDIRECT', 
    'JENIS STAKEHOLDERS', 'RELASI RESPONDEN DENGAN SATKER', 'POWER', 
    'INTEREST', 'KATEGORI', 'Dataset', 'CALL FROM OTHER ID', 'DATE', 
    'RESOURCE PERCEPTION', 'PERFORMANCE DELIVERY', 'OUTCOME SATISFACTION', 
    'OPEN QUESTION 1', 'OPEN QUESTION 2'
]

In [82]:
all_data_df = database_result_df[columns_order]

In [84]:
print(all_data_df)

                                ID BIDANG SATKER (AKRONIM)  JENIS SURVEI  \
0              DATABASE INTERNAL_1     MS             DPEA      INTERNAL   
1              DATABASE INTERNAL_6     MS             DPEA      INTERNAL   
2             DATABASE INTERNAL_10     MS             DPEA      INTERNAL   
3             DATABASE INTERNAL_12     MS             DPEA      INTERNAL   
4             DATABASE INTERNAL_14     MS             DPEA      INTERNAL   
...                            ...    ...              ...           ...   
1526  DATABASE TELESURVEY DLIK_267   PEPK             DLIK  NON INTERNAL   
1527  DATABASE TELESURVEY DLIK_269   PEPK             DLIK  NON INTERNAL   
1528  DATABASE TELESURVEY DLIK_270   PEPK             DLIK  NON INTERNAL   
1529  DATABASE TELESURVEY DLIK_276   PEPK             DLIK  NON INTERNAL   
1530  DATABASE TELESURVEY DLIK_277   PEPK             DLIK  NON INTERNAL   

                                     TIPE QUESTION  \
0                                

In [None]:
print(all_data_df.isnull().sum())

In [83]:
all_data_df.describe()

Unnamed: 0,DATE
count,1531.0
mean,47318.161986
std,33106.771546
min,45617.0
25%,45621.0
50%,45622.0
75%,45628.0
max,693961.0


In [85]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+|\@\w+|\#|\d+|[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    stop_words = set(stopwords.words('indonesian'))
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    cleaned_text = ' '.join(words)
    
    return cleaned_text

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
import pandas as pd
from tqdm import tqdm

# Ganti model_name dengan model IndoBERT
model_name = "indobenchmark/indobert-base-p1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=6  # Sesuaikan jumlah label
)

label_map = {
    0: "sangat tidak setuju",
    1: "tidak setuju",
    2: "kurang setuju",
    3: "cukup setuju",
    4: "setuju",
    5: "sangat setuju"
}

# Fungsi untuk memproses teks
def preprocess_text(text):
    return text.strip().lower() if isinstance(text, str) else ""  # Periksa apakah teks valid

# Fungsi untuk prediksi sentimen
def predict_sentiment(texts):
    model.eval()
    results = []
    
    with torch.no_grad():
        for text in tqdm(texts):
            text = preprocess_text(text)
            
            if not text:  # Lewati teks kosong
                results.append({
                    'text': text,
                    'sentiment': "unknown",
                    'confidence': 0.0
                })
                continue
            
            # Tokenisasi teks
            encoded = tokenizer(
                text,
                truncation=True,
                padding=True,
                max_length=512,
                return_tensors="pt"
            )
            
            # Model memprediksi
            outputs = model(encoded["input_ids"], attention_mask=encoded["attention_mask"])
            predictions = F.softmax(outputs.logits, dim=1)
            predicted_label = torch.argmax(predictions, dim=1).item()
            
            # Ambil confidence score
            confidence = predictions[0][predicted_label].item()
            
            # Mapping hasil prediksi ke label
            sentiment = label_map.get(predicted_label, "unknown")
            
            # Menambahkan hasil ke dalam list
            results.append({
                'text': text,
                'sentiment': sentiment,
                'confidence': confidence
            })
    
    return pd.DataFrame(results)

# Gabungkan teks dari kolom-kolom yang diinginkan
columns_to_process = ['OPEN QUESTION 1', 'OPEN QUESTION 2']
all_data_df['Combined_Text'] = all_data_df[columns_to_process].fillna("").apply(lambda row: " ".join(row), axis=1)

# Memanggil fungsi prediksi
results = predict_sentiment(all_data_df['Combined_Text'].tolist())

# Menambahkan hasil prediksi ke dataframe asli menggunakan .loc
all_data_df.loc[:, 'Label'] = results['sentiment']
all_data_df.loc[:, 'Confidence'] = results['confidence']

# Menampilkan hasil
print("\nSample of labeled data:")
print(all_data_df[['NAMA PIC/RESPONDEN', 'OPEN QUESTION 1', 'OPEN QUESTION 2', 'Label', 'Confidence']].head())


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_data_df['Combined_Text'] = all_data_df[columns_to_process].fillna("").apply(lambda row: " ".join(row), axis=1)
100%|██████████| 1531/1531 [02:55<00:00,  8.74it/s]


Sample of labeled data:
          NAMA PIC/RESPONDEN  \
0               Lukman Hakim   
1   Ginanjar Endra Prasetiyo   
2   Friska Fardhina Henryani   
3  Rully Setiawan Purwantoro   
4            Willy Andrianto   

                                     OPEN QUESTION 1  \
0                                        Sangat baik   
1                                         sudah baik   
2                                        sangat baik   
3       DPEA telah menjalankan fungsi sesuai tupoksi   
4  Fungsi pengembangan aplikasi oleh DPEA sudah b...   

                                     OPEN QUESTION 2         Label  Confidence  
0  Perbanyak SDM outsourcing atau research fellow...  cukup setuju    0.217272  
1                               perbanyak programmer        setuju    0.232480  
2                                        sumber daya        setuju    0.214733  
3  DPEA dapat mendukung penggunaan BI tools dalam...        setuju    0.223135  
4                                       




In [87]:
print(all_data_df)

                                ID BIDANG SATKER (AKRONIM)  JENIS SURVEI  \
0              DATABASE INTERNAL_1     MS             DPEA      INTERNAL   
1              DATABASE INTERNAL_6     MS             DPEA      INTERNAL   
2             DATABASE INTERNAL_10     MS             DPEA      INTERNAL   
3             DATABASE INTERNAL_12     MS             DPEA      INTERNAL   
4             DATABASE INTERNAL_14     MS             DPEA      INTERNAL   
...                            ...    ...              ...           ...   
1526  DATABASE TELESURVEY DLIK_267   PEPK             DLIK  NON INTERNAL   
1527  DATABASE TELESURVEY DLIK_269   PEPK             DLIK  NON INTERNAL   
1528  DATABASE TELESURVEY DLIK_270   PEPK             DLIK  NON INTERNAL   
1529  DATABASE TELESURVEY DLIK_276   PEPK             DLIK  NON INTERNAL   
1530  DATABASE TELESURVEY DLIK_277   PEPK             DLIK  NON INTERNAL   

                                     TIPE QUESTION  \
0                                

In [89]:
# Menyimpan data ke file CSV
output_file = "./data/hasil/all_data.csv"
all_data_df.to_csv(output_file, index=False, sep=';', encoding='utf-8-sig')

print(f"Data berhasil disimpan ke {output_file}")

Data berhasil disimpan ke ./data/hasil/all_data.csv


In [None]:
#SAVE MODEL

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Ganti model_name dengan model IndoBERT
model_name = "indobenchmark/indobert-base-p1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=6  # Sesuaikan jumlah label
)

# Menyimpan model dan tokenizer ke direktori
model_save_path = './saved_model'
tokenizer.save_pretrained(model_save_path)
model.save_pretrained(model_save_path)

print(f"Model dan tokenizer berhasil disimpan di {model_save_path}")

In [None]:
#LOAD MODEL

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Path ke model yang disimpan
model_save_path = './saved_model'

# Memuat model dan tokenizer dari path yang telah disimpan
tokenizer = AutoTokenizer.from_pretrained(model_save_path)
model = AutoModelForSequenceClassification.from_pretrained(model_save_path)

print(f"Model dan tokenizer berhasil dimuat dari {model_save_path}")

Model dan tokenizer berhasil dimuat dari ./saved_model


In [44]:
label_counts = all_data_df['Label'].value_counts()

label_summary = pd.DataFrame(label_counts).reset_index()
label_summary.columns = ['Label', 'Count']
print(label_summary)

                 Label  Count
0         cukup setuju   1154
1               setuju    252
2  sangat tidak setuju     64
3        sangat setuju     36
4         tidak setuju     25


In [45]:
os.makedirs('data/hasil', exist_ok=True)
all_data_df.to_csv('data/hasil/labeled.csv', index=False, sep=";")

In [None]:
import json
#VALIDASI

file_path = 'data/hasil/all_data.csv'

# Load dataset
data = pd.read_csv(file_path, delimiter=';')

data_copy = data.copy()

# Membaca file JSON yang berisi mapping
with open('mapping.json', 'r') as file:
    text_mapping = json.load(file)


# Fungsi untuk menentukan label berdasarkan kata-kata dalam 'Combined_Text'
def map_combined_text_to_label(text):
    # Iterasi untuk mencari kata kunci dalam text
    for keyword, label in text_mapping.items():
        if keyword in text.lower():  # Mengabaikan kapitalisasi
            return label
    return 'setuju'  # Jika tidak ada kata kunci yang ditemukan

# Terapkan fungsi ke kolom 'Combined_Text' untuk membuat kolom 'Label'
data_copy['New_Label'] = data_copy['Combined_Text'].apply(map_combined_text_to_label)

# Tampilkan dataset yang sudah dimodifikasi
data_copy['Label'] = data_copy['New_Label'].combine_first(data_copy['Label'])

# Simpan dataset yang sudah dimodifikasi
# data_copy.to_csv('data/hasil/validated_dataset_new.csv', index=False, sep=';')

# Menampilkan statistik label setelah perubahan
# label_counts = data_copy['Label'].value_counts()
# label_summary = pd.DataFrame(label_counts).reset_index()
# label_summary.columns = ['Label', 'Count']

label_map = {
    1: "sangat tidak setuju",
    2: "tidak setuju",
    3: "kurang setuju",
    4: "cukup setuju",
    5: "setuju",
    6: "sangat setuju"
}

reverse_label_map = {v: k for k, v in label_map.items()}  

data_copy['Label_Index'] = data_copy['Label'].map(reverse_label_map)

def calculate_weight(index):
    if index == 1:
        return 1
    elif 1.1 <= index <= 2.9:
        return 2.95
    elif 3 <= index <= 3.9:
        return 5.9
    elif 4 <= index <= 6:
        return 6
    else:
        return None  

data_copy['NILAI_SENTIMEN'] = data_copy['Label_Index'].apply(calculate_weight)

# print(data_copy)


print(data_copy[['Label','NILAI_SENTIMEN']])
data_copy.to_csv('data/hasil/main_data.csv', index=False, sep=';')

label_counts = data_copy['Label'].value_counts()
label_summary = pd.DataFrame(label_counts).reset_index()
label_summary.columns = ['Label', 'Count']

In [70]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk

# Pastikan Anda sudah mengunduh stopwords dari NLTK
nltk.download('stopwords')

# Fungsi untuk membersihkan dan menghilangkan stopwords
def clean_text(text):
    # Ubah ke huruf kecil dan tokenisasi
    text = text.lower()
    # Tokenisasi kata
    words = text.split()
    # Menghilangkan stopwords menggunakan NLTK
    stop_words = set(stopwords.words('indonesian'))  # Sesuaikan dengan bahasa dataset
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

# Asumsikan `all_data_df` sudah memiliki kolom `Combined_Text`
# Gabungkan teks dari kolom yang relevan jika belum ada
columns_to_process = ['NAMA PIC/RESPONDEN', 'OPEN QUESTION 1', 'OPEN QUESTION 2']
all_data_df['Combined_Text'] = all_data_df[columns_to_process].fillna("").apply(lambda row: " ".join(row), axis=1)

# Bersihkan teks dari stopwords
all_data_df['Cleaned_Text'] = all_data_df['Combined_Text'].apply(clean_text)

# Inisialisasi CountVectorizer untuk menghitung frekuensi kata
vectorizer = CountVectorizer(max_features=1000, ngram_range=(1, 1))  # Menggunakan unigrams (kata tunggal)
X = vectorizer.fit_transform(all_data_df['Cleaned_Text'])

# Mendapatkan kata-kata dan frekuensinya
word_counts = X.toarray().sum(axis=0)
words = vectorizer.get_feature_names_out()

# Membuat dataframe untuk menampilkan hasil
word_freq_df = pd.DataFrame(zip(words, word_counts), columns=['Word', 'Frequency'])
word_freq_df = word_freq_df.sort_values(by='Frequency', ascending=False)

# Menampilkan kata-kata kunci yang paling sering muncul
print(word_freq_df.head(10))  # Menampilkan 10 kata teratas


[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_data_df['Combined_Text'] = all_data_df[columns_to_process].fillna("").apply(lambda row: " ".join(row), axis=1)


             Word  Frequency
610           ojk        395
275        fungsi        297
631       pegawai        292
833           sdm        241
670   pengelolaan        226
366     kebijakan        202
320     informasi        198
203  ditingkatkan        196
82           baik        176
922       terkait        161


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_data_df['Cleaned_Text'] = all_data_df['Combined_Text'].apply(clean_text)


In [47]:
os.makedirs('data/hasil', exist_ok=True)
all_data_df.to_csv('data/hasil/cleaned.csv', index=False, sep=";")

In [48]:
data_clean_df = pd.read_csv('data/hasil/cleaned.csv', sep=';')
print(data_clean_df.head())

          NAMA PIC/RESPONDEN  \
0               Lukman Hakim   
1   Ginanjar Endra Prasetiyo   
2   Friska Fardhina Henryani   
3  Rully Setiawan Purwantoro   
4            Willy Andrianto   

                                     OPEN QUESTION 1  \
0                                        Sangat baik   
1                                         sudah baik   
2                                        sangat baik   
3       DPEA telah menjalankan fungsi sesuai tupoksi   
4  Fungsi pengembangan aplikasi oleh DPEA sudah b...   

                                     OPEN QUESTION 2  \
0  Perbanyak SDM outsourcing atau research fellow...   
1                               perbanyak programmer   
2                                        sumber daya   
3  DPEA dapat mendukung penggunaan BI tools dalam...   
4                                                  -   

                                       Combined_Text         Label  \
0  Lukman Hakim Sangat baik Perbanyak SDM outsour...        set

In [49]:
print("Jumlah nilai NaN:", data_clean_df['Cleaned_Text'].isna().sum())

Jumlah nilai NaN: 1


In [50]:
data_clean_df = data_clean_df.dropna(subset=['Cleaned_Text'])

In [51]:
print("Jumlah nilai NaN:", data_clean_df['Cleaned_Text'].isna().sum())

Jumlah nilai NaN: 0


In [52]:
X = data_clean_df['Cleaned_Text']
y = data_clean_df['Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [53]:
model = SVC(kernel='linear') 
model.fit(X_train_tfidf, y_train)

In [54]:
y_pred = model.predict(X_test_tfidf)

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))


Accuracy: 0.8006535947712419
                     precision    recall  f1-score   support

       cukup setuju       0.80      0.98      0.88       222
      sangat setuju       1.00      0.11      0.20         9
sangat tidak setuju       1.00      0.06      0.12        16
             setuju       0.77      0.46      0.58        52
       tidak setuju       1.00      0.29      0.44         7

           accuracy                           0.80       306
          macro avg       0.91      0.38      0.44       306
       weighted avg       0.82      0.80      0.76       306



In [55]:
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2)
grid.fit(X_train_tfidf, y_train)

print(grid.best_estimator_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ...............................C=0.1, kernel=linear; total time=   0.0s
[CV] END ...............................C=0.1, kernel=linear; total time=   0.0s
[CV] END ...............................C=0.1, kernel=linear; total time=   0.0s
[CV] END ...............................C=0.1, kernel=linear; total time=   0.0s
[CV] END ...............................C=0.1, kernel=linear; total time=   0.0s
[CV] END ..................................C=0.1, kernel=rbf; total time=   0.1s
[CV] END ..................................C=0.1, kernel=rbf; total time=   0.1s
[CV] END ..................................C=0.1, kernel=rbf; total time=   0.1s
[CV] END ..................................C=0.1, kernel=rbf; total time=   0.1s
[CV] END ..................................C=0.1, kernel=rbf; total time=   0.1s
[CV] END .................................C=1, kernel=linear; total time=   0.0s
[CV] END .................................C=1, ke

In [60]:
new_data = ["Infrastruktur OJK sudah sangat bagus", "Saya sudah cukup puas dengan kinerja OJK", "OJK sangat bagus"]
new_data_tfidf = vectorizer.transform(new_data)
prediction = model.predict(new_data_tfidf)
print(prediction)

['cukup setuju' 'cukup setuju' 'cukup setuju']
