# Necessary Import 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os
import re
import string
import json
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


import nltk
os.makedirs(r'C:\\nltk_data', exist_ok=True)
nltk.data.path.append(r'C:\\nltk_data')  
nltk.download('punkt_tab', download_dir=r'C:\\nltk_data')
nltk.download('stopwords', download_dir=r'C:\\nltk_data')
nltk.download('wordnet', download_dir=r'C:\\nltk_data')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn.functional as F
from tqdm import tqdm

[nltk_data] Downloading package punkt_tab to C:\\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to C:\\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to C:\\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def read_database_result_csv(file_path):
    try:
        df = pd.read_csv(file_path, sep=';', encoding='utf-8')
    except UnicodeDecodeError as e1:
        try:
            df = pd.read_csv(file_path, sep=';', encoding='latin1')
        except UnicodeDecodeError as e2:
            try:
                df = pd.read_csv(file_path, sep=';', encoding='cp1252')
            except Exception as e3:
                print(f"Gagal membaca file {file_path}: {e1} | {e2} | {e3}")
                return pd.DataFrame()  
    return df

file_path = 'data/main_data_19.csv'
database_result_df = read_database_result_csv(file_path)

print(database_result_df)

                  Dataset                      ID BIDANG SATKER (AKRONIM)  \
0       Database Internal    DATABASE INTERNAL_23     MS             DPSI   
1       Database Internal    DATABASE INTERNAL_24     MS             DPSI   
2       Database Internal    DATABASE INTERNAL_25     MS             DPSI   
3       Database Internal    DATABASE INTERNAL_26     MS             DPSI   
4       Database Internal    DATABASE INTERNAL_28     MS             DPSI   
...                   ...                     ...    ...              ...   
8242  DATABASE KOJK NO HP  EKSTERNAL_KOJK_HP_5221     KS       KR3 - KOSG   
8243  DATABASE KOJK NO HP  EKSTERNAL_KOJK_HP_5222     KS       KR3 - KOSG   
8244  DATABASE KOJK NO HP  EKSTERNAL_KOJK_HP_5230     KS       KR3 - KOSG   
8245  DATABASE KOJK NO HP  EKSTERNAL_KOJK_HP_5232     KS       KR3 - KOSG   
8246  DATABASE KOJK NO HP  EKSTERNAL_KOJK_HP_5235     KS       KR3 - KOSG   

      JENIS SURVEI                           TIPE QUESTION  \
0         INT

In [5]:
columns_order = [
    'ID',
    'BIDANG',
    'SATKER (AKRONIM)',
    'JENIS SURVEI',
    'TIPE QUESTION',
    'INSTITUSI / PERSEORANGAN/ASAL SATKER',
    'RESPOND',
    'LINK SURVEYMONKEY',
    'TOKEN',
    'NAMA PIC/RESPONDEN',
    'JABATAN/PROFESI/LVEL DI OJK',
    'KONTAK',
    'FUNGSI YANG DINILAI',
    'DIRECT / INDIRECT',
    'JENIS STAKEHOLDERS',
    'RELASI RESPONDEN DENGAN SATKER',
    'POWER',
    'INTEREST',
    'KATEGORI',
    'Dataset',
    'RESOURCE PERCEPTION',
    'PERFORMANCE DELIVERY',
    'OPEN QUESTION 1',
    'OPEN QUESTION 2'
]

In [7]:
all_data_idi_df = database_result_df[columns_order]
print(all_data_idi_df)

                          ID BIDANG SATKER (AKRONIM)  JENIS SURVEI  \
0       DATABASE INTERNAL_23     MS             DPSI      INTERNAL   
1       DATABASE INTERNAL_24     MS             DPSI      INTERNAL   
2       DATABASE INTERNAL_25     MS             DPSI      INTERNAL   
3       DATABASE INTERNAL_26     MS             DPSI      INTERNAL   
4       DATABASE INTERNAL_28     MS             DPSI      INTERNAL   
...                      ...    ...              ...           ...   
8242  EKSTERNAL_KOJK_HP_5221     KS       KR3 - KOSG  NON INTERNAL   
8243  EKSTERNAL_KOJK_HP_5222     KS       KR3 - KOSG  NON INTERNAL   
8244  EKSTERNAL_KOJK_HP_5230     KS       KR3 - KOSG  NON INTERNAL   
8245  EKSTERNAL_KOJK_HP_5232     KS       KR3 - KOSG  NON INTERNAL   
8246  EKSTERNAL_KOJK_HP_5235     KS       KR3 - KOSG  NON INTERNAL   

                               TIPE QUESTION  \
0                                DIRECT DPSI   
1                                DIRECT DPSI   
2              

In [8]:
print(all_data_idi_df.isnull().sum())

ID                                         0
BIDANG                                     0
SATKER (AKRONIM)                           0
JENIS SURVEI                               0
TIPE QUESTION                              0
INSTITUSI / PERSEORANGAN/ASAL SATKER       3
RESPOND                                    0
LINK SURVEYMONKEY                          0
TOKEN                                      0
NAMA PIC/RESPONDEN                        14
JABATAN/PROFESI/LVEL DI OJK              239
KONTAK                                   185
FUNGSI YANG DINILAI                        0
DIRECT / INDIRECT                          0
JENIS STAKEHOLDERS                         3
RELASI RESPONDEN DENGAN SATKER           941
POWER                                      0
INTEREST                                   0
KATEGORI                                   0
Dataset                                    0
RESOURCE PERCEPTION                     3793
PERFORMANCE DELIVERY                    3793
OPEN QUEST

In [9]:
all_data_idi_df.fillna("-", inplace=True)
print(all_data_idi_df.isnull().sum())

ID                                      0
BIDANG                                  0
SATKER (AKRONIM)                        0
JENIS SURVEI                            0
TIPE QUESTION                           0
INSTITUSI / PERSEORANGAN/ASAL SATKER    0
RESPOND                                 0
LINK SURVEYMONKEY                       0
TOKEN                                   0
NAMA PIC/RESPONDEN                      0
JABATAN/PROFESI/LVEL DI OJK             0
KONTAK                                  0
FUNGSI YANG DINILAI                     0
DIRECT / INDIRECT                       0
JENIS STAKEHOLDERS                      0
RELASI RESPONDEN DENGAN SATKER          0
POWER                                   0
INTEREST                                0
KATEGORI                                0
Dataset                                 0
RESOURCE PERCEPTION                     0
PERFORMANCE DELIVERY                    0
OPEN QUESTION 1                         0
OPEN QUESTION 2                   

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_data_idi_df.fillna("-", inplace=True)


In [10]:
all_data_idi_df.describe()

Unnamed: 0,ID,BIDANG,SATKER (AKRONIM),JENIS SURVEI,TIPE QUESTION,INSTITUSI / PERSEORANGAN/ASAL SATKER,RESPOND,LINK SURVEYMONKEY,TOKEN,NAMA PIC/RESPONDEN,...,JENIS STAKEHOLDERS,RELASI RESPONDEN DENGAN SATKER,POWER,INTEREST,KATEGORI,Dataset,RESOURCE PERCEPTION,PERFORMANCE DELIVERY,OPEN QUESTION 1,OPEN QUESTION 2
count,8247,8247,8247,8247,8247,8247,8247,8247,8247,8247,...,8247,8247,8247,8247,8247,8247,8247,8247,8247,8247
unique,8247,9,56,2,73,1437,1,25,237,4040,...,183,238,3,3,4,7,26,90,3269,4538
top,EKSTERNAL_KOJK_HP_5235,MS,DPSI,INTERNAL,DIRECT DPSI,DPW1,SUDAH DI ISI,"Internal 1 (Direct DPSI & Indirect DOSB, 253_2...",Token 0252,-,...,PEGAWAI OJK,PENERIMA HELPDESK & LAYANAN DPSI LAINNYA,HIGH,HIGH,PLAYER,Database Internal,-,-,-,-
freq,1,5362,2474,5940,2458,205,8247,3334,3334,14,...,5919,2458,5364,7722,5117,4106,3793,3793,2446,2822


In [11]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+|\@\w+|\#|\d+|[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    stop_words = set(stopwords.words('indonesian'))
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    cleaned_text = ' '.join(words)
    
    return cleaned_text

In [13]:
def predict_sentiment_with_custom_mapping(df, text_columns, model_name):
    # Inisialisasi tokenizer dan model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=6  # Sesuaikan dengan jumlah label Anda
    )
    
    # Definisi label mapping
    label_map = {
        0: "sangat tidak setuju",
        1: "tidak setuju",
        2: "kurang setuju",
        3: "cukup setuju", 
        4: "setuju",
        5: "sangat setuju"
    }
    
    # Fungsi preprocessing khusus
    def preprocess_text(text):
        # Jika teks adalah "-", kembalikan "-"
        if isinstance(text, str) and text.strip() == "-":
            return "-"
        # Proses teks normal
        return text.strip().lower() if isinstance(text, str) else ""

    # Gabungkan kolom teks
    df['Text'] = df[text_columns].fillna("-").apply(lambda row: " ".join(row), axis=1)
    
    # Fungsi prediksi sentimen
    def predict_single_text(text):
        # Jika teks adalah "-", kembalikan "setuju"
        if text == "-":
            return {
                'text': text,
                'sentiment': "setuju",
                'confidence': 1.0
            }
        
        # Tokenisasi
        encoded = tokenizer(
            text,
            truncation=True,
            padding=True,
            max_length=512,
            return_tensors="pt"
        )
        
        # Prediksi
        model.eval()
        with torch.no_grad():
            outputs = model(encoded["input_ids"], attention_mask=encoded["attention_mask"])
            predictions = F.softmax(outputs.logits, dim=1)
            predicted_label = torch.argmax(predictions, dim=1).item()
            
            # Ambil confidence score
            confidence = predictions[0][predicted_label].item()
            
            # Mapping label
            sentiment = label_map.get(predicted_label, "unknown")
            
            return {
                'text': text,
                'sentiment': sentiment,
                'confidence': confidence
            }
    
    # Proses sentimen untuk seluruh dataset
    results = []
    for text in tqdm(df['Text'], desc=f"Analyzing Sentiments with {model_name}"):
        processed_text = preprocess_text(text)
        result = predict_single_text(processed_text)
        results.append(result)
    
    # Konversi hasil ke DataFrame
    results_df = pd.DataFrame(results)
    
    # Update DataFrame asli
    df['Label'] = results_df['sentiment']
    df['Confidence'] = results_df['confidence']
    
    return df

# Model-model untuk diuji
alternative_models = [
    "indobenchmark/indobert-base-p1",
    "bert-base-multilingual-uncased",
    "indobenchmark/indobert-base-p2", 
    "NLP-Cube-Lab/indonesian-sentiment-bert",
    "kambaa/indonesian-sentiment-analysis-bert"
]

# Kolom untuk diproses
columns_to_process = ['OPEN QUESTION 1', 'OPEN QUESTION 2']

# Simpan hasil untuk setiap model
results_summary = {}

# Uji coba setiap model
for model_name in alternative_models:
    try:
        print(f"\n--- Menggunakan Model: {model_name} ---")
        
        # Buat salinan dataframe untuk setiap iterasi
        df_copy = all_data_idi_df.copy()
        
        # Jalankan analisis sentimen
        result_df = predict_sentiment_with_custom_mapping(
            df_copy, 
            columns_to_process, 
            model_name
        )
        
        # Tampilkan distribusi label
        label_counts = result_df['Label'].value_counts()
        print("\nDistribusi Label Sentimen:")
        print(label_counts)
        
        # Simpan hasil
        output_filename = f'data/hasil/sentiment_{model_name.replace("/", "_")}_custom.csv'
        result_df.to_csv(output_filename, index=False, sep=';')
        print(f"\nHasil disimpan di: {output_filename}")
        
        # Simpan ringkasan
        results_summary[model_name] = {
            'label_distribution': label_counts.to_dict(),
            'output_file': output_filename
        }
        
        # Tampilkan beberapa sampel
        print("\nSampel Hasil:")
        print(result_df[['OPEN QUESTION 1', 'OPEN QUESTION 2', 'Label', 'Confidence']].head())
        
    except Exception as e:
        print(f"Error dengan model {model_name}: {e}")

# Simpan ringkasan hasil
import json
with open('data/hasil/sentiment_models_summary.json', 'w') as f:
    json.dump(results_summary, f, indent=2)

print("\n--- Ringkasan Hasil Tersimpan ---")



--- Menggunakan Model: indobenchmark/indobert-base-p1 ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Analyzing Sentiments with indobenchmark/indobert-base-p1:   2%|▏         | 136/8247 [00:12<12:29, 10.83it/s]


KeyboardInterrupt: 

In [None]:
all_data_idi_df.to_csv('data/hasil/main_data_19_OQ2.csv', index=False, sep=';')
print(all_data_idi_df)

In [None]:
# cek lagi
label_counts = all_data_idi_df['Label'].value_counts()

label_summary = pd.DataFrame(label_counts).reset_index()
label_summary.columns = ['Label', 'Count']
print(label_summary)