# Necessary Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os
import re
import string
import json
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


import nltk
# Keep every NLTK resource in one fixed directory and make NLTK search it.
# NOTE(review): in a raw string the doubled backslash stays as two literal
# backslashes in the path - confirm that is the intended directory name.
os.makedirs(r'C:\\nltk_data', exist_ok=True)
nltk.data.path.append(r'C:\\nltk_data')
for nltk_resource in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource, download_dir=r'C:\\nltk_data')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn.functional as F
from tqdm import tqdm

In [None]:
def read_database_result_csv(file_path, sep=';', encodings=('utf-8', 'cp1252', 'latin1')):
    """Read a delimited text file into a DataFrame, trying several encodings.

    The encodings are tried in order and the first one that decodes the file
    wins. ``cp1252`` is tried before ``latin1`` on purpose: ``latin1`` maps
    every possible byte to a character and therefore never raises
    ``UnicodeDecodeError``, so anything listed after it can never be reached.

    Args:
        file_path: Path (or file-like object accepted by ``pd.read_csv``).
        sep: Field separator passed to ``pd.read_csv``.
        encodings: Candidate encodings, tried from first to last.

    Returns:
        The parsed DataFrame, or an empty DataFrame when every candidate
        encoding failed to decode the file (a message is printed in that case).

    Raises:
        Any non-decoding error from ``pd.read_csv`` (e.g. ``FileNotFoundError``)
        is propagated unchanged.
    """
    decode_errors = []
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, sep=sep, encoding=encoding)
        except UnicodeDecodeError as exc:
            # Remember the failure and fall through to the next candidate.
            decode_errors.append(str(exc))

    error_summary = ' | '.join(decode_errors)
    print(f"Gagal membaca file {file_path}: {error_summary}")
    return pd.DataFrame()

# Load the raw survey export; the path is relative to the working directory.
file_path = 'data/main_data_19.csv'
database_result_df = read_database_result_csv(file_path)

# Print the loaded frame as a quick sanity check.
print(database_result_df)

# Necessary Columns

In [3]:
# Columns to keep from the loaded data, in the order they should appear.
# The strings must match the CSV headers exactly, so they are left as-is.
columns_order = [
    'ID',
    'BIDANG',
    'SATKER (AKRONIM)',
    'JENIS SURVEI',
    'TIPE QUESTION',
    'INSTITUSI / PERSEORANGAN/ASAL SATKER',
    'RESPOND',
    'LINK SURVEYMONKEY',
    'TOKEN',
    'NAMA PIC/RESPONDEN',
    'JABATAN/PROFESI/LVEL DI OJK',
    'KONTAK',
    'FUNGSI YANG DINILAI',
    'DIRECT / INDIRECT',
    'JENIS STAKEHOLDERS',
    'RELASI RESPONDEN DENGAN SATKER',
    'POWER',
    'INTEREST',
    'KATEGORI',
    'Dataset',
    'RESOURCE PERCEPTION',
    'PERFORMANCE DELIVERY',
    'OPEN QUESTION 1',
    'OPEN QUESTION 2'
]

In [None]:
# Keep only the needed columns. The explicit copy makes the working frame
# independent of the raw one, so later in-place edits and new columns do not
# trigger pandas' SettingWithCopyWarning.
all_data_idi_df = database_result_df[columns_order].copy()
print(all_data_idi_df)

# Cleaning Dataset

In [None]:
# Count the missing values in each column.
print(all_data_idi_df.isnull().sum())

In [None]:
# Replace every missing value with the "-" placeholder. Rebinding the result
# (instead of inplace=True) avoids modifying a frame that was derived by
# column selection, which pandas warns about.
all_data_idi_df = all_data_idi_df.fillna("-")
# Re-check the per-column missing counts after filling.
print(all_data_idi_df.isnull().sum())

In [None]:
# Summary statistics of the frame; as the last expression of the cell the
# result is what the notebook displays.
all_data_idi_df.describe()

# Preprocessing Dataset

In [8]:
# Filled on first use so the stop-word list and lemmatizer are built once,
# not once per processed text.
_NLTK_RESOURCE_CACHE = {}


def _get_nltk_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it if needed."""
    if not _NLTK_RESOURCE_CACHE:
        _NLTK_RESOURCE_CACHE['stop_words'] = frozenset(stopwords.words('indonesian'))
        _NLTK_RESOURCE_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _NLTK_RESOURCE_CACHE['stop_words'], _NLTK_RESOURCE_CACHE['lemmatizer']


def preprocess_text(text):
    """Clean a free-text answer and return its normalized tokens as one string.

    Steps: lower-case; strip URLs, @mentions, '#', digits and non-word
    characters; collapse whitespace; drop remaining punctuation (this also
    removes '_', which the regex keeps); tokenize; remove Indonesian stop
    words; lemmatize; join with single spaces.

    Args:
        text: The raw text. Non-string values (e.g. NaN) are treated as empty.

    Returns:
        The cleaned text, or "" when nothing is left or the input is not a
        string.

    NOTE(review): a later cell defines another ``preprocess_text`` that
    replaces this one once it runs - confirm which version is meant to be used.
    NOTE(review): WordNetLemmatizer is applied to Indonesian text here -
    confirm that it has the intended effect.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+|\@\w+|\#|\d+|[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.translate(str.maketrans('', '', string.punctuation))
    if not text:
        # Nothing to tokenize; skip loading the NLTK resources.
        return ""

    stop_words, lemmatizer = _get_nltk_resources()
    words = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return ' '.join(words)

# Label Dataset with Pre-Trained Model

In [None]:
# Load the tokenizer and a sequence-classification model with six output
# classes (one per entry in label_map below).
# NOTE(review): this checkpoint looks like a base (not fine-tuned) model. If
# so, the classification head is newly initialized and its predictions are
# not meaningful until the model is fine-tuned - confirm before relying on
# the generated labels.
model_name = "indobenchmark/indobert-base-p2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=6  
)

# Maps the predicted class index to a six-level agreement label
# (Indonesian), from strongest disagreement (0) to strongest agreement (5).
label_map = {
    0: "sangat tidak setuju",
    1: "tidak setuju",
    2: "kurang setuju",
    3: "cukup setuju",
    4: "setuju",
    5: "sangat setuju"
}

def preprocess_text(text):
    """Return the text trimmed and lower-cased; any non-string becomes ""."""
    if not isinstance(text, str):
        return ""
    trimmed = text.strip()
    return trimmed.lower()

def predict_sentiment(texts):
    """Classify each text with the module-level model and tokenizer.

    Every text is normalized with ``preprocess_text`` first. Texts that are
    empty after normalization are not sent to the model; they get the label
    "unknown" with confidence 0.0.

    Args:
        texts: Iterable of raw texts (non-strings are handled by
            ``preprocess_text``).

    Returns:
        A DataFrame with one row per input, in input order, and the columns
        ``text`` (the normalized text), ``sentiment`` (label from
        ``label_map``) and ``confidence`` (softmax probability of the
        predicted class). The columns are present even for empty input.
    """
    model.eval()  # switch the model to evaluation mode for inference
    results = []

    with torch.no_grad():  # no gradients are needed for prediction
        for text in tqdm(texts):
            text = preprocess_text(text)

            if not text:
                results.append({
                    'text': text,
                    'sentiment': "unknown",
                    'confidence': 0.0
                })
                continue

            encoded = tokenizer(
                text,
                truncation=True,
                padding=True,
                max_length=512,
                return_tensors="pt"
            )

            outputs = model(encoded["input_ids"], attention_mask=encoded["attention_mask"])
            # Turn the logits into class probabilities and pick the top class.
            predictions = F.softmax(outputs.logits, dim=1)
            predicted_label = torch.argmax(predictions, dim=1).item()

            confidence = predictions[0][predicted_label].item()

            sentiment = label_map.get(predicted_label, "unknown")

            results.append({
                'text': text,
                'sentiment': sentiment,
                'confidence': confidence
            })

    # Naming the columns keeps them available when `texts` was empty, so
    # callers can still read results['sentiment'].
    return pd.DataFrame(results, columns=['text', 'sentiment', 'confidence'])

columns_to_process = ['OPEN QUESTION 1','OPEN QUESTION 2']

# Join both open answers into one text per row. astype(str) keeps the join
# from failing when a cell holds a non-string value (e.g. a number).
# NOTE(review): an earlier cell fills missing values with "-", so a row with
# no answers becomes "- -" here rather than an empty string and will not
# reach the "unknown" branch of predict_sentiment - confirm this is intended.
combined_text = (
    all_data_idi_df[columns_to_process]
    .fillna("")
    .astype(str)
    .apply(lambda row: " ".join(row), axis=1)
)
# assign() returns a new frame, so no column is set on a derived frame.
all_data_idi_df = all_data_idi_df.assign(Text=combined_text)

results = predict_sentiment(all_data_idi_df['Text'].tolist())

# The results are in input order, so attach the labels by position rather
# than by index label (the frame's index need not be 0..n-1).
all_data_idi_df = all_data_idi_df.assign(Label=results['sentiment'].to_numpy())

print("\nSample of labeled data:")
print(all_data_idi_df[['OPEN QUESTION 1','OPEN QUESTION 2','Label']].head())


In [None]:
# Make sure the output directory exists; to_csv does not create it.
os.makedirs('data/hasil', exist_ok=True)
# Write the labeled data with ';' as separator and without the index column.
all_data_idi_df.to_csv('data/hasil/main_data_19_OQ2.csv', index=False, sep=';')
print(all_data_idi_df)

# Output Label 

In [None]:
# Number of rows per predicted label.
label_counts = all_data_idi_df['Label'].value_counts()

# Two-column table: the label and how many rows received it.
label_summary = label_counts.rename_axis('Label').reset_index(name='Count')
print(label_summary)

In [None]:
# Features are the combined answer texts; targets are the labels stored in
# the 'Label' column.
# NOTE(review): 'Label' was produced by predict_sentiment above, so any model
# trained on it learns those predictions - confirm this is intended.
X = all_data_idi_df['Text']
y = all_data_idi_df['Label']

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit the TF-IDF vocabulary on the training texts only, then apply the same
# transform to the test texts.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)