In [50]:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import nltk
os.makedirs(r'C:\\nltk_data', exist_ok=True)
nltk.data.path.append(r'C:\\nltk_data')  
nltk.download('punkt_tab', download_dir=r'C:\\nltk_data')
nltk.download('stopwords', download_dir=r'C:\\nltk_data')
nltk.download('wordnet', download_dir=r'C:\\nltk_data')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

[nltk_data] Downloading package punkt_tab to C:\\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to C:\\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def read_all_csv_from_directory(directory_path):
    os.makedirs('data', exist_ok=True)
    all_files = os.listdir(directory_path)
    csv_files = [f for f in all_files if f.endswith('.csv')]
    df_list = []
    
    for file in csv_files:
        file_path = os.path.join(directory_path, file)
        try:
            df = pd.read_csv(file_path, sep=';')
            df_list.append(df)
        except Exception as e:
            print(f"Gagal membaca file {file_path}: {e}")
    
    if df_list:
        combined_df = pd.concat(df_list, ignore_index=True)
        return combined_df
    else:
        return pd.DataFrame()

directory_path = 'data'
all_data_df = read_all_csv_from_directory(directory_path)

print(all_data_df.head())

               User                                               Text  \
0      @iniagustinn  UDAAHHH : "Ayo ayooo bangun, ruang tamu belom ...   
1  @BeginiCeritany1     Dah macam rentenir yg lolos pengawasan ojk ...   
2       @bukanhadee                    tanyain balik “kenapa bandung?”   
3     @ojkindonesia  Kembali Fitri di Hari Kemenangan Sobat OJK, Bu...   
4       @bukanhadee                                   mending koe bagi   

                       Date  \
0  2024-04-09T23:34:31.000Z   
1  2024-04-09T23:27:12.000Z   
2  2024-04-09T23:18:03.000Z   
3  2024-04-09T23:07:26.000Z   
4  2024-04-09T18:51:12.000Z   

                                          Tweet Link Media Links  \
0  https://x.com/iniagustinn/status/1777842559857...          []   
1  https://x.com/BeginiCeritany1/status/177784071...          []   
2  https://x.com/bukanhadee/status/17778384162329...          []   
3  https://x.com/ojkindonesia/status/177783574169...          []   
4  https://x.com/bukanhadee/

In [3]:
print(all_data_df.isnull().sum())

User              0
Text              0
Date              0
Tweet Link        0
Media Links       0
Reply To       6504
dtype: int64


In [4]:
all_data_df = all_data_df[['User', 'Text', 'Date']]
all_data_df['Date'] = pd.to_datetime(all_data_df['Date'])

In [5]:
print(all_data_df.isnull().sum())

User    0
Text    0
Date    0
dtype: int64


In [6]:
all_data_df.describe()

Unnamed: 0,User,Text,Date
count,13825,13825,13825
unique,2686,5742,
top,@bukanhadee,list pinjol legal dari OJK yang terbukti bisa ...,
freq,926,32,
mean,,,2024-05-11 08:42:11.258589696+00:00
min,,,2024-04-06 03:09:32+00:00
25%,,,2024-05-02 14:35:39+00:00
50%,,,2024-05-14 18:16:20+00:00
75%,,,2024-05-20 19:32:22+00:00
max,,,2024-05-30 23:56:26+00:00


In [7]:
def label_sentiment(text):
    very_positive_keywords = ["luar biasa", "fantastis", "sangat bagus", "hebat"]
    positive_keywords = ["bagus", "baik", "menarik", "aman", "gratis"]
    neutral_keywords = ["normal", "biasanya", "biasa", "netral"]
    negative_keywords = ["buruk", "salah", "hilang", "tidak"]
    very_negative_keywords = ["sangat buruk", "jelek", "penipuan", "merugikan"]

    text = text.lower()
    
    if any(keyword in text for keyword in very_positive_keywords):
        return 'sangat positif'
    elif any(keyword in text for keyword in positive_keywords):
        return 'positif'
    elif any(keyword in text for keyword in neutral_keywords):
        return 'netral'
    elif any(keyword in text for keyword in negative_keywords):
        return 'negatif'
    elif any(keyword in text for keyword in very_negative_keywords):
        return 'sangat negatif'
    else:
        return 'netral'

all_data_df['Label'] = all_data_df['Text'].apply(label_sentiment)

all_data_df[['Text', 'Label']]

Unnamed: 0,Text,Label
0,"UDAAHHH : ""Ayo ayooo bangun, ruang tamu belom ...",netral
1,Dah macam rentenir yg lolos pengawasan ojk ...,netral
2,tanyain balik “kenapa bandung?”,netral
3,"Kembali Fitri di Hari Kemenangan Sobat OJK, Bu...",netral
4,mending koe bagi,netral
...,...,...
13820,kemaren smpet baca. bpr2 udah banyak yg kolap....,netral
13821,Lah org OJK. Pantes ada kenalan seangkatan kul...,netral
13822,emg superbank tuh aman? dan diawasi ojk ga?,positif
13823,"Halo, @qafdhi . Mohon maaf terkait hal tersebu...",negatif


In [8]:
label_counts = all_data_df['Label'].value_counts()

label_summary = pd.DataFrame(label_counts).reset_index()
label_summary.columns = ['Label', 'Count']
print(label_summary)

            Label  Count
0          netral  11032
1         positif   1625
2         negatif    964
3  sangat negatif    159
4  sangat positif     45


In [9]:
os.makedirs('data/hasil', exist_ok=True)
all_data_df.to_csv('data/hasil/labeled.csv', index=False, sep=";")

In [10]:
def text_preprocessing(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+|\@\w+|\#|\d+|[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    stop_words = set(stopwords.words('indonesian'))
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    cleaned_text = ' '.join(words)
    
    return cleaned_text

In [11]:
all_data_df['Cleaned_Text'] = all_data_df['Text'].apply(text_preprocessing)

all_data_df[['Text', 'Cleaned_Text']]

Unnamed: 0,Text,Cleaned_Text
0,"UDAAHHH : ""Ayo ayooo bangun, ruang tamu belom ...",udaahhh ayo ayooo bangun ruang tamu belom bere...
1,Dah macam rentenir yg lolos pengawasan ojk ...,dah rentenir yg lolo pengawasan ojk
2,tanyain balik “kenapa bandung?”,tanyain bandung
3,"Kembali Fitri di Hari Kemenangan Sobat OJK, Bu...",fitri kemenangan sobat ojk ramadan sambut keme...
4,mending koe bagi,mending koe
...,...,...
13820,kemaren smpet baca. bpr2 udah banyak yg kolap....,kemaren smpet baca bpr udah yg kolap ojk ngaba...
13821,Lah org OJK. Pantes ada kenalan seangkatan kul...,org ojk pantes kenalan seangkatan kuliah org o...
13822,emg superbank tuh aman? dan diawasi ojk ga?,emg superbank tuh aman diawasi ojk ga
13823,"Halo, @qafdhi . Mohon maaf terkait hal tersebu...",halo mohon maaf terkait dibantu layanan silaka...


In [12]:
os.makedirs('data/hasil', exist_ok=True)
all_data_df.to_csv('data/hasil/cleaned.csv', index=False, sep=";")

In [48]:
data_clean_df = pd.read_csv('data/hasil/cleaned.csv', sep=';')
print(data_clean_df.head())

               User                                               Text  \
0      @iniagustinn  UDAAHHH : "Ayo ayooo bangun, ruang tamu belom ...   
1  @BeginiCeritany1     Dah macam rentenir yg lolos pengawasan ojk ...   
2       @bukanhadee                    tanyain balik “kenapa bandung?”   
3     @ojkindonesia  Kembali Fitri di Hari Kemenangan Sobat OJK, Bu...   
4       @bukanhadee                                   mending koe bagi   

                        Date   Label  \
0  2024-04-09 23:34:31+00:00  netral   
1  2024-04-09 23:27:12+00:00  netral   
2  2024-04-09 23:18:03+00:00  netral   
3  2024-04-09 23:07:26+00:00  netral   
4  2024-04-09 18:51:12+00:00  netral   

                                        Cleaned_Text  
0  udaahhh ayo ayooo bangun ruang tamu belom bere...  
1                dah rentenir yg lolo pengawasan ojk  
2                                    tanyain bandung  
3  fitri kemenangan sobat ojk ramadan sambut keme...  
4                                      

In [57]:
print("Jumlah nilai NaN:", data_clean_df['Cleaned_Text'].isna().sum())

Jumlah nilai NaN: 15


In [58]:
data_clean_df = data_clean_df.dropna(subset=['Cleaned_Text'])

In [64]:
print("Jumlah nilai NaN:", data_clean_df['Cleaned_Text'].isna().sum())


Jumlah nilai NaN: 0


In [68]:
X = data_clean_df['Cleaned_Text']
y = data_clean_df['Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [69]:
model = SVC(kernel='linear') 
model.fit(X_train_tfidf, y_train)

In [70]:
y_pred = model.predict(X_test_tfidf)

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))


Accuracy: 0.9656046343229544
                precision    recall  f1-score   support

       negatif       0.96      0.71      0.82       189
        netral       0.96      1.00      0.98      2209
       positif       0.99      0.90      0.94       320
sangat negatif       0.97      0.94      0.96        35
sangat positif       1.00      0.78      0.88         9

      accuracy                           0.97      2762
     macro avg       0.98      0.87      0.91      2762
  weighted avg       0.97      0.97      0.96      2762



In [20]:
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2)
grid.fit(X_train_tfidf, y_train)

print(grid.best_estimator_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ...............................C=0.1, kernel=linear; total time=  17.8s
[CV] END ...............................C=0.1, kernel=linear; total time=  13.5s
[CV] END ...............................C=0.1, kernel=linear; total time=  14.3s
[CV] END ...............................C=0.1, kernel=linear; total time=  12.6s
[CV] END ...............................C=0.1, kernel=linear; total time=  11.1s
[CV] END ..................................C=0.1, kernel=rbf; total time=  17.2s
[CV] END ..................................C=0.1, kernel=rbf; total time=  17.2s
[CV] END ..................................C=0.1, kernel=rbf; total time=  17.2s
[CV] END ..................................C=0.1, kernel=rbf; total time=  19.4s
[CV] END ..................................C=0.1, kernel=rbf; total time=  18.6s
[CV] END .................................C=1, kernel=linear; total time=   9.6s
[CV] END .................................C=1, ke

In [46]:
new_data = ["jujur ojk kerjaannya sih"]
new_data_tfidf = vectorizer.transform(new_data)
prediction = model.predict(new_data_tfidf)
print(prediction)

['netral']
