In [2]:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import nltk
os.makedirs(r'C:\\nltk_data', exist_ok=True)
nltk.data.path.append(r'C:\\nltk_data')  
nltk.download('punkt_tab', download_dir=r'C:\\nltk_data')
nltk.download('stopwords', download_dir=r'C:\\nltk_data')
nltk.download('wordnet', download_dir=r'C:\\nltk_data')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

[nltk_data] Downloading package punkt_tab to C:\\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to C:\\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
def read_all_csv_from_directory(directory_path):
    os.makedirs('data', exist_ok=True)
    all_files = os.listdir(directory_path)
    csv_files = [f for f in all_files if f.endswith('.csv')]
    df_list = []
    
    for file in csv_files:
        file_path = os.path.join(directory_path, file)
        try:
            df = pd.read_csv(file_path, sep=';')
            df_list.append(df)
        except Exception as e:
            print(f"Gagal membaca file {file_path}: {e}")
    
    if df_list:
        combined_df = pd.concat(df_list, ignore_index=True)
        return combined_df
    else:
        return pd.DataFrame()

directory_path = 'data'
all_data_df = read_all_csv_from_directory(directory_path)

print(all_data_df)

                   User                                               Text  \
0          @teph_tephie  kalo nawarin dan promo sampe bertubi2, tapi gi...   
1      @bebanpenyusutan  Akun kopongan sejak september khusus buat stri...   
2           @mrtweepsID  Keterlambatan OJK dalam penanganan pinjol ini ...   
3           @mrtweepsID  Kami mewakili korban pinjol tidak setuju atas ...   
4           @mrtweepsID  Kepada Yth: @BPKN_RI @KomnasHAM Berdasarkan ar...   
...                 ...                                                ...   
47663          @bayuart  kemaren smpet baca. bpr2 udah banyak yg kolap....   
47664    @gustussutsuga  Lah org OJK. Pantes ada kenalan seangkatan kul...   
47665      @kardinafisa        emg superbank tuh aman? dan diawasi ojk ga?   
47666        @XpresiBCA  Halo, @qafdhi . Mohon maaf terkait hal tersebu...   
47667      @FAIZULDGN33  Otoritas jasa keuangan (OJK) bersama pemerinta...   

                           Date  \
0      2023-11-29T23:57:11.0

In [17]:
print(all_data_df.isnull().sum())

User               0
Text               0
Date               0
Tweet Link         0
Media Links        0
Reply To       23272
dtype: int64


In [18]:
all_data_df = all_data_df[['User', 'Text', 'Date']]
all_data_df['Date'] = pd.to_datetime(all_data_df['Date'])

In [19]:
print(all_data_df.isnull().sum())

User    0
Text    0
Date    0
dtype: int64


In [20]:
all_data_df.describe()

Unnamed: 0,User,Text,Date
count,47668,47668,47668
unique,6653,16813,
top,@bukanhadee,Gais jangan dong pake pinjol ilegal ! kalo ben...,
freq,3025,100,
mean,,,2024-02-13 12:32:23.938847488+00:00
min,,,2023-11-25 08:40:34+00:00
25%,,,2023-12-28 04:12:15.750000128+00:00
50%,,,2024-01-24 00:11:53+00:00
75%,,,2024-04-28 01:37:20+00:00
max,,,2024-05-30 23:56:26+00:00


In [21]:
# def label_sentiment(text):
#     very_positive_keywords = ["luar biasa", "fantastis", "sangat bagus", "hebat"]
#     positive_keywords = ["bagus", "baik", "menarik", "aman", "gratis"]
#     neutral_keywords = ["normal", "biasanya", "biasa", "netral"]
#     negative_keywords = ["buruk", "salah", "hilang", "tidak"]
#     very_negative_keywords = ["sangat buruk", "jelek", "penipuan", "merugikan"]

#     text = text.lower()
    
#     if any(keyword in text for keyword in very_positive_keywords):
#         return 'sangat positif'
#     elif any(keyword in text for keyword in positive_keywords):
#         return 'positif'
#     elif any(keyword in text for keyword in neutral_keywords):
#         return 'netral'
#     elif any(keyword in text for keyword in negative_keywords):
#         return 'negatif'
#     elif any(keyword in text for keyword in very_negative_keywords):
#         return 'sangat negatif'
#     else:
#         return 'netral'

# all_data_df['Label'] = all_data_df['Text'].apply(label_sentiment)

# all_data_df[['Text', 'Label']]

def label_sentiment(text):
    very_positive_keywords = ["luar biasa", "fantastis", "sangat bagus", "hebat", "terpercaya", "sangat membantu", "sangat bermanfaat"]
    positive_keywords = ["bagus", "baik", "menarik", "aman", "bermanfaat", "membantu", "mendukung", "profesional"]
    neutral_keywords = ["normal", "biasa", "netral", "standar", "umum", "cukup"]
    negative_keywords = ["buruk", "kurang", "lambat", "rumit", "membingungkan", "tidak jelas"]
    very_negative_keywords = ["sangat buruk", "sangat kurang", "tidak kompeten", "merugikan", "berbahaya", "tidak terpercaya"]

    text = text.lower()
    
    negations = ["tidak", "bukan", "kurang", "belum"]
    words = text.split()
    for i, word in enumerate(words):
        if word in negations and i + 1 < len(words):
            words[i+1] = "NEG_" + words[i+1]
    text = " ".join(words)
    
    very_positive_count = sum(1 for keyword in very_positive_keywords if keyword in text) - sum(1 for keyword in very_positive_keywords if "NEG_" + keyword in text)
    positive_count = sum(1 for keyword in positive_keywords if keyword in text) - sum(1 for keyword in positive_keywords if "NEG_" + keyword in text)
    neutral_count = sum(1 for keyword in neutral_keywords if keyword in text)
    negative_count = sum(1 for keyword in negative_keywords if keyword in text) - sum(1 for keyword in negative_keywords if "NEG_" + keyword in text)
    very_negative_count = sum(1 for keyword in very_negative_keywords if keyword in text) - sum(1 for keyword in very_negative_keywords if "NEG_" + keyword in text)
    
    counts = {
        'sangat positif': very_positive_count,
        'positif': positive_count,
        'netral': neutral_count,
        'negatif': negative_count,
        'sangat negatif': very_negative_count
    }
    
    max_sentiment = max(counts, key=counts.get)
    
    if counts[max_sentiment] == 0 or len(set(counts.values())) == 1:
        return 'netral'
    
    return max_sentiment



all_data_df['Label'] = all_data_df['Text'].apply(label_sentiment)

print(all_data_df[['Text', 'Label']])


                                                    Text    Label
0      kalo nawarin dan promo sampe bertubi2, tapi gi...  positif
1      Akun kopongan sejak september khusus buat stri...   netral
2      Keterlambatan OJK dalam penanganan pinjol ini ...  negatif
3      Kami mewakili korban pinjol tidak setuju atas ...   netral
4      Kepada Yth: @BPKN_RI @KomnasHAM Berdasarkan ar...   netral
...                                                  ...      ...
47663  kemaren smpet baca. bpr2 udah banyak yg kolap....   netral
47664  Lah org OJK. Pantes ada kenalan seangkatan kul...   netral
47665        emg superbank tuh aman? dan diawasi ojk ga?  positif
47666  Halo, @qafdhi . Mohon maaf terkait hal tersebu...   netral
47667  Otoritas jasa keuangan (OJK) bersama pemerinta...   netral

[47668 rows x 2 columns]


In [23]:
label_counts = all_data_df['Label'].value_counts()

label_summary = pd.DataFrame(label_counts).reset_index()
label_summary.columns = ['Label', 'Count']
print(label_summary)

            Label  Count
0          netral  40431
1         positif   6353
2         negatif    585
3  sangat positif    259
4  sangat negatif     40


In [24]:
os.makedirs('data/hasil', exist_ok=True)
all_data_df.to_csv('data/hasil/labeled.csv', index=False, sep=";")

In [25]:
def text_preprocessing(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+|\@\w+|\#|\d+|[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    stop_words = set(stopwords.words('indonesian'))
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    cleaned_text = ' '.join(words)
    
    return cleaned_text

In [26]:
all_data_df['Cleaned_Text'] = all_data_df['Text'].apply(text_preprocessing)

all_data_df[['Text', 'Cleaned_Text']]

Unnamed: 0,Text,Cleaned_Text
0,"kalo nawarin dan promo sampe bertubi2, tapi gi...",kalo nawarin promo sampe bertubi giliran uda p...
1,Akun kopongan sejak september khusus buat stri...,akun kopongan september khusus strimingku aje ...
2,Keterlambatan OJK dalam penanganan pinjol ini ...,keterlambatan ojk penanganan pinjol memprihati...
3,Kami mewakili korban pinjol tidak setuju atas ...,mewakili korban pinjol setuju penyampaian ojk ...
4,Kepada Yth: @BPKN_RI @KomnasHAM Berdasarkan ar...,yth berdasarkan artikel yg peroleh medium nasi...
...,...,...
47663,kemaren smpet baca. bpr2 udah banyak yg kolap....,kemaren smpet baca bpr udah yg kolap ojk ngaba...
47664,Lah org OJK. Pantes ada kenalan seangkatan kul...,org ojk pantes kenalan seangkatan kuliah org o...
47665,emg superbank tuh aman? dan diawasi ojk ga?,emg superbank tuh aman diawasi ojk ga
47666,"Halo, @qafdhi . Mohon maaf terkait hal tersebu...",halo mohon maaf terkait dibantu layanan silaka...


In [27]:
os.makedirs('data/hasil', exist_ok=True)
all_data_df.to_csv('data/hasil/cleaned.csv', index=False, sep=";")

In [28]:
data_clean_df = pd.read_csv('data/hasil/cleaned.csv', sep=';')
print(data_clean_df.head())

               User                                               Text  \
0      @teph_tephie  kalo nawarin dan promo sampe bertubi2, tapi gi...   
1  @bebanpenyusutan  Akun kopongan sejak september khusus buat stri...   
2       @mrtweepsID  Keterlambatan OJK dalam penanganan pinjol ini ...   
3       @mrtweepsID  Kami mewakili korban pinjol tidak setuju atas ...   
4       @mrtweepsID  Kepada Yth: @BPKN_RI @KomnasHAM Berdasarkan ar...   

                        Date    Label  \
0  2023-11-29 23:57:11+00:00  positif   
1  2023-11-29 22:27:52+00:00   netral   
2  2023-11-29 20:58:25+00:00  negatif   
3  2023-11-29 20:56:06+00:00   netral   
4  2023-11-29 20:53:35+00:00   netral   

                                        Cleaned_Text  
0  kalo nawarin promo sampe bertubi giliran uda p...  
1  akun kopongan september khusus strimingku aje ...  
2  keterlambatan ojk penanganan pinjol memprihati...  
3  mewakili korban pinjol setuju penyampaian ojk ...  
4  yth berdasarkan artikel yg per

In [29]:
print("Jumlah nilai NaN:", data_clean_df['Cleaned_Text'].isna().sum())

Jumlah nilai NaN: 56


In [30]:
data_clean_df = data_clean_df.dropna(subset=['Cleaned_Text'])

In [31]:
print("Jumlah nilai NaN:", data_clean_df['Cleaned_Text'].isna().sum())

Jumlah nilai NaN: 0


In [32]:
X = data_clean_df['Cleaned_Text']
y = data_clean_df['Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [33]:
model = SVC(kernel='linear') 
model.fit(X_train_tfidf, y_train)

In [34]:
y_pred = model.predict(X_test_tfidf)

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))


Accuracy: 0.986138821799853
                precision    recall  f1-score   support

       negatif       0.98      0.72      0.83       115
        netral       0.99      1.00      0.99      8076
       positif       0.99      0.93      0.96      1278
sangat negatif       1.00      1.00      1.00         8
sangat positif       1.00      0.91      0.95        46

      accuracy                           0.99      9523
     macro avg       0.99      0.91      0.95      9523
  weighted avg       0.99      0.99      0.99      9523



In [20]:
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2)
grid.fit(X_train_tfidf, y_train)

print(grid.best_estimator_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ...............................C=0.1, kernel=linear; total time=  17.8s
[CV] END ...............................C=0.1, kernel=linear; total time=  13.5s
[CV] END ...............................C=0.1, kernel=linear; total time=  14.3s
[CV] END ...............................C=0.1, kernel=linear; total time=  12.6s
[CV] END ...............................C=0.1, kernel=linear; total time=  11.1s
[CV] END ..................................C=0.1, kernel=rbf; total time=  17.2s
[CV] END ..................................C=0.1, kernel=rbf; total time=  17.2s
[CV] END ..................................C=0.1, kernel=rbf; total time=  17.2s
[CV] END ..................................C=0.1, kernel=rbf; total time=  19.4s
[CV] END ..................................C=0.1, kernel=rbf; total time=  18.6s
[CV] END .................................C=1, kernel=linear; total time=   9.6s
[CV] END .................................C=1, ke

In [39]:
new_data = ["jujur ojk kerjaannya buruk"]
new_data_tfidf = vectorizer.transform(new_data)
prediction = model.predict(new_data_tfidf)
print(prediction)

['negatif']
