In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated entries of the form "<directory>:<percent-encoded download URL>".
# NOTE(review): the URL carries X-Goog-Date / X-Goog-Expires query parameters, so it
# is presumably a time-limited signed link -- regenerate it if the download fails.
DATA_SOURCE_MAPPING = 'analisis-sentimen-terkait-intensif-mobil-listrik:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F3598426%2F6260824%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240317%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240317T150921Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D146bdb09e4cee2558a6f37f14a2915bdf5d1f0b81a5cdcb56c93ec7103fab96c5165b802a7d795b34055bd6c50b6247baff2992f7a279a23db90fd17f6b16b67d8050cb53d485d5471f6103d72cf8bb36441ce35dc0a5a47fd0dc9fd95a9b86ace82dd0886be8061cd56a2a090786eb9befd3c80c22a432addcd935954fae408fa257b4c554a8d0f1b31da9564253ce8fbc2b9dc29e00209bea20b35f1b661415dfc48ee2fa001102d8d1913971b37e769ac9209f5b8f6d99bbd4019be73c5b231524a4121e759b04ee07b73a21ead71c5a6cb8a1b9a31ea704a876bdb49b7c72d5fe370402ad6134515b5f05ad2075be9b33fcb3451d2edeae294235fe027c0'

# Directory that datasets are extracted into.
KAGGLE_INPUT_PATH='/kaggle/input'
# Directory created alongside the input directory for notebook outputs.
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced anywhere else in this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<percent-encoded URL>" entry of DATA_SOURCE_MAPPING and
# unpack the archive into KAGGLE_INPUT_PATH/<directory>. A failed download is reported
# and the loop moves on to the next entry.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # The URL part is percent-encoded, so the first ':' is the separator.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The content-length header may be absent; the progress bar then stays empty
            # instead of crashing on int(None).
            content_length = fileres.headers['content-length']
            if content_length and content_length.isdigit():
                total_length = int(content_length)
            else:
                total_length = 0
            print(f"Downloading {directory}, {content_length or 'unknown'} bytes compressed")
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by name.
            tfile.flush()
            # NOTE(review): extractall trusts the member paths stored in the archive.
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name so the imported `tarfile` module is not rebound.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it has to be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


# Import Dataset

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Path of the comment dataset extracted under the Kaggle input directory.
DATASET_CSV_PATH = "/kaggle/input/analisis-sentimen-terkait-intensif-mobil-listrik/mobil_listrik.csv"
df = pd.read_csv(DATASET_CSV_PATH)

In [None]:
# Preview the first rows of the freshly loaded dataset.
df.head()

# Visualisasi

### Pie Chart

In [None]:
# Count the comments per sentiment label in one pass; a label that never occurs counts as 0.
sentiment_totals = df['sentimen'].value_counts()
positif_count = int(sentiment_totals.get('positif', 0))
negatif_count = int(sentiment_totals.get('negatif', 0))
netral_count = int(sentiment_totals.get('netral', 0))

# Slice sizes, display labels and slice colours, all in the same order.
sentimen = [positif_count, negatif_count, netral_count]
labels = ['Positif', 'Negatif', 'Netral']
colors = ['#55a868', '#c44e52', '#4c72b0']

# Draw the pie on an explicit Axes, with percentages printed on each slice.
fig, ax = plt.subplots()
ax.pie(sentimen, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
ax.set_title('Persentase Sentimen')

# Equal aspect ratio keeps the pie circular.
ax.axis('equal')
plt.show()

Dapat dilihat bahwa komentar paling banyak berlabel sentimen negatif. Ini berarti banyak masyarakat yang tidak setuju atau tidak mendukung pemberlakuan insentif mobil listrik di Indonesia.

### Wordcloud

In [None]:
from wordcloud import WordCloud

In [None]:
def plot_sentiment_wordcloud(text, title):
    """Generate and display one word cloud figure.

    Args:
        text: Space-joined comment text of a single sentiment class.
        title: Title shown above the figure.

    Returns:
        The generated ``WordCloud`` object, or ``None`` when ``text`` is blank
        (``WordCloud.generate`` cannot build a cloud from zero words).
    """
    if not text.strip():
        print(f'{title}: tidak ada teks untuk ditampilkan')
        return None
    cloud = WordCloud(width=800, height=400).generate(text)
    plt.figure(figsize=(8, 4))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(title)
    plt.axis('off')
    plt.show()
    return cloud


# Replace missing cleaned text with '' so the rows can be joined as strings.
# This updates df in place; later cells read the same column.
df['text_cleaning'] = df['text_cleaning'].fillna('')

# One word cloud per sentiment label, shown in the order neutral, negative, positive.
WORDCLOUD_TITLES = {
    'netral': 'Wordcloud - Sentimen Netral',
    'negatif': 'Wordcloud - Sentimen Negatif',
    'positif': 'Wordcloud - Sentimen Positif',
}
for sentiment_label, wordcloud_title in WORDCLOUD_TITLES.items():
    # Concatenate every cleaned comment carrying this label into one string.
    sentiment_text = ' '.join(df.loc[df['sentimen'] == sentiment_label, 'text_cleaning'])
    plot_sentiment_wordcloud(sentiment_text, wordcloud_title)

Visualisasi di atas menampilkan wordcloud kata yang paling banyak muncul pada komentar berlabel sentimen negatif. Kata yang paling sering muncul pada komentar negatif antara lain: beli, subsidi, kendaraan, banyak, dan sebagainya. Kata-kata ini dapat menjadi masukan bagi kebijakan "Insentif Mobil Listrik" agar memperhatikan kebutuhan masyarakat (subsidi dan kendaraan), harga beli, dan lainnya.

Visualisasi di atas juga menampilkan wordcloud kata yang paling banyak muncul pada komentar berlabel sentimen positif. Kata yang paling sering muncul pada komentar positif antara lain: subsidi, harga, ev, mahal, dan sebagainya.

# Menyiapkan Data

In [None]:
# Name of the dataframe column that holds the cleaned comment text.
COL = 'text_cleaning'

In [None]:
# Echo the selected text column; the trailing expression displays df's (rows, columns).
print(COL)
df.shape

In [None]:
# Preview the first rows of df again before the labels are encoded.
df.head()

In [None]:
# Numeric code assigned to each sentiment label.
sentimen_mapping = {"negatif": -1, "netral": 0, "positif": 1}

# Series.map turns any label outside the mapping (and any missing label) into NaN,
# which would only surface later when the model is fitted. Fail here instead, with a
# message that names the offending values.
unknown_labels = sorted(
    str(label) for label in set(df["sentimen"].dropna().unique()) - set(sentimen_mapping)
)
missing_label_count = int(df["sentimen"].isna().sum())
if unknown_labels or missing_label_count:
    raise ValueError(
        f"Cannot encode column 'sentimen': unknown labels {unknown_labels}, "
        f"{missing_label_count} missing value(s)"
    )

# Encode the "sentimen" column into a new "sentimen_encoded" column.
df["sentimen_encoded"] = df["sentimen"].map(sentimen_mapping)

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Unigram TF-IDF features; terms found in fewer than 3 documents or in more than
# half of all documents are dropped from the vocabulary.
# NOTE(review): the vectorizer is fitted on the full dataset before the train/test
# split, so vocabulary and IDF weights also see the test comments. Fitting on the
# training portion only would avoid that leakage.
tfidf = TfidfVectorizer(min_df=3, max_df=0.5, ngram_range=(1, 1))
features = tfidf.fit_transform(df[COL].values)

# toarray() yields a plain ndarray (todense() returns the legacy np.matrix type).
# Reusing df.index keeps the rows aligned with df when the target column is attached.
dfuni = pd.DataFrame(
    features.toarray(),
    columns=tfidf.get_feature_names_out(),
    index=df.index,
)

In [None]:
# Show (number of comments, number of TF-IDF terms) of the feature frame.
print(dfuni.shape)

In [None]:
# Preview the first rows of the TF-IDF feature frame.
dfuni.head()

# Split Data

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import tree, metrics
from sklearn.metrics import classification_report

In [None]:
# Attach the encoded sentiment as target column 'y'; pandas aligns the values on index labels.
dfuni['y'] = df['sentimen_encoded']

In [None]:
# Separate the TF-IDF feature columns from the single-column target frame.
feature_frame = dfuni.drop(columns='y')
target_frame = dfuni[['y']]

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    target_frame,
    test_size=0.30,
    random_state=555,
)

Pada proses ini digunakan fungsi train_test_split dari modul sklearn.model_selection untuk membagi data menjadi data latih (X_train dan y_train) dan data uji (X_test dan y_test) dengan proporsi 70% data latih dan 30% data uji. Kolom sentimen dipilih sebagai label, yaitu variabel dependen (target) yang akan diprediksi oleh model klasifikasi.

In [None]:
# Re-attach the target column to each feature split (joined on the shared row index).
dfunitrain = X_train.join(y_train)
dfunitest = X_test.join(y_test)

In [None]:
# Pull features (every column except 'y') and the target Series back out of each joined frame.
X_unitrain, y_unitrain = dfunitrain.drop(columns='y'), dfunitrain['y']
X_unitest, y_unitest = dfunitest.drop(columns='y'), dfunitest['y']

# SVM

In [None]:
from sklearn import svm

In [None]:
# RBF-kernel SVM. C=1000 is a large penalty value (weak regularisation), gamma is
# left at the library default, and max_iter=-1 places no cap on solver iterations.
clf = svm.SVC(C=1000, kernel='rbf', max_iter=-1)
clf.fit(X_unitrain, y_unitrain.values.ravel())

# Predict once and reuse the predictions for both reports; clf.score() would run
# the prediction a second time to arrive at the same accuracy.
y_unipred = clf.predict(X_unitest)
print(metrics.accuracy_score(y_unitest, y_unipred))
print(metrics.classification_report(y_unitest, y_unipred, digits=3))

Berdasarkan output tersebut, model SVM memiliki akurasi prediksi sebesar 78% untuk kasus "Analisis Sentimen Terkait Intensif Mobil Listrik" dengan menggunakan data komentar pada platform YouTube.

Author:BILIARTO SASTRO CEMERSON

https://www.kaggle.com/code/billycemerson/analisis-sentimen-intensif-mobil-listrik-svm#Split-Data