1. Import Library

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the preprocessing step: the stop-word list
# and WordNet (backing WordNetLemmatizer). 'omw-1.4' is requested as well
# because some NLTK releases raise a LookupError from the WordNet reader when
# that resource is not installed.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

2. Load Dataset

In [None]:
# Read the raw dataset from the CSV file in the working directory.
df = pd.read_csv('dataset.csv')

# Preview the first rows.
print(df.head())
# DataFrame.info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()

3. Exploratory Data Analysis (EDA)

In [None]:
# Label distribution: number of samples per class.
fig, ax = plt.subplots()
df['label'].value_counts().plot(kind='bar', ax=ax)
ax.set_title('Distribusi Label Kesehatan Mental')
ax.set_xlabel('Label')
ax.set_ylabel('Jumlah data')
plt.show()

# Text length in characters. Missing texts are replaced by empty strings first;
# otherwise astype(str) would turn NaN into the 3-character string "nan".
df['text_length'] = df['text'].fillna('').astype(str).map(len)

# Histogram of the text lengths computed above.
fig, ax = plt.subplots()
ax.hist(df['text_length'], bins=30)
ax.set_title('Distribusi Panjang Teks')
ax.set_xlabel('Panjang teks (jumlah karakter)')
ax.set_ylabel('Frekuensi')
plt.show()

4. Pre-processing Text

In [None]:
# Single WordNet lemmatizer instance shared by every clean_text call.
lemmatizer = WordNetLemmatizer()
# English stop words from NLTK, held in a set for membership tests while
# filtering tokens.
stop_words = {word for word in stopwords.words('english')}


# Compiled once at module level and reused by every clean_text call.
_URL_PATTERN = re.compile(r'http\S+|www\S+')
_NON_LETTER_PATTERN = re.compile(r'[^a-z\s]')


def clean_text(text):
    """Normalize one raw text into a space-separated string of lemmas.

    Processing steps, in order:
      1. lowercase the text;
      2. remove URLs (anything starting with ``http`` or ``www``);
      3. replace every character other than ``a-z`` and whitespace with a space;
      4. split on whitespace and drop tokens found in ``stop_words``;
      5. lemmatize the remaining tokens with ``lemmatizer``.

    Non-letter characters are replaced by a space rather than deleted so that
    words separated only by punctuation ("sad,tired", "well-being") are not
    glued into one token, and so that contractions split into fragments
    ("don't" -> "don", "t") that the stop-word filter can remove.

    Args:
        text: Raw text. ``None`` and float NaN are treated as empty text; any
            other non-string value is converted with ``str()``.

    Returns:
        The cleaned tokens joined by single spaces; an empty string when
        nothing remains.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = _URL_PATTERN.sub(' ', text)
    text = _NON_LETTER_PATTERN.sub(' ', text)

    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )


# Missing texts are replaced by empty strings before cleaning; a bare
# astype(str) would turn NaN into the literal token "nan", which would then be
# passed on to the vectorizers as if it were a real word.
df['clean_text'] = df['text'].fillna('').astype(str).apply(clean_text)


5. Feature Engineering

In [None]:
# Split and vectorizer settings gathered in one place so they are easy to tune.
TEST_SIZE = 0.2
RANDOM_STATE = 42
MAX_FEATURES = 5000

X = df['clean_text']
y = df['label']

# Stratify on the label so the train and test splits keep the same class
# proportions. Stratification needs at least two samples in every class, so
# fall back to a plain random split when some class is rarer than that.
stratify_labels = y if y.value_counts().min() >= 2 else None

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=stratify_labels,
)

# TF-IDF features: the vectorizer is fitted on the training split only and then
# applied unchanged to the test split, so no test information leaks into it.
tfidf = TfidfVectorizer(max_features=MAX_FEATURES)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Bag-of-words (raw term counts), fitted on the training split in the same way.
bow = CountVectorizer(max_features=MAX_FEATURES)
X_train_bow = bow.fit_transform(X_train)
X_test_bow = bow.transform(X_test)

6. Training Model

In [None]:
# Build both classifiers up front, then fit each on its own feature matrix.
logreg = LogisticRegression(max_iter=1000)  # trained on TF-IDF features
nb = MultinomialNB()                        # trained on bag-of-words counts

# Logistic Regression <- TF-IDF
logreg.fit(X_train_tfidf, y_train)

# Multinomial Naive Bayes <- bag of words
nb.fit(X_train_bow, y_train)

7. Evaluasi Model

In [None]:
# Predict on the held-out split; each model uses the feature matrix it was
# trained on.
y_pred_lr = logreg.predict(X_test_tfidf)
y_pred_nb = nb.predict(X_test_bow)

# Accuracy and per-class precision/recall/F1 for both models, printed in the
# same order: Logistic Regression first, then Naive Bayes.
for model_name, y_pred in (
    ('Logistic Regression', y_pred_lr),
    ('Naive Bayes', y_pred_nb),
):
    print(f'{model_name} Accuracy:', accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

# Confusion matrix for Logistic Regression. The class order is made explicit
# (sorted union of true and predicted labels) so the same names can be used as
# tick labels on the heatmap; rows are true labels, columns are predictions.
class_names = np.union1d(y_test, y_pred_lr)
cm = confusion_matrix(y_test, y_pred_lr, labels=class_names)

fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Confusion Matrix - Logistic Regression')
plt.show()

8. Kesimpulan
Logistic Regression dengan TF-IDF memberikan performa yang lebih stabil
dibandingkan Naive Bayes, terutama pada data teks dengan variasi kata yang tinggi.