In [3]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [4]:
data = pd.read_csv('berita.csv')

In [5]:
# Drop data kosong
data = data.dropna(subset=['judul', 'konten', 'kategori'])

# Drop duplikat
data = data.drop_duplicates()

In [6]:
# Filter kategori valid
valid_kategori = ['finance', 'inet', 'sport', 'oto', 'travel', 'food', 'health', 'edu', 'properti']
data = data[data['kategori'].isin(valid_kategori)]

In [7]:
# Normalisasi teks: lowercasing, removing numbers, punctuation, and special characters
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

data['judul'] = data['judul'].apply(clean_text)
data['konten'] = data['konten'].apply(clean_text)

In [8]:
# Tokenization, Stopword Removal, and Stemming
stop_words = set(stopwords.words('indonesian'))
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

In [9]:
def preprocess_text(text):
    tokens = word_tokenize(text)  # Tokenization
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    stemmed_tokens = [stemmer.stem(word) for word in tokens]  # Stemming menggunakan Sastrawi
    return ' '.join(stemmed_tokens)

data['judul'] = data['judul'].apply(preprocess_text)
data['konten'] = data['konten'].apply(preprocess_text)

In [10]:
# Convert 'tanggal' ke datetime
data['tanggal'] = pd.to_datetime(data['tanggal'], errors='coerce')
data = data.dropna(subset=['tanggal'])

  data['tanggal'] = pd.to_datetime(data['tanggal'], errors='coerce')


In [11]:
# Combine 'judul' and 'konten' into a single text feature
data['text'] = data['judul'] + " " + data['konten']
X = data['text']
y = data['kategori']

In [15]:
# Convert text data to numerical data using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 features
X = vectorizer.fit_transform(X)

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [13]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)

NameError: name 'X_tfidf' is not defined

In [None]:
# Model Training
# KNN Classifier
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# SVM Classifier
svm_model = SVC(kernel='linear', probability=True)
svm_model.fit(X_train, y_train)

# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
# Model Evaluation
def evaluate_model(model, X_test, y_test, model_name):
    print(f"Evaluation for {model_name}:")
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    print("-" * 60)

# Evaluate each model
evaluate_model(knn_model, X_test, y_test, "KNN")
evaluate_model(svm_model, X_test, y_test, "SVM")
evaluate_model(rf_model, X_test, y_test, "Random Forest")
