In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Unduh stopwords dan wordnet dari NLTK jika belum diunduh
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

import en_core_web_sm
nlp = en_core_web_sm.load()


[nltk_data] Downloading package stopwords to C:\Users\Lenovo Ideapad
[nltk_data]     320\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Lenovo Ideapad
[nltk_data]     320\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Lenovo Ideapad
[nltk_data]     320\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Memuat data dari file Excel
file_path = 'G:\Tugas Kuliah\Text Mining\Klasifikasi\data.xlsx'
data = pd.read_excel(file_path)

# Menampilkan beberapa baris pertama untuk memastikan data telah dimuat dengan benar
print(data.head())


   no                                              title  \
0   0  Saudi Arabia is trying to disrupt soccer’s wor...   
1   1  Former Augusta National Golf Club employee ple...   
2   2  Novak Djokovic named Laureus Sportsman of the ...   
3   3  Hyper-luxury golf carts that you can drive on ...   
4   4  Manchester City takes huge step toward fourth ...   

                                                text  \
0  \n\n\n\nCNN\n         — \n    \n\n\nEditor’s N...   
1  \n\n\n\nCNN\n         — \n    \n\n\n          ...   
2                                             kosong   
3  \n\n\n\nCNN\n         — \n    \n\n\n          ...   
4  \n\n\n\nCNN\n         — \n    \n\n\n          ...   

                                                imgs  \
0  ['https://media.cnn.com/api/v1/images/stellar/...   
1  ['https://media.cnn.com/api/v1/images/stellar/...   
2  ['https://media.cnn.com/api/v1/images/stellar/...   
3  ['https://media.cnn.com/api/v1/images/stellar/...   
4  ['https://media.cnn

In [3]:
def preprocess_text(text):
    # Menghapus karakter khusus dan angka
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d', '', text)

    # Mengonversi teks ke huruf kecil
    text = text.lower()

    # Menghapus stop words
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in tokens]

    # Lemmatization
    lemmatized_tokens = [nlp(token)[0].lemma_ for token in stemmed_tokens]

    # Menggabungkan kembali token yang sudah diproses
    processed_text = ' '.join(lemmatized_tokens)

    return processed_text

In [4]:
data['processed_text'] = data['text'].apply(preprocess_text)

# Menampilkan beberapa baris pertama dari dataframe setelah preprocessing
print(data[['text', 'processed_text']].head())


                                                text  \
0  \n\n\n\nCNN\n         — \n    \n\n\nEditor’s N...   
1  \n\n\n\nCNN\n         — \n    \n\n\n          ...   
2                                             kosong   
3  \n\n\n\nCNN\n         — \n    \n\n\n          ...   
4  \n\n\n\nCNN\n         — \n    \n\n\n          ...   

                                      processed_text  
0  cnn editor note version stori appear cnn meanw...  
1  cnn former employe augusta nation golf club an...  
2                                             kosong  
3  cnn bloom azalea augusta nation breathtak ligh...  
4  cnn celebr manch citi win tottenham tuesday te...  


In [5]:
data['processed_text']

0      cnn editor note version stori appear cnn meanw...
1      cnn former employe augusta nation golf club an...
2                                                 kosong
3      cnn bloom azalea augusta nation breathtak ligh...
4      cnn celebr manch citi win tottenham tuesday te...
                             ...                        
341    cnn master basic bridgerton possibl sit back l...
342    cnn tenni love mean noth love also littl chall...
343    cnn magician david copperfield deni alleg sexu...
344    cnn time return middl earth prime video relea ...
345    cnn dabney coleman veteran film televis actor ...
Name: processed_text, Length: 346, dtype: object

In [6]:
data

Unnamed: 0,no,title,text,imgs,other_links,category,processed_text
0,0,Saudi Arabia is trying to disrupt soccer’s wor...,\n\n\n\nCNN\n — \n \n\n\nEditor’s N...,['https://media.cnn.com/api/v1/images/stellar/...,"['https://bleacherreport.com/nhl', 'https://bl...",sports,cnn editor note version stori appear cnn meanw...
1,1,Former Augusta National Golf Club employee ple...,\n\n\n\nCNN\n — \n \n\n\n ...,['https://media.cnn.com/api/v1/images/stellar/...,"['https://bleacherreport.com/nhl', 'https://bl...",sports,cnn former employe augusta nation golf club an...
2,2,Novak Djokovic named Laureus Sportsman of the ...,kosong,['https://media.cnn.com/api/v1/images/stellar/...,"['https://bleacherreport.com/nhl', 'https://bl...",sports,kosong
3,3,Hyper-luxury golf carts that you can drive on ...,\n\n\n\nCNN\n — \n \n\n\n ...,['https://media.cnn.com/api/v1/images/stellar/...,"['https://bleacherreport.com/nhl', 'https://bl...",sports,cnn bloom azalea augusta nation breathtak ligh...
4,4,Manchester City takes huge step toward fourth ...,\n\n\n\nCNN\n — \n \n\n\n ...,['https://media.cnn.com/api/v1/images/stellar/...,"['https://bleacherreport.com/nhl', 'https://bl...",sports,cnn celebr manch citi win tottenham tuesday te...
...,...,...,...,...,...,...,...
341,341,‘Bridgerton’ Season 3 review: The Netflix show...,\n\n\n\nCNN\n — \n \n\n\n ...,['https://media.cnn.com/api/v1/images/stellar/...,"['https://facebook.com/CNNent', 'https://twitt...",entertainment,cnn master basic bridgerton possibl sit back l...
342,342,‘Challengers’ review: Zendaya stars in directo...,\n\n\n\nCNN\n — \n \n\n\n ...,['https://media.cnn.com/api/v1/images/stellar/...,"['https://facebook.com/CNNent', 'https://twitt...",entertainment,cnn tenni love mean noth love also littl chall...
343,343,Magician David Copperfield denies sexual assau...,\n\n\n\nCNN\n — \n \n\n\n ...,['https://media.cnn.com/api/v1/images/stellar/...,"['https://facebook.com/CNNent', 'https://twitt...",entertainment,cnn magician david copperfield deni alleg sexu...
344,344,‘Lord of the Rings: The Rings of Power’ Season...,\n\n\n\nCNN\n — \n \n\n\n ...,['https://media.cnn.com/api/v1/images/stellar/...,"['https://facebook.com/CNNent', 'https://twitt...",entertainment,cnn time return middl earth prime video relea ...


TF-IDF dan Splitting Data

In [7]:
tfidf_vectorizer = TfidfVectorizer()
X = data.drop(['no','text','imgs','other_links','title'],axis=1)
y = data['category']

X = tfidf_vectorizer.fit_transform(data['processed_text'])  # Fitur TF-IDF
with open('tfidf.pkl', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)

# Splitting data into training and test set:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train

<276x13258 sparse matrix of type '<class 'numpy.float64'>'
	with 70799 stored elements in Compressed Sparse Row format>

Pycaret untuk mencari rekomendasi metode klasifikasi dataset

In [8]:
from pycaret.datasets import get_data
from pycaret.classification import *

In [9]:
# Melakukan setup data untuk proses modeling dengan menggunakan data kita dan category sebagai target.
s = setup(data, target='category')

Unnamed: 0,Description,Value
0,Session id,1023
1,Target,category
2,Target type,Multiclass
3,Target mapping,"entertainment: 0, health: 1, politics: 2, sports: 3, style: 4, world: 5"
4,Original data shape,"(346, 7)"
5,Transformed data shape,"(346, 7)"
6,Transformed train set shape,"(242, 7)"
7,Transformed test set shape,"(104, 7)"
8,Numeric features,1
9,Categorical features,5


In [10]:
# Membandingkan 16 metode berbeda dengan data kita
best = s.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.9708,1.0,0.9708,0.9782,0.9706,0.9643,0.966,0.256
lr,Logistic Regression,0.8717,0.0,0.8717,0.8626,0.8543,0.8412,0.8473,1.684
gbc,Gradient Boosting Classifier,0.778,0.0,0.778,0.7792,0.7427,0.72,0.7451,1.203
lightgbm,Light Gradient Boosting Machine,0.7355,0.932,0.7355,0.7024,0.6791,0.6708,0.7005,0.572
dt,Decision Tree Classifier,0.6367,0.7803,0.6367,0.4913,0.5389,0.5567,0.6048,0.23
rf,Random Forest Classifier,0.6198,0.9756,0.6198,0.6007,0.5405,0.5308,0.5672,0.495
ada,Ada Boost Classifier,0.5992,0.0,0.5992,0.417,0.4778,0.4915,0.55,0.397
ridge,Ridge Classifier,0.5292,0.0,0.5292,0.4956,0.4327,0.4036,0.4492,0.228
nb,Naive Bayes,0.4548,0.6979,0.4548,0.259,0.3231,0.3262,0.3679,0.261
lda,Linear Discriminant Analysis,0.4138,0.0,0.4138,0.2911,0.2972,0.2932,0.3515,0.228


In [11]:
# Melihat model terbaik
s.predict_model(best)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,no,title,text,imgs,other_links,processed_text,category,prediction_label,prediction_score
323,323,Ilana Glazer and Michelle Buteau are both moms...,\n\n\n\nCNN\n — \n \n\n\n ...,['https://media.cnn.com/api/v1/images/stellar/...,"['https://facebook.com/CNNent', 'https://twitt...",cnn audienc forgive forget ilana glazer michel...,entertainment,entertainment,1.0
101,101,Ötzi the Iceman has the world’s oldest tattoos...,\n\nSign up for CNN’s Wonder Theory science ne...,['https://media.cnn.com/api/v1/images/stellar/...,"['https://facebook.com/cnnstyle', 'https://twi...",sign cnn wonder theori scienc newslett explor ...,style,style,1.0
25,25,FIFA to take legal advice on calls to suspend ...,\n\n\n\nCNN\n — \n \n\n\n ...,['https://media.cnn.com/api/v1/images/stellar/...,"['https://bleacherreport.com/nhl', 'https://bl...",cnn world soccer govern bodi fifa hold special...,sports,sports,1.0
76,76,Drag queen Nymphia Wind performs at Taiwan’s p...,"\n\n\nTaipei, Taiwan\nCNN\n — \n \n...",['https://media.cnn.com/api/v1/images/stellar/...,"['https://facebook.com/cnnstyle', 'https://twi...",taipei taiwan cnn taiwane drag queen nymphia w...,style,style,1.0
334,334,Jean Smart was ‘nervous’ comics wouldn’t buy h...,\n\n\n\nCNN\n — \n \n\n\n ...,['https://media.cnn.com/api/v1/images/stellar/...,"['https://facebook.com/CNNent', 'https://twitt...",cnn jean smart never do standup comedi take ro...,entertainment,entertainment,1.0
...,...,...,...,...,...,...,...,...,...
95,95,Photographer Platon sits presidents and protes...,\n\n\n\nCNN\n — \n \n\n\n ...,['https://media.cnn.com/api/v1/images/stellar/...,"['https://facebook.com/cnnstyle', 'https://twi...",cnn vladimir putin muammar gaddafi mark zucker...,style,style,1.0
269,269,Many high schools are curbing the use of AI. T...,\n\n\n\nCNN\n — \n \n\n\n ...,['https://media.cnn.com/api/v1/images/stellar/...,"['https://facebook.com/cnnbusiness', 'https://...",cnn princeton high school student tri combat r...,world,world,1.0
250,250,"Rouen, France: Police shoot dead armed attacke...",\n\n\nParis\nCNN\n — \n \n\n\n ...,['https://media.cnn.com/api/v1/images/stellar/...,"['https://facebook.com/CNN', 'https://twitter....",pari cnn polic shot dead arm attack tri set fi...,world,world,1.0
53,53,Manchester United wins Women’s FA Cup for firs...,\n\n\n\nCNN\n — \n \n\n\nManchester...,['https://media.cnn.com/api/v1/images/stellar/...,"['https://bleacherreport.com/nhl', 'https://bl...",cnn manch unit woman fa cup first time histori...,sports,sports,1.0


In [12]:
# menampilkan arameter terbaik untuk metode dan data kita
display(best)

In [13]:
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',metric_params=None, n_jobs=-1, n_neighbors=5, p=2, weights='uniform')

In [14]:
# Melatih model
knn.fit(X_train, y_train)
with open('knn.pkl', 'wb') as file:
    pickle.dump(knn, file)

In [15]:
# Memprediksi label untuk data uji
y_pred = knn.predict(X_test)

In [16]:
# Mencetak laporan klasifikasi dan akurasi
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

entertainment       0.86      0.67      0.75         9
       health       1.00      0.67      0.80         6
     politics       0.81      0.93      0.87        14
       sports       0.61      0.92      0.73        12
        style       0.88      0.75      0.81        20
        world       0.38      0.33      0.35         9

     accuracy                           0.74        70
    macro avg       0.76      0.71      0.72        70
 weighted avg       0.76      0.74      0.74        70



In [17]:
print("Accuracy Score:")
print(accuracy_score(y_test, y_pred))

Accuracy Score:
0.7428571428571429
