In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# T·∫£i stopwords ti·∫øng Vi·ªát n·∫øu ch∆∞a c√≥
nltk.download('stopwords')
vietnamese_stopwords = set(stopwords.words('english'))  # Ch∆∞a c√≥ stopwords ti·∫øng Vi·ªát s·∫µn

# üìå B∆∞·ªõc 1: ƒê·ªçc d·ªØ li·ªáu
file_path = "/content/drive/MyDrive/BTL_DataMining/Data_BTL/shuffled_data_vnexpress_titles.csv"  # ƒê·ªïi ƒë∆∞·ªùng d·∫´n n·∫øu c·∫ßn
df = pd.read_csv(file_path)

# üìå B∆∞·ªõc 2: Ti·ªÅn x·ª≠ l√Ω d·ªØ li·ªáu
def preprocess_text(text):
    text = text.lower()  # Chuy·ªÉn th√†nh ch·ªØ th∆∞·ªùng
    text = re.sub(r'\d+', '', text)  # Lo·∫°i b·ªè s·ªë
    text = re.sub(r'[^\w\s]', '', text)  # Lo·∫°i b·ªè k√Ω t·ª± ƒë·∫∑c bi·ªát
    text = ' '.join([word for word in text.split() if word not in vietnamese_stopwords])  # Lo·∫°i b·ªè stopwords
    return text

df.dropna(inplace=True)  # X√≥a d√≤ng c√≥ gi√° tr·ªã tr·ªëng
df['Title'] = df['Title'].astype(str).apply(preprocess_text)  # √Åp d·ª•ng x·ª≠ l√Ω

# üìå B∆∞·ªõc 3: Chia d·ªØ li·ªáu th√†nh t·∫≠p train v√† test
X = df['Title']
y = df['Category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# üìå B∆∞·ªõc 4: Bi·ªÉu di·ªÖn vƒÉn b·∫£n th√†nh vector TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)  # Ch·ªâ gi·ªØ 5000 t·ª´ quan tr·ªçng nh·∫•t
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# üìå B∆∞·ªõc 5: Hu·∫•n luy·ªán m√¥ h√¨nh SVM
svm_model = SVC(kernel='linear')  # D√πng kernel tuy·∫øn t√≠nh cho b√†i to√°n ph√¢n lo·∫°i vƒÉn b·∫£n
svm_model.fit(X_train_tfidf, y_train)

# üìå B∆∞·ªõc 6: D·ª± ƒëo√°n tr√™n t·∫≠p ki·ªÉm tra
y_pred = svm_model.predict(X_test_tfidf)

# üìå B∆∞·ªõc 7: ƒê√°nh gi√° m√¥ h√¨nh
accuracy = accuracy_score(y_test, y_pred)
print(f"üéØ Accuracy: {accuracy:.4f}")
print("üìä Classification Report:\n", classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


üéØ Accuracy: 0.7241
üìä Classification Report:
               precision    recall  f1-score   support

   C√¥ng ngh·ªá       0.80      0.80      0.80        10
    Gi√°o d·ª•c       1.00      0.78      0.88         9
    Gi·∫£i tr√≠       0.78      0.58      0.67        12
  Kinh doanh       0.43      0.67      0.52         9
    S·ª©c kh·ªèe       0.88      0.64      0.74        11
    Th·ªÉ thao       0.70      1.00      0.82         7

    accuracy                           0.72        58
   macro avg       0.76      0.74      0.74        58
weighted avg       0.77      0.72      0.73        58

