In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# load dataset
dataset = fetch_20newsgroups()
x, y = dataset.data, dataset.target

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=45)

vectorizer = TfidfVectorizer(stop_words="english")
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)


model = KNeighborsClassifier(n_neighbors = 3)
model.fit(x_train, y_train)

pred = model.predict(x_test)

print(metrics.classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.78      0.91      0.84       129
           1       0.54      0.74      0.63       162
           2       0.56      0.77      0.65       128
           3       0.60      0.67      0.64       163
           4       0.71      0.65      0.68       167
           5       0.73      0.70      0.71       142
           6       0.61      0.45      0.52       150
           7       0.79      0.75      0.77       150
           8       0.84      0.88      0.86       146
           9       0.86      0.84      0.85       153
          10       0.89      0.86      0.88       157
          11       0.93      0.92      0.92       137
          12       0.83      0.62      0.71       149
          13       0.93      0.81      0.87       133
          14       0.89      0.89      0.89       151
          15       0.86      0.85      0.85       118
          16       0.90      0.89      0.89       150
          17       0.88    

array([7, 4, 4, ..., 3, 1, 8])

In [10]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertModel
import torch
from sklearn.neighbors import  KNeighborsClassifier
from sklearn.model_selection import  train_test_split
from sklearn import  metrics

# Tải tập dữ liệu
dataset = fetch_20newsgroups()
X, y = dataset.data, dataset.target

# Chia dữ liệu thành tập train và tập test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=45)

# # Biểu diễn dữ liệu dưới dạng TF-IDF
# vectorizer = TfidfVectorizer(stop_words='english')
# X_train = vectorizer.fit_transform(X_train)
# X_test = vectorizer.transform(X_test)

# Biểu diễn dữ liệu dưới dạng BERT
model = BertModel.from_pretrained('bert-base-uncased') # Tải pre-trained model
bertTokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # Tải pre-trained tokenizer

# Tokenize và mã hóa dữ liệu
X_train = [bertTokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512) for text in X_train]
X_test = [bertTokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512) for text in X_test]

with torch.no_grad():  # Tắt tính năng tính gradient
    X_train = [model(**text).last_hidden_state[:, 0, :].numpy() for text in X_train]
    X_test = [model(**text).last_hidden_state[:, 0, :].numpy() for text in X_test]

X_train = np.array(X_train)
X_test = np.array(X_test)


# Huấn luyện mô hình
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)

# Dự đoán dữ liệu test
pred = model.predict(X_test)

# Hiển thị kết quả
print(metrics.classification_report(y_test,pred))
print("Accuray: ", metrics.accuracy_score(y_test, pred))
print(metrics.classification_report(y_test, pred, target_names=dataset.target_names))

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

#### Decision Tree

In [32]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

# load dataset
dataset = fetch_20newsgroups()
x, y = dataset.data, dataset.target

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=45)

vectorizer = CountVectorizer(stop_words="english", ngram_range=(1, 1))
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)


model = DecisionTreeClassifier(max_depth=90)
model.fit(x_train, y_train)

pred = model.predict(x_test)

print(metrics.classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.76      0.60      0.68       129
           1       0.69      0.55      0.61       162
           2       0.65      0.64      0.64       128
           3       0.68      0.47      0.56       163
           4       0.76      0.58      0.66       167
           5       0.80      0.61      0.69       142
           6       0.78      0.69      0.73       150
           7       0.72      0.55      0.62       150
           8       0.91      0.76      0.83       146
           9       0.71      0.69      0.70       153
          10       0.91      0.68      0.78       157
          11       0.87      0.80      0.83       137
          12       0.22      0.62      0.32       149
          13       0.37      0.72      0.49       133
          14       0.79      0.66      0.72       151
          15       0.71      0.76      0.74       118
          16       0.77      0.67      0.72       150
          17       0.88    

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

# load dataset
dataset = fetch_20newsgroups()
x, y = dataset.data, dataset.target

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=45)

vectorizer = CountVectorizer(stop_words="english")
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)


model = DecisionTreeClassifier(max_depth=100)
model.fit(x_train, y_train)

pred = model.predict(x_test)

print(metrics.classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.69      0.60      0.64       129
           1       0.61      0.56      0.59       162
           2       0.59      0.66      0.62       128
           3       0.53      0.50      0.52       163
           4       0.66      0.60      0.63       167
           5       0.67      0.60      0.63       142
           6       0.70      0.72      0.71       150
           7       0.48      0.62      0.54       150
           8       0.81      0.77      0.79       146
           9       0.61      0.73      0.66       153
          10       0.82      0.70      0.76       157
          11       0.80      0.80      0.80       137
          12       0.43      0.48      0.45       149
          13       0.71      0.71      0.71       133
          14       0.74      0.72      0.73       151
          15       0.69      0.81      0.75       118
          16       0.78      0.67      0.72       150
          17       0.85    

In [72]:
import numpy as np
def BagofWord(input):
    sentences = []
    input = input.split('.')

    for senten in input:
        sen = senten.split(',')
        for i in sen:
            sentences.append(i)
    for i in sentences:
        if i == '':
            sentences.remove(i)

    bagofword = CountVectorizer()
    result = bagofword.fit_transform(sentences)
    index_token = bagofword.get_feature_names_out()
    return result.toarray()
result = BagofWord("Atoms of radioactive elements can split")
# result = np.array(result[0])
extra_columns = np.zeros((result.shape[0], 113167))
result = np.hstack((result, extra_columns))

model.predict(result)

array([7])