# **Import packages**

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from matplotlib.pyplot import *

# **Load the 20 Newsgroups data from sklearn**

In [None]:
# Load both the train and test portions of the corpus (subset='all').
# `remove=` strips headers, footers and quoted replies so the classifiers
# see only the body text of each post.
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

In [None]:
# Number of documents that were loaded.
len(newsgroups.data)

## **Inspect the data**

In [None]:
# Names of the target classes provided with the dataset.
newsgroups.target_names

In [None]:
# First document in the corpus.
newsgroups.data[0]

In [None]:
# Target value stored for the first document.
newsgroups.target[0]

In [None]:
# Short aliases for the documents and their targets; the split cell below
# consumes these two names.
data = newsgroups.data
label = newsgroups.target

In [None]:
# Show which container types hold the documents and the targets.
print(type(newsgroups.data))
print(type(newsgroups.target))

## **Split data**

In [None]:
# Hold out 27.5% of the documents for testing. The fixed random_state makes
# the shuffle, and therefore every score computed from this split,
# reproducible from one run of the notebook to the next.
train_data, test_data, train_label, test_label = train_test_split(
    data, label, test_size=0.275, shuffle=True, random_state=42
)

In [None]:
# Peek at the first training example together with its label.
first_doc, first_label = train_data[0], train_label[0]
print("data", first_doc)
print("Label ", first_label)

# **Build the vocabulary and TF-IDF vectors**

In [None]:
vectorizer = TfidfVectorizer()

# The vectorizer is fitted on the training documents only; the test documents
# are transformed with that already-fitted vectorizer.
train_TF_IDF = vectorizer.fit_transform(raw_documents=train_data)
test_TF_IDF = vectorizer.transform(raw_documents=test_data)

In [None]:
# Print the shape of the train matrix, then of the test matrix.
for tfidf_matrix in (train_TF_IDF, test_TF_IDF):
    print(tfidf_matrix.shape)

In [None]:
# Dump the training TF-IDF matrix to inspect its entries.
print(train_TF_IDF)

In [None]:
# Shape check for the train and test matrices (train first, then test).
for tfidf_matrix in (train_TF_IDF, test_TF_IDF):
    print(tfidf_matrix.shape)

# **Training**

## **Naive Bayes**

In [None]:
# Multinomial Naive Bayes: fit on the full training matrix, then score the
# test predictions with macro-averaged F1.
NB = MultinomialNB(alpha=0.01)
NB.fit(X=train_TF_IDF, y=train_label)

pred = NB.predict(test_TF_IDF)
f1_score = metrics.f1_score(y_true=test_label, y_pred=pred, average='macro')
print(f1_score)

## **K-nearest neighbors**

In [None]:
# K-nearest neighbors with K = 10: fit on the full training matrix, then score
# the test predictions with macro-averaged F1.
neigh = KNeighborsClassifier(n_neighbors=10)
neigh.fit(X=train_TF_IDF, y=train_label)

pred = neigh.predict(test_TF_IDF)
f1_score = metrics.f1_score(y_true=test_label, y_pred=pred, average='macro')
print(f1_score)

## **Run with different training-set sizes**

In [None]:
# Learning-curve experiment: refit both classifiers on growing prefixes of the
# training set and record the macro-averaged F1 on the fixed test set.
result = {
    "KNN": [],  # K-Nearest Neighbors
    "NB": [],   # Naive Bayes
}

# The last size is the whole training set. It is read from the matrix rather
# than hard-coded (previously 13663) so it stays correct if the split ratio or
# the dataset changes. Intermediate steps that are not smaller than the
# training set are dropped, because slicing past the end would just repeat the
# full-set run under a misleading size label.
n_train = train_TF_IDF.shape[0]
sizes = [size for size in (1000, 3000, 5000, 7000, 9000, 11000) if size < n_train]
sizes.append(n_train)

# Define the classification models (refitted from scratch at every size)
neigh = KNeighborsClassifier(n_neighbors=10)
NB = MultinomialNB(alpha=.01)

for size in sizes:
    # Take the first `size` rows of the TF-IDF matrix and the matching labels
    train_TF_IDF_cr = train_TF_IDF[:size, :]
    train_label_cr = train_label[:size]

    # Train with KNN
    print('[INFO] Cls: KNN, size_train: {}, K = 10 '.format(size))
    neigh.fit(train_TF_IDF_cr, train_label_cr)
    pred_KNN = neigh.predict(test_TF_IDF)
    f1_score_KNN = metrics.f1_score(test_label, pred_KNN, average='macro')
    print('[INFO] Cls: KNN, size_train: {}, K = 10, f1-score: {}'.format(size, f1_score_KNN))
    result['KNN'].append(f1_score_KNN)

    # Train with NB
    print('[INFO] Cls: NB, size_train: {}'.format(size))
    NB.fit(train_TF_IDF_cr, train_label_cr)
    pred_NB = NB.predict(test_TF_IDF)
    f1_score_NB = metrics.f1_score(test_label, pred_NB, average='macro')
    print('[INFO] Cls: NB, size_train: {}, f1-score: {}'.format(size, f1_score_NB))
    result['NB'].append(f1_score_NB)




# **Draw curve**

In [None]:
# Raw F1 scores per classifier, one entry for each training-set size tried.
print(result)

In [None]:
# Plot macro F1 against training-set size, one line per classifier.
f1_score_KNNs, f1_score_NBs = result["KNN"], result["NB"]

# KNN is drawn in red and NB in green; the legend entries follow draw order.
for scores, line_style in ((f1_score_KNNs, 'r-'), (f1_score_NBs, 'g-')):
    plot(sizes, scores, line_style)

title('Compare f1-score KNN and NB')
xlabel('Size train')
ylabel('f1-score')
legend(['KNN', 'NB'])
show()