# Import statements

In [1]:
#imports
import pandas as pd
df = pd.read_csv('newdatasetfinal.csv')
df.head(5)

Unnamed: 0,title,NewCategory
0,Bitcoin is down 60 percent this year. Here's w...,Business & Finance
1,6 health problems marijuana could treat better...,Crime
2,9 charts that explain the history of global we...,Business & Finance
3,Remember when legal marijuana was going to sen...,Crime
4,Obamacare succeeded for one simple reason: it'...,Technology & Health


In [2]:
print('The shape is:', df.shape)

The shape is: (16798, 2)


In [4]:
#creating category_id feature
from io import StringIO
col = ['NewCategory', 'title']
df = df[col]
df = df[pd.notnull(df['title'])]
df.columns = ['NewCategory', 'title']
df['category_id'] = df['NewCategory'].factorize()[0]
category_id_df = df[['NewCategory', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'NewCategory']].values)
df.head()

Unnamed: 0,NewCategory,title,category_id
0,Business & Finance,Bitcoin is down 60 percent this year. Here's w...,0
1,Crime,6 health problems marijuana could treat better...,1
2,Business & Finance,9 charts that explain the history of global we...,0
3,Crime,Remember when legal marijuana was going to sen...,1
4,Technology & Health,Obamacare succeeded for one simple reason: it'...,2


# Creating new features

In [5]:
#creating numeric features for title
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(df.title).toarray()
labels = df.category_id
features.shape

(16798, 5527)

In [6]:
#Terms in the form of unigram and bigram that are most correlated to each category
from sklearn.feature_selection import chi2
import numpy as np
N = 2
for title, category in sorted(category_to_id.items()):
    features_chi2 = chi2(features, labels == category)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(category))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# '0':
  . Most correlated unigrams:
. bank
. stocks
  . Most correlated bigrams:
. men wearhouse
. jos bank
# '1':
  . Most correlated unigrams:
. ferguson
. police
  . Most correlated bigrams:
. sex marriage
. supreme court
# '4':
  . Most correlated unigrams:
. justin
. bieber
  . Most correlated bigrams:
. game thrones
. justin bieber
# '3':
  . Most correlated unigrams:
. donald
. trump
  . Most correlated bigrams:
. hillary clinton
. donald trump
# '2':
  . Most correlated unigrams:
. snowden
. titanfall
  . Most correlated bigrams:
. flappy bird
. edward snowden


# Spiltting the Data into Training and Testing

In [7]:
#Training and Testing data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features,labels, test_size = 0.2, random_state = 0)

# Training Decision Tree Classifier

In [8]:
from sklearn.neighbors import KNeighborsClassifier  
classifier = KNeighborsClassifier(n_neighbors=2)  
classifier.fit(X_train, y_train)  

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=2, p=2,
           weights='uniform')

In [9]:
y_pred = classifier.predict(X_test)

In [10]:

from sklearn.metrics import accuracy_score 
acc = accuracy_score(y_test, y_pred)
print(acc)

0.5077380952380952


# Hyperparameter Tuning

In [20]:

classifier = KNeighborsClassifier(n_neighbors=35)  
classifier.fit(X_train, y_train)  

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=35, p=2,
           weights='uniform')

In [21]:
y_pred = classifier.predict(X_test)

1. Accuracy

In [19]:
from sklearn.metrics import accuracy_score 
acc = accuracy_score(y_test, y_pred)
print("Accuracy of KNN Classifier after tuning:", acc*100)

0.7827380952380952


2. F1 Score

In [22]:
from sklearn.metrics import f1_score
print("F1 Score for all the categories:")
f1_score(y_test, y_pred, average=None)

F1 Score for all the categories:


array([0.84378109, 0.61138211, 0.76964406, 0.83236515, 0.78934221])

3. Confusion Matrix

In [23]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 424,    4,   39,   38,   21],
       [   8,  188,   51,  101,   42],
       [  16,    6,  573,   40,   25],
       [  22,   23,   97, 1003,   41],
       [   9,    4,   69,   42,  474]], dtype=int64)

4. Precision-Recall

In [24]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.81      0.84       526
           1       0.84      0.48      0.61       390
           2       0.69      0.87      0.77       660
           3       0.82      0.85      0.83      1186
           4       0.79      0.79      0.79       598

   micro avg       0.79      0.79      0.79      3360
   macro avg       0.80      0.76      0.77      3360
weighted avg       0.80      0.79      0.79      3360



References
https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
https://medium.com/@mohtedibf/in-depth-parameter-tuning-for-knn-4c0de485baf6