Note: Next exampes use 'scikit-learn' library.
If you are not using Anaconda, install them with
``pipenv install scikit-learn``


In [51]:
import numpy as np
import pandas as pd

import sklearn.datasets as skd

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split


from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import json


## Import and Analyze the data

In [52]:
# our json data is not a list of objects, so lets make it:
with open('../../datasets/TextClassification/News_Category_Dataset_v2.json') as f:
    lines = f.readlines()
    joined_lines = '[' + ','.join(lines) + ']'
    json_data = json.loads(joined_lines)
    
full_df = pd.DataFrame(json_data)
full_df.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [53]:
# let's check how many documents we have per each category:
full_df.groupby('category').count()

Unnamed: 0_level_0,headline,authors,link,short_description,date
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ARTS,1509,1509,1509,1509,1509
ARTS & CULTURE,1339,1339,1339,1339,1339
BLACK VOICES,4528,4528,4528,4528,4528
BUSINESS,5937,5937,5937,5937,5937
COLLEGE,1144,1144,1144,1144,1144
COMEDY,5175,5175,5175,5175,5175
CRIME,3405,3405,3405,3405,3405
CULTURE & ARTS,1030,1030,1030,1030,1030
DIVORCE,3426,3426,3426,3426,3426
EDUCATION,1004,1004,1004,1004,1004


## Preprocess the data

For the sake of example, we will use only 2 categories: 'ARTS' and 'BUSINESS'. So let's create a new DataFrama containing only the rows for these categories

In [54]:
df = full_df[ (full_df.category=='ARTS') | (full_df.category=='BUSINESS') ]

# we can save it as json, if we need:
# simple_df.to_json('../datasets/TextClassification/News_Category_Dataset_v2_simplified.json')

# check the categories:
df.groupby('category').count()

Unnamed: 0_level_0,headline,authors,link,short_description,date
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ARTS,1509,1509,1509,1509,1509
BUSINESS,5937,5937,5937,5937,5937


In [55]:
# df = pd.read_json('../datasets/TextClassification/News_Category_Dataset_v2_simplified.json')
# print(df.loc[df.category=='ARTS','short_description'])
# df.info()

In [56]:
# check if there are 'short_description' cells containing empty strings:
df[df.short_description==''].count()

category             1506
headline             1506
authors              1506
link                 1506
short_description    1506
date                 1506
dtype: int64

fill empty 'short_description' cells with 'headline' vallues

In [57]:
# remove lines with empty headline and short_descripion
df = df.loc[ (df.short_description!='') & (df.headline!='') ]

# now fill:
df.short_description = np.where(df.short_description=='', df.headline, df.short_description)

# check again:
df[df.short_description==''].count()

category             0
headline             0
authors              0
link                 0
short_description    0
date                 0
dtype: int64

### Select Features and separate the train/test sets

In [58]:
# https://www.kaggle.com/rmisra/news-category-dataset/data#

df[ ['short_description','category' ] ]

X_train, X_test, y_train, y_test = train_test_split(
    df['short_description'],
    df['category'],
    random_state=42)

## Preprocessing (stop-words removal, stemming)

## TF-IDF

## Classification (Naive Bayes)

### Train the classifier

In [59]:
# instantiate the classifier:
clf = Pipeline([('vect', TfidfVectorizer()), 
                ('clf', MultinomialNB()) ])

clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

### Test the classifier

In [62]:
# Predict, using the test cases
y_pred = clf.predict(X_test)

res = y_pred == y_test

print( f'Correct: {np.count_nonzero( (res) )} out of {np.size(res)}' )
print( f'False: {np.size(res) - np.count_nonzero((res))} ' )


Correct: 1279 out of 1485
False: 206 


## Results estimation

In [63]:
accuracy = str(np.mean(y_pred == y_test))
print(f'Accuracy = {accuracy}\n')

report = metrics.classification_report(y_test, y_pred, target_names=['ARTS','BUSINESS'])

print(f'Classification report: \n{report}')

Accuracy = 0.8612794612794613

Classification report: 
              precision    recall  f1-score   support

        ARTS       1.00      0.00      0.01       207
    BUSINESS       0.86      1.00      0.93      1278

    accuracy                           0.86      1485
   macro avg       0.93      0.50      0.47      1485
weighted avg       0.88      0.86      0.80      1485



In [75]:
confusion_matrix(y_test, y_pred, labels=['ARTS','BUSINESS'])

array([[   1,  206],
       [   0, 1278]])

<img src="../../images/
Confusion-matrix-and-Metrics.png">