Note: Next exampes use 'scikit-learn' library.
If you are not using Anaconda, install them with
``pipenv install scikit-learn``


In [100]:
import numpy as np
import pandas as pd

import sklearn.datasets as skd

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

from sklearn import metrics
from sklearn.metrics import accuracy_score

import json


## Import and Analyze the data

In [102]:
# our json data is not a list of objects, so lets make it:
with open('../datasets/TextClassification/News_Category_Dataset_v2._prepared.json') as f:
    lines = f.readlines()
    joined_lines = '[' + ','.join(lines) + ']'
    json_data = json.loads(joined_lines)
    
df = pd.DataFrame(json_data)
df.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [106]:
# let's check how many documents we have per each category:
df.groupby('category').count()

Unnamed: 0_level_0,headline,authors,link,short_description,date
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ARTS,1509,1509,1509,1509,1509
ARTS & CULTURE,1339,1339,1339,1339,1339
BLACK VOICES,4528,4528,4528,4528,4528
BUSINESS,5937,5937,5937,5937,5937
COLLEGE,1144,1144,1144,1144,1144
COMEDY,5175,5175,5175,5175,5175
CRIME,3405,3405,3405,3405,3405
CULTURE & ARTS,1030,1030,1030,1030,1030
DIVORCE,3426,3426,3426,3426,3426
EDUCATION,1004,1004,1004,1004,1004


## Preprocess the data

For the sake of example, we will use only 2 categories: 'ARTS' and 'BUSINESS'. So let's create a new DataFrama containing only the rows for these categories

In [108]:
simple_df = df[ (df.category=='ARTS') | (df.category=='BUSINESS') ]

# we can save it as json, if we need:
# simple_df.to_json('../datasets/TextClassification/News_Category_Dataset_v2_simplified.json')

# check the categories:
simple_df.groupby('category').count()

Unnamed: 0_level_0,headline,authors,link,short_description,date
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ARTS,1509,1509,1509,1509,1509
BUSINESS,5937,5937,5937,5937,5937


In [79]:
df = pd.read_json('../datasets/TextClassification/News_Category_Dataset_v2_simplified.json')
# print(df.loc[df.category=='ARTS','short_description'])
# df.info()

In [88]:
# check if there are 'short_description' cells containing empty strings:
df[df.short_description==''].count()

category             1
headline             1
authors              1
link                 1
short_description    1
date                 1
dtype: int64

fill empty 'short_description' cells with 'headline' vallues

In [99]:
# remove lines with empry headline and short_descripion
df = df.loc[ (df.short_description!='') & (df.headline!='') ]

# now fill:
df.short_description = np.where(df.short_description=='', df.headline, df.short_description)

# check again:
df[df.short_description!=''].count()

category             7445
headline             7445
authors              7445
link                 7445
short_description    7445
date                 7445
dtype: int64

In [4]:
# https://www.kaggle.com/rmisra/news-category-dataset/data#

categories = ['sci.med', 'sci.space']
news_train = skd.load_files('../../datasets/TextClassification/20news-bydate-train/', categories= categories, encoding= 'ISO-8859-1')
news_test = skd.load_files('../../datasets/TextClassification/20news-bydate-test/',categories= categories, encoding= 'ISO-8859-1')

# look at the data structure:
print( f' news_train keys: {news_train.keys()}' )

# let's create a dataframe:
df_train = pd.DataFrame({
    'data': news_train.data,    
    'target':news_train.target,    
})

df_test = pd.DataFrame({
    'data': news_test.data,    
    'target':news_test.target,    
})

FileNotFoundError: [Errno 2] No such file or directory: '../../datasets/TextClassification/20news-bydate-train/'

In [4]:
df_train.head(3)

Unnamed: 0,data,target
0,From: weilej@cary115.its.rpi.edu (Jason Lee We...,0
1,From: ridout@bink.plk.af.mil (Brian S. Ridout...,0
2,From: harti@mikro.ee.tu-berlin.de (Stefan Hart...,0


In [5]:
df_test.head(3)

Unnamed: 0,data,target
0,From: Christopher.Vance@adfa.oz.au (Christophe...,3
1,From: grieggs@jpl-devvax.jpl.nasa.gov (John T....,0
2,From: lilley@v5.cgu.mcc.ac.uk (Chris Lilley)\n...,0


In [6]:
df_train.groupby('target').count()

Unnamed: 0_level_0,data
target,Unnamed: 1_level_1
0,584
1,594
2,593
3,599


In [7]:
df_test.groupby('target').count()

Unnamed: 0_level_0,data
target,Unnamed: 1_level_1
0,389
1,396
2,394
3,398


## Preprocessing (stop-words removal, stemming)

## TF-IDF

## Classification (Naive Bayes)

### Train the classifier

In [8]:
# instantiate the classifier:
clf = Pipeline([('vect', TfidfVectorizer()), 
                ('clf', MultinomialNB()) ])

clf.fit(news_train.data, news_train.target)

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

### Test the classifier

In [9]:
# Predict, using the test cases
predicted = clf.predict(news_test.data)

res = predicted == news_test.target

print( f'Correct: {np.count_nonzero( (res) )} out of {np.size(res)}' )
print( f'False: {np.size(res) - np.count_nonzero((res))} ' )


Correct: 1415 out of 1577
False: 162 


## Results estimation

In [10]:
accuracy = str(np.mean(predicted == news_test.target))
print(f'Accuracy = {accuracy}\n')

report = metrics.classification_report(news_test.target, predicted, target_names=news_test.target_names)
print(f'Classification report: \n{report}')

# metrics.confusion_matrix(news_test.target, predicted)

Accuracy = 0.8972733037412809

Classification report: 
                        precision    recall  f1-score   support

         comp.graphics       0.96      0.88      0.92       389
               sci.med       0.97      0.81      0.88       396
             sci.space       0.94      0.92      0.93       394
soc.religion.christian       0.78      0.98      0.87       398

              accuracy                           0.90      1577
             macro avg       0.91      0.90      0.90      1577
          weighted avg       0.91      0.90      0.90      1577

