In [1]:
import re
import os
import codecs
import math
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt

from mllab.NaiveBayes import NaiveBayes

In [2]:
"""Downloading the datasets in specific directory"""
!mkdir datasets/newsgroups
!curl -o datasets/newsgroups/news.tar.gz "http://qwone.com/~jason/20Newsgroups/20news-bydate.tar.gz"

"""Extract the files"""
!gzip -d < datasets/newsgroups/news.tar.gz | tar xf - --directory datasets/newsgroups

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 13.7M  100 13.7M    0     0  1699k      0  0:00:08  0:00:08 --:--:-- 2438k


In [3]:
"""Get all the directory and subdirectory in the folder"""
"""Count documents in each category"""
def showDirectory(path):
    """Print how many samples each category sub-directory of `path` holds.

    Every sub-directory of `path` is treated as one category and every entry
    inside it as one sample. Categories are reported in sorted order,
    followed by the grand total.

    Parameters
    ----------
    path : str
        Directory whose immediate sub-directories are the categories.

    Side effects
    ------------
    Rebinds the module-level name `subdirs` to the list of category
    directory names found under `path` (kept for backward compatibility).
    """
    global subdirs
    # Only real directories are categories: a stray file such as `.DS_Store`
    # would otherwise be counted as a category and crash the inner listdir.
    subdirs = [entry for entry in os.listdir(path)
               if os.path.isdir(os.path.join(path, entry))]
    print("No. of categories : ", len(subdirs))
    print("----------------------------------")
    # Named `total_samples` so the builtin `sum` is not shadowed.
    total_samples = 0
    for subdir in sorted(subdirs):
        samples_per_category = len(os.listdir(os.path.join(path, subdir)))
        total_samples += samples_per_category
        print("No. of samples: ", samples_per_category, " in ", subdir)
    print("----------------------------------")
    print("Total number of samples ", total_samples)

# Locations of the extracted train / test splits of the 20 Newsgroups archive.
train_path, test_path = (
    'datasets/newsgroups/20news-bydate-train',
    'datasets/newsgroups/20news-bydate-test',
)

# Report the per-category sample counts of the training split.
showDirectory(path=train_path)
#showDirectory(test_path)

No. of categories :  20
----------------------------------
No. of samples:  480  in  alt.atheism
No. of samples:  584  in  comp.graphics
No. of samples:  591  in  comp.os.ms-windows.misc
No. of samples:  590  in  comp.sys.ibm.pc.hardware
No. of samples:  578  in  comp.sys.mac.hardware
No. of samples:  593  in  comp.windows.x
No. of samples:  585  in  misc.forsale
No. of samples:  594  in  rec.autos
No. of samples:  598  in  rec.motorcycles
No. of samples:  597  in  rec.sport.baseball
No. of samples:  600  in  rec.sport.hockey
No. of samples:  595  in  sci.crypt
No. of samples:  591  in  sci.electronics
No. of samples:  594  in  sci.med
No. of samples:  593  in  sci.space
No. of samples:  599  in  soc.religion.christian
No. of samples:  546  in  talk.politics.guns
No. of samples:  564  in  talk.politics.mideast
No. of samples:  465  in  talk.politics.misc
No. of samples:  377  in  talk.religion.misc
----------------------------------
Total number of samples  11314


# Extract the datasets and export them to CSV with pandas

In [4]:
def CreateDataFrame(path, writepath):
    """Collect every document under `path` into one CSV file.

    Each sub-directory of `path` is a news category and each file inside it
    is one document. A document is decoded as latin1, lower-cased, and split
    at the first blank line: everything after it is kept as the body. If the
    document has no blank line the body is the empty string.

    Parameters
    ----------
    path : str
        Directory whose immediate sub-directories are the categories.
    writepath : str
        Destination CSV path. The file gets the columns `body` and
        `news_category` (the sub-directory name) and no index column.

    Side effects
    ------------
    Rebinds the module-level name `subdirs` to the list of category
    directory names found under `path` (kept for backward compatibility).
    """
    global subdirs
    # Skip stray non-directory entries (e.g. `.DS_Store`) that would make the
    # inner os.listdir raise NotADirectoryError.
    subdirs = [entry for entry in os.listdir(path)
               if os.path.isdir(os.path.join(path, entry))]

    # Accumulate plain dicts and build the frame once: `DataFrame.append`
    # was removed in pandas 2.0 and copied the whole frame on every call.
    records = []
    for subdir in sorted(subdirs):
        filepath = os.path.join(path, subdir)
        # Sorted so the row order of the CSV is reproducible across runs;
        # os.listdir returns entries in arbitrary order.
        for unique in sorted(os.listdir(filepath)):
            finalpath = os.path.join(filepath, unique)
            with codecs.open(finalpath, encoding='latin1') as handle:
                text = handle.read().lower()
            _header, _blank_line, body = text.partition('\n\n')
            records.append({'body': body, 'news_category': subdir})

    # Explicit columns keep the CSV header stable even when no file is found.
    df = pd.DataFrame(records, columns=['body', 'news_category'])
    df.to_csv(writepath, index=False)

In [5]:
# For each split: the directory of raw documents and the CSV it is exported to.
train_path, write_train = (
    'datasets/newsgroups/20news-bydate-train',
    'datasets/news_train.csv',
)
test_path, write_test = (
    'datasets/newsgroups/20news-bydate-test',
    'datasets/news_test.csv',
)

In [6]:
# Flatten each split's directory tree into its own CSV file.
CreateDataFrame(path=train_path, writepath=write_train)
CreateDataFrame(path=test_path, writepath=write_test)

In [7]:
# Reload the exported splits.
# `keep_default_na=False` keeps empty fields as '' instead of NaN: a document
# with no body is written as an empty field, and a NaN read back here would
# later be turned into the literal token 'nan' by the `.astype('U')` cast.
news_train = pd.read_csv('datasets/news_train.csv', keep_default_na=False)
news_test = pd.read_csv('datasets/news_test.csv', keep_default_na=False)

__Convert text categories to numerical form__

In [8]:
# Encode the labels of BOTH splits with the single category -> code mapping
# derived from the training split. Encoding each frame independently only
# yields matching codes when both hold exactly the same set of labels.
label_categories = news_train['news_category'].astype('category').cat.categories

news_train['news_category'] = pd.Categorical(
    news_train['news_category'], categories=label_categories
).codes

# A test label that never occurs in the training split is encoded as -1.
news_test['news_category'] = pd.Categorical(
    news_test['news_category'], categories=label_categories
).codes

__Cleaning data__

In [9]:
def clean_data(df):
    """Normalise raw text to lower-case ASCII words separated by single spaces.

    Parameters
    ----------
    df : pandas.Series (or any list-like of str)
        The raw documents. Despite the name this is a column of text, not a
        DataFrame.

    Returns
    -------
    pandas.DataFrame
        A frame with a single column `text` holding the cleaned documents.
        Missing values are returned as empty strings.
    """
    dfs = pd.DataFrame()
    dfs['text'] = df
    dfs['text'] = (
        dfs['text']
        # A missing document becomes '' so that a later string cast cannot
        # turn NaN into the spurious word 'nan'.
        .fillna('')
        # convert text to lower case
        .str.lower()
        # Replace every run of non-alphabetic characters (punctuation,
        # digits, whitespace) with one space; this equals the former
        # "replace each character, then squeeze spaces" two-pass approach.
        # `regex=True` is required: pandas >= 2.0 defaults to literal
        # matching.
        .str.replace(r'[^a-zA-Z]+', ' ', regex=True)
    )
    return dfs

In [10]:
# clean_data returns a one-column frame; pick its `text` column explicitly
# when writing the cleaned documents back over `body`.
news_train['body'] = clean_data(news_train['body'])['text']
news_test['body'] = clean_data(news_test['body'])['text']

__Convert to Unicode strings__

In [11]:
# Features are the document bodies cast to unicode strings; targets are the
# integer category codes.
trainx, trainy = news_train['body'].astype('U'), news_train['news_category']

testx, testy = news_test['body'].astype('U'), news_test['news_category']

__Convert to numpy array__

In [12]:
# Swap the pandas Series for plain numpy arrays before handing them to the model.
trainx, trainy = np.array(trainx), np.array(trainy)

testx, testy = np.array(testx), np.array(testy)

# Final magic : Naive Bayes :)

In [13]:
# Build the project's NaiveBayes classifier (imported from mllab.NaiveBayes)
# with its default settings.
nb = NaiveBayes()
# NOTE(review): presumably `train` fits the model on the raw document strings
# and their integer category codes -- confirm against mllab.NaiveBayes.
nb.train(trainx, trainy)

In [14]:
# Run the trained classifier on the held-out documents.
# NOTE(review): presumably returns one predicted category code per element of
# `testx`, since it is compared with `testy` below -- confirm against mllab.
y_pred = nb.infer(testx)

In [15]:
from sklearn import metrics

# Fraction of test documents whose predicted category equals the true one,
# reported as a percentage.
accuracy = metrics.accuracy_score(testy, y_pred)
print('Accuracy: %2.2f %%' % (100. * accuracy))

Accuracy: 73.82 %
