## Naive Bayes Topic Classifier<br>MDS201803<br>MDS201811

In [1]:
import numpy as np
import pandas as pd
import re

### Importing the data as one string

In [2]:
ls=''
for i in list(range(0,22)):
    if i<10:
        fl_name=r"C:/Users/LENOVO/Downloads/reuters21578/reut2-00"+str(0+i)+".sgm"
    else:
        fl_name=r"C:/Users/LENOVO/Downloads/reuters21578/reut2-0"+str(0+i)+".sgm"
    with open(fl_name,"r") as f:
        ls = ls + f.read()   
        

### Extracting the news Body using regular expression

In [3]:
body=[]
r=re.compile(r"<REUTERS([\S\s]*?)</TEXT>")
b_1=re.compile(r"<BODY>([\S\s]*?)</BODY>")
b_2=re.compile(r"<TITLE>([\S\s]*?)</TITLE>")
b_3=re.compile(r"&#2;([\S\s]*?)&#3;")
t_1=re.compile(r"<TEXT>")
t_2=re.compile(r'''<TEXT TYPE="BRIEF">''')
n=r.findall(ls)
for x in n:
    if t_1.findall(x)!=[]:
        body.append(b_1.findall(x))
    elif t_2.findall(x)!=[]:
        body.append(b_2.findall(x))
    else:
        body.append(b_3.findall(x))

### Extracting the Topics of news articles using regular expression

In [4]:
topic=[]
import re
p=re.compile(r"<TOPICS(>[<D>(\S)</D>]*<)/TOPICS>")
q=re.compile(r"<D>([\w-]*)</D>")
m = p.findall(ls)
for x in m:
    if x=='><':
        topic.append(["None"])
    else:
        topic.append(q.findall(x))

### Extracting predefined Type(Train-Test) of news article using regular expression 

In [5]:
train_test=[]
l=re.compile(r'''CGISPLIT=([\w"-]*)''')
train_test=l.findall(ls)

#### The following code excludes all news articles from the dataframe which have assigned

In [6]:
indices=[i for i,val in enumerate(topic) if val==['None'] ]
topic1 = topic[:]
for i in indices:
    topic.remove(topic1[i]) 
body1=body[:]
for i in indices:
    body.remove(body1[i])               
train_test1=train_test[:]
for i in indices:
    train_test.remove(train_test1[i])     

#### The function preprocess_string does the following:
<br>        1. everything apart from letters is excluded
<br>         2. multiple spaces are replaced by single space
<br>         3. str_arg is converted to lower case  

In [7]:
def preprocess_string(str_arg):
    cleaned_str=re.sub('[^a-z\s]+',' ',str_arg,flags=re.IGNORECASE) #every char except alphabets is replaced
    cleaned_str=re.sub('(\s+)',' ',cleaned_str) #multiple spaces are replaced by single space
    cleaned_str=cleaned_str.lower() #converting the cleaned string to lower case
    return cleaned_str # returning the preprocessed string 

In [8]:
Body=[]
for i in range(len(body)):
    Body.append((preprocess_string(body[i][0])))

In [9]:
df_tuples=list(zip(topic,Body,train_test))
df = pd.DataFrame(df_tuples, columns = ['Topics', 'Body', 'Train_test']) 

#### The following function replicates a news body corresponding to each assigned topics to it

In [10]:
def explode(df, lst_cols, fill_value='', preserve_index=False):
    # make sure `lst_cols` is list-alike
    if (lst_cols is not None
        and len(lst_cols) > 0
        and not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)
    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()
    # preserve original index values    
    idx = np.repeat(df.index.values, lens)
    # create "exploded" DF
    res = (pd.DataFrame({
                col:np.repeat(df[col].values, lens)
                for col in idx_cols},
                index=idx)
             .assign(**{col:np.concatenate(df.loc[lens>0, col].values)
                            for col in lst_cols}))
    # append those rows that have empty lists
    if (lens == 0).any():
        # at least one list in cells is empty
        res = (res.append(df.loc[lens==0, idx_cols], sort=False)
                  .fillna(fill_value))
    # revert the original index order
    res = res.sort_index()
    # reset index if requested
    if not preserve_index:        
        res = res.reset_index(drop=True)
    return res

In [11]:
df_rep=explode(df,["Topics"], fill_value='', preserve_index=True)
len(df_rep)

14302

#### The following code assigns unique numeric labels to each of the topics 

In [12]:
from sklearn.preprocessing import LabelEncoder 
leb=LabelEncoder()
df_rep["Topic_Encoded"]=leb.fit_transform(df_rep['Topics'])

### Test train split of data according to predefined lables

In [13]:
Topic_train = df_rep[df_rep['Train_test']== '"TRAINING-SET"']["Topic_Encoded"] 
Topic_test = df_rep[df_rep['Train_test']!= '"TRAINING-SET"']["Topic_Encoded"] 
Body_train = df_rep[df_rep['Train_test']== '"TRAINING-SET"']["Body"] 
Body_test = df_rep[df_rep['Train_test']!= '"TRAINING-SET"']["Body"] 

### Test train split of data according to random allocation (80:20)

In [14]:
from sklearn.model_selection import train_test_split

X = df_rep['Body']
y = df_rep['Topic_Encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17)

print("Shape of X is {}".format(X.shape))
print("Shape of X_train is {} and shape of y_train is {}".format(X_train.shape, y_train.shape))
print("Shape of X_test is {} and shape of y_test is {}".format(X_test.shape, y_test.shape))

Shape of X is (14302,)
Shape of X_train is (10726,) and shape of y_train is (10726,)
Shape of X_test is (3576,) and shape of y_test is (3576,)


#### Here we use tfidf vectorizer to give more weightage to  relevant words in the news document

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
vectorizer = TfidfVectorizer()

### Multinomial Naive Bayes
#### The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

In [16]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()

In [17]:
pipeline_bayes = Pipeline([('vectorizer', TfidfVectorizer()),('classifier', naive_bayes)])

### Case1:

#### Topic_train,Body_train and Topic_test,Body_test are respectively training and testing datasets which are already defined as in the Reuters documentation


In [18]:
pipeline_bayes.fit(Body_train, Topic_train)
label_predicted = pipeline_bayes.predict(Body_test) #Prediction on testing data

In [19]:
#Accuracy of the model using Multinomial Naive Bayes. 

accuracy = accuracy_score(Topic_test, label_predicted)
print("Accuracy: ", accuracy)

Accuracy:  0.4206989247311828


### Case 2:

#### X_train,y_train and X_test,y_test are respectively training and testing datasets which are generated random allocation (80:20)

In [20]:
pipeline_bayes.fit(X_train, y_train) #Fitting the multinomial model to training data.
label_predicted = pipeline_bayes.predict(X_test) #testing on the test data.

In [21]:
#Accuracy using multinomial naive bayes

accuracy = accuracy_score(y_test, label_predicted)
print("Accuracy: ", accuracy)

Accuracy:  0.38814317673378074


### Complement Naive Bayes
#### The Complement Naive Bayes classifier was designed to correct the “severe assumptions” made by the standard Multinomial Naive Bayes classifier. It is particularly suited for imbalanced data sets.

In [22]:
from sklearn.naive_bayes import ComplementNB
naive_bayes_c = ComplementNB()

In [23]:
pipeline_bayes_c = Pipeline([('vectorizer', TfidfVectorizer()),('classifier', naive_bayes_c)])

### Case 1:

#### Topic_train,Body_train and Topic_test,Body_test are respectively training and testing datasets which are already defined as in the Reuters documentation


In [24]:
pipeline_bayes_c.fit(Body_train, Topic_train)
label_predicted = pipeline_bayes_c.predict(Body_test) #Prediction on testing data

In [25]:
#Accuracy of the model using Multinomial Naive Bayes. 

accuracy = accuracy_score(Topic_test, label_predicted)
print("Accuracy: ", accuracy)

Accuracy:  0.646505376344086


### Case 2

#### X_train,y_train and X_test,y_test are respectively training and testing datasets which are generated random allocation (80:20)

In [26]:
pipeline_bayes_c.fit(X_train, y_train) #Fitting the multinomial model to training data.
label_predicted = pipeline_bayes_c.predict(X_test) #testing on the test data.

In [27]:
#Accuracy using multinomial naive bayes

accuracy = accuracy_score(y_test, label_predicted)
print("Accuracy: ", accuracy)

Accuracy:  0.47175615212527966
