## Naive Bayes Topic Classifier<br>MDS201803<br>MDS201811

In [1]:
import numpy as np
import pandas as pd
import re

### Importing the data as one string

In [2]:
ls=''
for i in list(range(0,22)):
    if i<10:
        fl_name=r"C:/Users/LENOVO/Downloads/reuters21578/reut2-00"+str(0+i)+".sgm"
    else:
        fl_name=r"C:/Users/LENOVO/Downloads/reuters21578/reut2-0"+str(0+i)+".sgm"
    with open(fl_name,"r") as f:
        ls = ls + f.read()   
        

### Extracting the news Body using regular expression

In [3]:
body=[]
r=re.compile(r"<REUTERS([\S\s]*?)</TEXT>")
b_1=re.compile(r"<BODY>([\S\s]*?)</BODY>")
b_2=re.compile(r"<TITLE>([\S\s]*?)</TITLE>")
b_3=re.compile(r"&#2;([\S\s]*?)&#3;")
t_1=re.compile(r"<TEXT>")
t_2=re.compile(r'''<TEXT TYPE="BRIEF">''')
n=r.findall(ls)
for x in n:
    if t_1.findall(x)!=[]:
        body.append(b_1.findall(x))
    elif t_2.findall(x)!=[]:
        body.append(b_2.findall(x))
    else:
        body.append(b_3.findall(x))

### Extracting the Topics of news articles using regular expression

In [4]:
topic=[]
import re
p=re.compile(r"<TOPICS(>[<D>(\S)</D>]*<)/TOPICS>")
q=re.compile(r"<D>([\w-]*)</D>")
m = p.findall(ls)
for x in m:
    if x=='><':
        topic.append(["None"])
    else:
        topic.append(q.findall(x))

### Extracting predefined Type(Train-Test) of news article using regular expression 

In [5]:
train_test_c=[]
l=re.compile(r'''CGISPLIT=([\w"-]*)''')
train_test_c=l.findall(ls)

In [6]:
train_test_l=[]
l=re.compile(r'''LEWISSPLIT=([\w"-]*)''')
train_test_l=l.findall(ls)

#### The following code excludes all news articles from the dataframe which have no topics assigned

In [7]:
indices=[i for i,val in enumerate(topic) if val==['None'] ]
topic1 = topic[:]
for i in indices:
    topic.remove(topic1[i]) 
body1=body[:]
for i in indices:
    body.remove(body1[i])               
train_test_c1=train_test_c[:]
for i in indices:
    train_test_c.remove(train_test_c1[i])     
train_test_l1=train_test_l[:]
for i in indices:
    train_test_l.remove(train_test_l1[i])

#### The function preprocess_string does the following:
<br>        1. everything apart from letters is excluded
<br>         2. multiple spaces are replaced by single space
<br>         3. str_arg is converted to lower case  

In [8]:
def preprocess_string(str_arg):
    cleaned_str=re.sub('[^a-z\s]+',' ',str_arg,flags=re.IGNORECASE) #every char except alphabets is replaced
    cleaned_str=re.sub('(\s+)',' ',cleaned_str) #multiple spaces are replaced by single space
    cleaned_str=cleaned_str.lower() #converting the cleaned string to lower case
    return cleaned_str # returning the preprocessed string 

In [9]:
Body=[]
for i in range(len(body)):
    Body.append((preprocess_string(body[i][0])))

In [10]:
df_tuples=list(zip(topic,Body,train_test_c,train_test_l))
df = pd.DataFrame(df_tuples, columns = ['Topics', 'Body', 'Train_test_c','Train_test_l']) 

In [11]:
df.drop_duplicates(subset = "Body" ,inplace = True)
df.reset_index(drop=True,inplace=True)

#### The following function replicates a news body corresponding to each assigned topics to it

In [12]:
def explode(df, lst_cols, fill_value='', preserve_index=False):
    # make sure `lst_cols` is list-alike
    if (lst_cols is not None
        and len(lst_cols) > 0
        and not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)
    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()
    # preserve original index values    
    idx = np.repeat(df.index.values, lens)
    # create "exploded" DF
    res = (pd.DataFrame({
                col:np.repeat(df[col].values, lens)
                for col in idx_cols},
                index=idx)
             .assign(**{col:np.concatenate(df.loc[lens>0, col].values)
                            for col in lst_cols}))
    # append those rows that have empty lists
    if (lens == 0).any():
        # at least one list in cells is empty
        res = (res.append(df.loc[lens==0, idx_cols], sort=False)
                  .fillna(fill_value))
    # revert the original index order
    res = res.sort_index()
    # reset index if requested
    if not preserve_index:        
        res = res.reset_index(drop=True)
    return res

In [13]:
df_rep=explode(df,["Topics"], fill_value='', preserve_index=True)
df_rep.shape

(13623, 4)

In [14]:
df_rep.head()

Unnamed: 0,Body,Train_test_c,Train_test_l,Topics
0,showers continued throughout the week in the b...,"""TRAINING-SET""","""TRAIN""",cocoa
1,the u s agriculture department reported the fa...,"""TRAINING-SET""","""TRAIN""",grain
1,the u s agriculture department reported the fa...,"""TRAINING-SET""","""TRAIN""",wheat
1,the u s agriculture department reported the fa...,"""TRAINING-SET""","""TRAIN""",corn
1,the u s agriculture department reported the fa...,"""TRAINING-SET""","""TRAIN""",barley


#### The following code assigns unique numeric labels to each of the topics 

In [15]:
from sklearn.preprocessing import LabelEncoder 
leb=LabelEncoder()
df_rep["Topic_Encoded"]=leb.fit_transform(df_rep['Topics'])

### Test train split of data according to predefined lables

In [16]:
Topic_train_c = df_rep[df_rep['Train_test_c']== '"TRAINING-SET"']["Topic_Encoded"] 
Topic_test_c = df_rep[df_rep['Train_test_c']!= '"TRAINING-SET"']["Topic_Encoded"] 
Body_train_c = df_rep[df_rep['Train_test_c']== '"TRAINING-SET"']["Body"] 
Body_test_c = df_rep[df_rep['Train_test_c']!= '"TRAINING-SET"']["Body"] 

In [17]:
Topic_train_l = df_rep[df_rep['Train_test_l']== '"TRAIN"']["Topic_Encoded"] 
Topic_test_l = df_rep[df_rep['Train_test_l']== '"TEST"']["Topic_Encoded"] 
Body_train_l = df_rep[df_rep['Train_test_l']== '"TRAIN"']["Body"] 
Body_test_l = df_rep[df_rep['Train_test_l']== '"TEST"']["Body"] 

### Test train split of data according to random allocation (80:20)

In [18]:
from sklearn.model_selection import train_test_split

X = df_rep['Body']
y = df_rep['Topic_Encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17)

print("Shape of X is {}".format(X.shape))
print("Shape of X_train is {} and shape of y_train is {}".format(X_train.shape, y_train.shape))
print("Shape of X_test is {} and shape of y_test is {}".format(X_test.shape, y_test.shape))

Shape of X is (13623,)
Shape of X_train is (10217,) and shape of y_train is (10217,)
Shape of X_test is (3406,) and shape of y_test is (3406,)


#### Here we use tfidf vectorizer to give more weightage to  relevant words in the news document

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
vectorizer = TfidfVectorizer()

### Multinomial Naive Bayes
#### The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

In [20]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()

In [21]:
pipeline_bayes = Pipeline([('vectorizer', TfidfVectorizer()),('classifier', naive_bayes)])

### Case1:

#### Topic_train,Body_train and Topic_test,Body_test are respectively training and testing datasets which are already defined as in the Reuters documentation


#### sub case 1: Using CGSPLIT 

In [22]:
pipeline_bayes.fit(Body_train_c, Topic_train_c)
label_predicted = pipeline_bayes.predict(Body_test_c) #Prediction on testing data

In [23]:
#Accuracy of the model using Multinomial Naive Bayes. 

accuracy = accuracy_score(Topic_test_c, label_predicted)
print("Accuracy: ", accuracy)

Accuracy:  0.4123422159887798


#### sub case 2: Using LWISSPLIT

In [24]:
pipeline_bayes.fit(Body_train_l, Topic_train_l)
label_predicted = pipeline_bayes.predict(Body_test_l) #Prediction on testing data

In [25]:
accuracy = accuracy_score(Topic_test_l, label_predicted)
print("Accuracy: ", accuracy)

Accuracy:  0.3229780396357793


### Case 2:

#### X_train,y_train and X_test,y_test are respectively training and testing datasets which are generated random allocation (80:20)

In [26]:
pipeline_bayes.fit(X_train, y_train) #Fitting the multinomial model to training data.
label_predicted = pipeline_bayes.predict(X_test) #testing on the test data.

In [27]:
#Accuracy using multinomial naive bayes

accuracy = accuracy_score(y_test, label_predicted)
print("Accuracy: ", accuracy)

Accuracy:  0.3884321785085144


### Complement Naive Bayes
#### The Complement Naive Bayes classifier was designed to correct the “severe assumptions” made by the standard Multinomial Naive Bayes classifier. It is particularly suited for imbalanced data sets.

In [28]:
from sklearn.naive_bayes import ComplementNB
naive_bayes_c = ComplementNB()

In [29]:
pipeline_bayes_c = Pipeline([('vectorizer', TfidfVectorizer()),('classifier', naive_bayes_c)])

### Case 1:

#### Topic_train,Body_train and Topic_test,Body_test are respectively training and testing datasets which are already defined as in the Reuters documentation


#### sub case 1: Using CGSPLIT

In [30]:
pipeline_bayes_c.fit(Body_train_c, Topic_train_c)
label_predicted = pipeline_bayes_c.predict(Body_test_c) #Prediction on testing data

In [31]:
#Accuracy of the model using Multinomial Naive Bayes. 

accuracy = accuracy_score(Topic_test_c, label_predicted)
print("Accuracy: ", accuracy)

Accuracy:  0.6409537166900421


#### sub case 2:Using LEWISSPLIT

In [32]:
pipeline_bayes_c.fit(Body_train_l, Topic_train_l)
label_predicted = pipeline_bayes_c.predict(Body_test_l) #Prediction on testing data

In [33]:
#Accuracy of the model using Multinomial Naive Bayes. 

accuracy = accuracy_score(Topic_test_l, label_predicted)
print("Accuracy: ", accuracy)

Accuracy:  0.6116764863417247


### Case 2

#### X_train,y_train and X_test,y_test are respectively training and testing datasets which are generated random allocation (80:20)

In [34]:
pipeline_bayes_c.fit(X_train, y_train) #Fitting the multinomial model to training data.
label_predicted = pipeline_bayes_c.predict(X_test) #testing on the test data.

In [35]:
#Accuracy using multinomial naive bayes

accuracy = accuracy_score(y_test, label_predicted)
print("Accuracy: ", accuracy)

Accuracy:  0.4741632413388139


### skmultilearn transform multi-label problem to a multi-class problem.Label Powerset is a problem transformation approach to multi-label classification that transforms a multi-label problem to a multi-class problem with 1 multi-class classifier trained on all unique label combinations found in the training data.

### Transform between iterable of iterables and a multilabel format.Although a list of sets or tuples is a very intuitive format for multilabel data, it is unwieldy to process. This transformer converts between this intuitive format and the supported multilabel format: a (samples x classes) binary matrix indicating the presence of a class label.

In [36]:
from skmultilearn.problem_transform import LabelPowerset
from sklearn.preprocessing import MultiLabelBinarizer

In [37]:
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['Topics'])

In [38]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB

X_train, X_test, y_train, y_test = train_test_split(df['Body'] , y , random_state = 42)

tfidf_transformer = TfidfVectorizer(token_pattern = r'\b[^\d\W]+\b')
tfidf_transformer.fit(X_train)
X_train_tfidf=tfidf_transformer.transform(X_train)

### Fitting of complement Naive Bayes model with labelpowerset

In [39]:
classifier = LabelPowerset(ComplementNB())
classifier.fit(X_train_tfidf,y_train)

LabelPowerset(classifier=ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False),
       require_dense=[True, True])

### Fitting of Multinomial Naive Bayes model with labelpowerset

In [40]:
classifier_1 = LabelPowerset(MultinomialNB())
classifier_1.fit(X_train_tfidf,y_train)

LabelPowerset(classifier=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       require_dense=[True, True])

In [41]:
X_test_tfidf=tfidf_transformer.transform(X_test)

### Prediction with complement NB

In [42]:
pred = classifier.predict(X_test_tfidf)

### Prediction with Multinomial NB

In [43]:
pred_1 = classifier_1.predict(X_test_tfidf)

In [44]:
from sklearn.metrics import accuracy_score

### Accuracy score of Complement NB

In [45]:
print(accuracy_score(y_test,pred.toarray()))

0.5447816432272391


### Accuracy score of Multinomial NB

In [46]:
print(accuracy_score(y_test,pred_1.toarray()))

0.4585492227979275


### Prediction of multiple class for complement NB

In [47]:
mlb.inverse_transform(pred.toarray())

[('coffee',),
 ('acq',),
 ('dlr', 'money-fx'),
 ('earn',),
 ('dlr', 'money-fx'),
 ('earn',),
 ('acq',),
 ('acq',),
 ('sugar',),
 ('earn',),
 ('acq',),
 ('acq',),
 ('earn',),
 ('acq',),
 ('earn',),
 ('earn',),
 ('earn',),
 ('interest',),
 ('earn',),
 ('acq',),
 ('earn',),
 ('earn',),
 ('acq',),
 ('corn', 'grain', 'oilseed', 'soybean', 'wheat'),
 ('earn',),
 ('interest', 'money-fx'),
 ('acq',),
 ('acq',),
 ('gnp',),
 ('earn',),
 ('earn',),
 ('acq',),
 ('acq',),
 ('acq',),
 ('earn',),
 ('acq',),
 ('earn',),
 ('acq',),
 ('acq',),
 ('sugar',),
 ('acq',),
 ('dlr', 'money-fx'),
 ('earn',),
 ('earn',),
 ('earn',),
 ('interest',),
 ('crude', 'ship'),
 ('earn',),
 ('earn',),
 ('earn',),
 ('cotton',),
 ('earn',),
 ('acq',),
 ('acq',),
 ('earn',),
 ('acq',),
 ('earn',),
 ('crude',),
 ('gold', 'reserves'),
 ('earn',),
 ('interest', 'money-fx'),
 ('copper',),
 ('acq',),
 ('earn',),
 ('money-fx',),
 ('acq',),
 ('earn',),
 ('earn',),
 ('money-fx',),
 ('crude',),
 ('acq',),
 ('veg-oil',),
 ('earn',),
 

### Prediction of multiple class for Multinomial NB

In [48]:
mlb.inverse_transform(pred_1.toarray())

[('earn',),
 ('acq',),
 ('earn',),
 ('earn',),
 ('earn',),
 ('earn',),
 ('acq',),
 ('acq',),
 ('acq',),
 ('earn',),
 ('acq',),
 ('acq',),
 ('earn',),
 ('acq',),
 ('earn',),
 ('earn',),
 ('earn',),
 ('earn',),
 ('earn',),
 ('acq',),
 ('earn',),
 ('earn',),
 ('acq',),
 ('earn',),
 ('earn',),
 ('earn',),
 ('acq',),
 ('acq',),
 ('earn',),
 ('earn',),
 ('earn',),
 ('acq',),
 ('acq',),
 ('earn',),
 ('earn',),
 ('acq',),
 ('earn',),
 ('acq',),
 ('acq',),
 ('earn',),
 ('acq',),
 ('earn',),
 ('earn',),
 ('earn',),
 ('earn',),
 ('earn',),
 ('earn',),
 ('earn',),
 ('earn',),
 ('earn',),
 ('earn',),
 ('earn',),
 ('acq',),
 ('acq',),
 ('earn',),
 ('acq',),
 ('earn',),
 ('acq',),
 ('earn',),
 ('earn',),
 ('earn',),
 ('earn',),
 ('acq',),
 ('earn',),
 ('earn',),
 ('acq',),
 ('earn',),
 ('earn',),
 ('acq',),
 ('earn',),
 ('acq',),
 ('earn',),
 ('earn',),
 ('earn',),
 ('acq',),
 ('earn',),
 ('acq',),
 ('earn',),
 ('earn',),
 ('earn',),
 ('acq',),
 ('earn',),
 ('acq',),
 ('acq',),
 ('earn',),
 ('acq',),