# CS561/571 Artificial Intelligence
## Mid Semester Assignment

### Multinomial Model of Naive Bayes Classifier

<table style="font-size:15px">
    <thead>
        <td><b>Name of Student</b></td>
        <td><b>Roll No.</b></td>
        <td><b>Date</b></td>
    </thead>
    <tr>
        <td>M. Maheeth Reddy</td>
        <td>1801CS31</td>
        <td>22-Sep-2021</td>
    </tr>
</table>

**NOTE**: Before running this notebook, Please execute the following commands in Python shell if you have not performed them before
```python3
nltk.download('stopwords')
nltk.download('wordnet')
```

In [1]:
# Importing Libraries
import math
import re

from collections import Counter
# Please perform above mentioned commands before running this notebook
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Function to pre-process the dataset
# basically removing punctuation marks 
# and converting text to lower case
def preprocess_data(text):
    return re.sub(r'[^\w\s]', '', text).lower()
    
# Function to read, pre-process and 
# identify unique labels in the data
def read_dataset(dataset_name):
    with open(dataset_name, 'r') as dataset:
        lines = dataset.readlines()
        lines = [line.rstrip() for line in lines]

        data, labels = [], []
        for line in lines:
            label_data = line.split('->')
            labels.append(label_data[0])
            data.append(preprocess_data(label_data[1]))
        
    return data, labels

data, labels = read_dataset('dataset_file')
# Print the labels in the dataset 
# and number of items in the dataset
print('We have these labels in the dataset: ', ', '.join(set(labels)))
print(f'There are {len(data)} items in the dataset')

We have these labels in the dataset:  DL, CV
There are 3 items in the dataset


In [3]:
# Python Class to implement Multinomial Naive Bayes Classifier
class Classifier():
    # Constructor function
    def __init__(self, dl_data, cv_data):
        # count of words in DL class
        self.dl_word_count = self.get_counts(dl_data)

        # count of words in CV class
        self.cv_word_count = self.get_counts(cv_data)
        
        # Total number of features in the DL class
        self.dl_total_words = sum([value for _,value in self.dl_word_count.items()])
        
        # Total number of features in the CV class
        self.cv_total_words = sum([value for _,value in self.cv_word_count.items()])
        
        # Total number of words accross all documents
        self.words_count = len(self.dl_word_count) + len(self.cv_word_count)
        
        # Total number of DL documents
        self.dl_doc_count = len(dl_data)
        
        # Total number of CV documents
        self.cv_doc_count = len(cv_data)
        
        # Total number of all features
        self.doc_count = self.dl_doc_count + self.cv_doc_count

        # Setting the a priori class probablities
        self.prior_prob_dl = math.log(self.dl_doc_count/self.doc_count)
        self.prior_prob_cv = math.log(self.cv_doc_count/self.doc_count)

    # get counts of each word from data of a class
    def get_counts(self, class_data):
        wordnet_lemmatizer = WordNetLemmatizer()
        stopwordset = set(stopwords.words('english'))
        
        class_word_count = Counter()
        for class_texts in class_data:
            class_words = []
            for word in class_texts.split(' '):
                if word not in stopwordset and word != '':
                    lemmatized = wordnet_lemmatizer.lemmatize(word)
                    class_words.append(lemmatized)
            
            for word in class_words:
                class_word_count[word] += 1
        
        return class_word_count
        
    # Creates the feature vector for DL and CV class
    def train(self):
        self.features = {}
        self.features['dl_features'] = {}
        self.features['cv_features'] = {}

        # Probablity of each word in DL class
        for word, count in self.dl_word_count.items():
            prob = (count + 1)/(self.dl_total_words + self.words_count)
            feat_val = math.log(prob)
            self.features['dl_features'][word] = feat_val
        
        # Probablity of each word in CV class
        for word, count in self.cv_word_count.items():
            prob = (count + 1)/(self.cv_total_words + self.words_count)
            feat_val = math.log(prob)
            self.features['cv_features'][word] = feat_val

    # predicts class for a test document
    def test(self, document):
        wordnet_lemmatizer = WordNetLemmatizer()
        stops = set(stopwords.words('english'))
        
        document_ = []
        for x in document.split(" "):
            if x not in stops and x != '':
                lemmatize = wordnet_lemmatizer.lemmatize(x)
                document_.append(lemmatize)
        document = document_
        
        dl_val = self.prior_prob_dl
        cv_val = self.prior_prob_cv

        # Initializing the smoothing probabilites
        smooth_dl = math.log(1/(self.dl_total_words + self.doc_count))
        smooth_cv = math.log(1/(self.cv_total_words + self.doc_count))

        # Updating the scores for DL class
        for word in document:
            if word in self.features['dl_features']:
                dl_val += self.features['dl_features'][word]
            elif word in self.features['cv_features']:
                dl_val += smooth_dl
        
        # Updating the scores for CV class
        for word in document:
            if word in self.features['cv_features']:
                cv_val += self.features['cv_features'][word]
            elif word in self.features['dl_features']:
                cv_val += smooth_cv
        
        return ("DL", dl_val) if dl_val >= cv_val else ("CV", cv_val)

In [4]:
# Splitting data into dl and cv
def split_dataset(data, labels):
    dl_data = []
    cv_data = []
    for i in range(len(data)):
        if labels[i] == "DL":
            dl_data.append(data[i])
        elif labels[i] == "CV":
            cv_data.append(data[i])
        else:
            print("Labels have not been extracted properly")
    
    assert len(dl_data) + len(cv_data) == len(data)
    return dl_data, cv_data

# split the dataset into DL class and CV class
dl_data, cv_data = split_dataset(data, labels)

# create the classifier and train
classifier = Classifier(dl_data, cv_data)
classifier.train()

In [5]:
# predict the class for test document
D4 = "Deep learning based computer vision methods have been used for facial recognition."
pred_label, _ = classifier.test(D4)
print(f"Document D4 has been classified into Class {pred_label}")

Document D4 has been classified into Class CV


In [6]:
# print the feature values for DL Class
print("Feature Values for DL Class\n")
for word,value in classifier.features['dl_features'].items():
    print(f"{word}: {value}")


Feature Values for DL Class

recent: -3.5553480614894135
year: -3.5553480614894135
researcher: -3.5553480614894135
computer: -3.5553480614894135
vision: -3.5553480614894135
proposed: -3.5553480614894135
many: -3.5553480614894135
deep: -2.8622008809294686
learning: -3.1498829533812494
dl: -3.5553480614894135
method: -3.1498829533812494
various: -3.5553480614894135
task: -3.5553480614894135
facial: -3.5553480614894135
recognition: -3.5553480614894135
fr: -3.1498829533812494
made: -3.5553480614894135
enormous: -3.5553480614894135
leap: -3.5553480614894135
using: -3.5553480614894135
technique: -3.5553480614894135
system: -3.5553480614894135
benefit: -3.5553480614894135
hierarchical: -3.5553480614894135
architecture: -3.5553480614894135
learn: -3.5553480614894135
discriminative: -3.5553480614894135
face: -3.5553480614894135
representation: -3.5553480614894135


In [7]:
# print the feature values for CV Class
print("Feature Values for CV Class\n")
for word,value in classifier.features['cv_features'].items():
    print(f"{word}: {value}")

Feature Values for CV Class

computer: -3.068052935133617
vision: -3.068052935133617
method: -3.068052935133617
widely: -3.068052935133617
used: -3.068052935133617
facial: -3.068052935133617
recognition: -3.068052935133617
