<a href="https://colab.research.google.com/github/Nukaraju2003/Natural-Language-Processing/blob/main/NLP2assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

![img](https://s3.ap-south-1.amazonaws.com/acsess/images/96bb925a-153c-4ff5-bb1f-5f85ba048b2c)

# Naive Bayes implementation
We shall understand an object-oriented Python implementation of this algorithm.

Example case: we want to determine gender from name.

In [None]:
import nltk
nltk.download('names')
from nltk.corpus import names

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!


## Feature Design
Here we plan the features that will be used for gender identification.

### Possible features

* ending character

In [None]:
def gender_features(word):
  """
  Extract the features used to guess the gender of a name.

  The name is stripped of surrounding whitespace and lower-cased first, so
  "Neo" and " neo " produce the same features.

  Returns a dictionary of the form {'feature_name': feature_value}. The only
  feature at the moment is 'last_letter', the final character of the cleaned
  name. An empty or whitespace-only name yields {'last_letter': ''} instead
  of raising IndexError.
  """
  word = word.strip().lower()
  # Slicing (rather than word[-1]) gives '' when nothing is left after cleaning.
  return {'last_letter': word[-1:]}

In [None]:
# Labelled examples: every female name first, then every male name, each
# paired with its extracted features as (feature dict, label).
data = [
  (gender_features(name), gender)
  for gender, fileid in (('female', "female.txt"), ('male', "male.txt"))
  for name in names.words(fileid)
]

In [None]:
import random
# Shuffle in place: `data` was built as all female names followed by all male
# names, so a positional train/test split would otherwise be ordered by label.
# No seed is set, so the resulting order (and the split) differs on every run.
random.shuffle(data)

In [None]:
import math

# The first 90% of the examples (rounded up) are used for training and the
# remainder is held out for testing.
n = len(data)
t_n = math.ceil(0.9 * n)  # index at which the test portion starts
train_data = data[:t_n]
test_data = data[t_n:]

In [None]:
# Sanity check: size of the training split and its first ten (features, label) pairs.
print(len(train_data), train_data[:10])

7150 [({'last_letter': 'l'}, 'male'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'y'}, 'male'), ({'last_letter': 'd'}, 'male'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'e'}, 'male'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'e'}, 'male')]


## Probability add-alpha implementation
The class below converts counts (frequencies) of the observed values into a probability distribution using add-alpha smoothing.

In [None]:
import math
# Stand-in for log(0): a huge negative but finite number, so it can still be
# added to other log-probabilities without producing inf/nan.
_NINF = float('-1e300')

class ProbDist:
  """
  Probability distribution built from counts with add-alpha smoothing:

      P(sample) = (count(sample) + alpha) / (N + bins * alpha)

  where N is the sum of all counts.
  """

  def __init__(self, freqdist, alpha=0.5, bins=None):
    """
    freqdist: dict mapping each observed sample to its count
    alpha:    pseudo-count added to every sample (0 means no smoothing)
    bins:     number of distinct values a sample may take; defaults to the
              number of samples present in freqdist
    """
    self._alpha = alpha
    self._freqdist = freqdist
    n = sum(freqdist.values()) # find total value across all bins
    self._bins = len(freqdist) if bins is None else bins
    self._divisor = n + self._bins*self._alpha

  def __repr__(self):
    # Gives print() something more useful than the default object address.
    return 'ProbDist(samples={}, bins={}, alpha={})'.format(
      len(self._freqdist), self._bins, self._alpha)

  def prob(self, sample):
    """
    Smoothed probability of sample; samples never counted have count 0.
    """
    if self._divisor == 0:
      # No counts and no smoothing mass (e.g. an empty freqdist): nothing
      # is possible, so report 0 instead of dividing by zero.
      return 0.0
    c = self._freqdist.get(sample, 0) + self._alpha
    return c/self._divisor

  def logprob(self, sample):
    """
    Base-2 log of prob(sample), or _NINF when the probability is 0.
    """
    p = self.prob(sample)
    return math.log(p, 2) if p!=0 else _NINF

  def samples(self):
    """
    The samples that have an entry in the underlying counts.
    """
    return self._freqdist.keys()

## Log Softmax
(Optional)
This is copied from NLTK as is. It implements log softmax, i.e. it normalises log scores into log probabilities.

We already have log probabilities, not probabilities.

If we were working with plain values $x_i$, softmax would be $\mathrm{softmax}(x_i) = \dfrac{e^{x_i}}{\sum_j e^{x_j}}$ for every $i$.

For log softmax we need, for each label $i$,
$\log x_i - \log \sum_j x_j$, i.e. its log probability minus the log of the sum of all the probabilities.

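But we only have $\log x_1, \log x_2, \dots, \log x_n$, the $n$ log probabilities for the $n$ labels, and not the values $x_i$ themselves.

So we need to calculate $\log(x_1 + x_2 + \dots + x_n)$ starting from those logs.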

Below is an implementation of the same.

In [None]:
# Adding two numbers that are only available as base-2 logs.
# When one term is smaller than the other by more than this many bits (a
# factor of 1e-30), it is treated as negligible and dropped.
_ADD_LOGS_MAX_DIFF = math.log(1e-30, 2)
def add_logs(logx, logy):
  """
  Return log2(x + y) given logx = log2(x) and logy = log2(y).

  Mathematically this is log2(2**logx + 2**logy). To avoid overflow the
  smaller log is factored out before exponentiating, and a term that is
  negligible next to the other one is skipped altogether.
  (copied from NLTK source code)

  NOTE (from the original author): meant for log-probabilities, i.e. values
  below 1.
  """
  if logx < logy + _ADD_LOGS_MAX_DIFF:
    return logy
  if logy < logx + _ADD_LOGS_MAX_DIFF:
    return logx

  smaller = min(logx, logy)
  rescaled_total = 2**(logx-smaller) + 2**(logy-smaller)
  return smaller + math.log(rescaled_total, 2)

from functools import reduce

def sum_logs(logs):
  """
  Fold a sequence of base-2 logs into the log of the sum of the underlying
  values, using add_logs pairwise. An empty sequence gives _NINF.
  """
  if len(logs) == 0:
    return _NINF
  first, rest = logs[0], logs[1:]
  return reduce(add_logs, rest, first)

class LogSoftmax:
  """
  Normalises a {label: base-2 log score} dict so that the scores become
  log-probabilities, i.e. the underlying probabilities sum to 1.
  """

  def __init__(self, prob_dict):
    """
    prob_dict: non-empty dict mapping each label to its base-2 log score.
               It is copied, so the caller's dict is left untouched.
    """
    assert len(prob_dict)>0, "There must be at least one class"
    # Copy first: normalising in place would silently rewrite the caller's dict.
    self._prob_dict = dict(prob_dict)

    value_sum = sum_logs(list(self._prob_dict.values()))

    if value_sum <= _NINF:
      # Every class has (effectively) zero probability, so there is nothing
      # to rescale; fall back to equal probability for each class.
      uniform = math.log(1.0/len(self._prob_dict), 2)
      for label in self._prob_dict:
        self._prob_dict[label] = uniform
    else:
      # log(p / total) = log(p) - log(total)
      for label in self._prob_dict:
        self._prob_dict[label] -= value_sum

    # (log-probability, label) of the best class; ties go to the larger label.
    self._max = max((value, label) for (label, value) in self._prob_dict.items())


  def probs(self):
    """
    Normalised log-probabilities as a dict ordered from most to least likely.
    """
    return dict(sorted(self._prob_dict.items(), key=lambda item: -item[1]))

  def max(self):
    """
    Label with the highest probability.
    """
    return self._max[1]

In [None]:
from nltk.tbl import feature
class NaiveBayesClassifier:
  """
  Implements a Naive Bayes Classifier

  A class c is scored for a feature set as
      log2 P(c) + sum over features of log2 P(feature value | c)
  where every probability comes from a ProbDist built on training counts.
  """

  def __init__(self, training_dataset, alpha=0.5):
    """
    Trains a naive bayes classifier on the training dataset.

    training_dataset: a list of (feature dict, class label) pairs, where the
                      feature dict maps feature names to their values for
                      that training example.
    alpha:            add-alpha smoothing constant, applied to the class
                      priors and to every per-class feature distribution.
    """
    # do some validation if the format is right
    # now train if valid
    self._train(training_dataset, alpha)

  def _train(self, training_dataset, alpha):
    """
    Count the training data and build the smoothed distributions.

    Nothing is returned; the results are stored on the instance:
    - self._class_probdist   : ProbDist over class labels, P(c)
    - self._feature_probdist : {(label, fname): ProbDist}, P(fval | c)
    - self._labels           : list of the class labels seen in training
    """
    feature_freqdist = {}  # (label, fname) -> {fval: count}
    feature_values = {}    # fname -> set of values seen for that feature
    feature_names = set()  # only needed by the optional step below
    class_freqdist = {}    # label -> number of training examples

    for featureset, label in training_dataset:
      class_freqdist[label] = class_freqdist.get(label, 0) + 1 #P(c)

      for fname, fval in featureset.items():
        # counts of each feature value within each class, for P(fi|c)
        counts = feature_freqdist.setdefault((label, fname), {})
        counts[fval] = counts.get(fval, 0) + 1

        # vocabulary of each feature (shared by all classes)
        feature_values.setdefault(fname, set()).add(fval)

        # all feature names across all training examples
        feature_names.add(fname)

    # handle missing feature values in a class
    # this is optional and may not be needed
    # so commented for now
    # for label in class_freqdist:
    #   num_samples = class_freqdist[label]
    #   for fname in feature_names:
    #     count = sum(feature_freqdist[(label, fname)].values())
    #     if num_samples > count: #case where in my input data set certain samples donot have certain features
    #       feature_freqdist[(label, fname)][None] = num_samples - count
    #       feature_values[fname].add(None)

    # Turn the counts into probability distributions with add-alpha
    # smoothing. alpha must be forwarded explicitly, otherwise ProbDist
    # silently falls back to its own default.
    self._class_probdist = ProbDist(class_freqdist, alpha=alpha)
    self._feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
      # bins = size of the feature's vocabulary over all classes, so values
      # seen only in other classes still receive smoothing mass here
      self._feature_probdist[(label, fname)] = ProbDist(
        freqdist, alpha=alpha, bins=len(feature_values[fname]))
    self._labels = list(self._class_probdist.samples())

    # Debug output, kept because the notebook's recorded cell output shows it.
    print(self._class_probdist)
    print(self._feature_probdist)
    print(self._labels)


  def prob_classify(self, featureset):
    """
    Score every class for the given feature set.

    Returns a dict {label: log2 P(label) + sum of log2 P(fval | label)}.
    The scores are NOT normalised into probabilities (see the note at the
    end of the method). Features never seen during training are ignored.
    """
    # remove unknown features: keep only names seen in training for some class
    known_fnames = {fname for (_, fname) in self._feature_probdist}
    features = {fname: fval for (fname, fval) in featureset.items()
                if fname in known_fnames}

    logprob = {} #for each class I need to calculate these logprob
    for label in self._labels:
      logprob[label] = self._class_probdist.logprob(label) #P(c)
      for (fname, fval) in features.items():
        if (label, fname) in self._feature_probdist:
          logprob[label] += self._feature_probdist[(label, fname)].logprob(fval)
        else:
          # the feature was seen in training, but never for this class
          logprob[label] += _NINF

    # need to convert this into a probability that sums to 1
    # log softmax
    #return LogSoftmax(logprob)
    return logprob

  def classify(self, featureset):
    """
    Return the label with the highest score for the given feature set.
    On a tie the label that comes first in the score dict wins.
    """
    p = self.prob_classify(featureset)
    # Debug output, kept because the notebook's recorded cell output shows it.
    print(p)

    #return self.prob_classify(featureset).max()
    return max(p, key=p.get)

In [None]:
# Train on the training split with the default smoothing (alpha=0.5).
nbc = NaiveBayesClassifier(train_data)

<__main__.ProbDist object at 0x7ff15e6ad310>
{('male', 'last_letter'): <__main__.ProbDist object at 0x7ff15e6adb90>, ('female', 'last_letter'): <__main__.ProbDist object at 0x7ff15e6ad1d0>}
['male', 'female']


In [None]:
# Classify a single name; its features are extracted the same way as for training.
nbc.classify(gender_features("Neo"))

{'male': -5.625622858137834, 'female': -7.877015771643056}


'male'

In [None]:
def test(data):
  """
  Accuracy of the module-level classifier nbc on data, a sequence of
  (feature dict, label) pairs: the fraction whose predicted label equals
  the given label.
  """
  hits = sum(nbc.classify(features) == expected for features, expected in data)
  return hits/len(data)

In [None]:
# Accuracy on the held-out split (fraction of names classified correctly).
test(test_data)

0.7707808564231738