<a href="https://colab.research.google.com/github/RitvikVankayala/NLP/blob/main/TextClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [61]:
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt

File ‘edgar_allan_poe.txt’ already there; not retrieving.

File ‘robert_frost.txt’ already there; not retrieving.



In [62]:
import numpy as np
import matplotlib.pyplot as plt
import string 
from sklearn.model_selection import train_test_split

In [63]:
ip_files=['edgar_allan_poe.txt','robert_frost.txt']

In [64]:
!head edgar_allan_poe.txt

LO! Death hath rear'd himself a throne
In a strange city, all alone,
Far down within the dim west
Where the good, and the bad, and the worst, and the best,
Have gone to their eternal rest.
 
There shrines, and palaces, and towers
Are not like any thing of ours
Oh no! O no! ours never loom
To heaven with that ungodly gloom!


In [65]:
!head robert_frost.txt

Two roads diverged in a yellow wood,
And sorry I could not travel both
And be one traveler, long I stood
And looked down one as far as I could
To where it bent in the undergrowth; 

Then took the other, as just as fair,
And having perhaps the better claim
Because it was grassy and wanted wear,
Though as for that the passing there


In [66]:
# collecting data into lists
input_texts=[]
labels=[]

In [67]:
for label,f in enumerate(ip_files):
  print(f"{f} corresponds to label {label}")
# open(f) will give the info about the poem with lines
# while using open in py we will get an extra new line char to remove it we will use rstrip()
  for line in open(f):
    line=line.rstrip().lower()
    if line:
      #remove punctuation
      line=line.translate(str.maketrans('','',string.punctuation))

      input_texts.append(line)
      labels.append(label)

edgar_allan_poe.txt corresponds to label 0
robert_frost.txt corresponds to label 1


In [68]:
train_text,test_text,Ytrain,Ytest=train_test_split(input_texts,labels)

In [69]:
len(Ytrain),len(Ytest)

(1615, 539)

In [70]:
train_text[:5]

['it took the hall door for the novelty',
 'to where it bent in the undergrowth',
 'when all who would come seeking in new hampshire',
 'once fair and stately palace ',
 'the surest thing there is is we are riders']

In [71]:
Ytrain[:5]

[1, 1, 1, 0, 1]

In [72]:
idx=1
word2idx={'<unk>':0}

In [73]:
# populate the word2idx

for text in train_text:
  tokens=text.split()
  for token in tokens:
    if token not in word2idx:
      word2idx[token]=idx
      idx+=1

In [74]:
word2idx

{'<unk>': 0,
 'it': 1,
 'took': 2,
 'the': 3,
 'hall': 4,
 'door': 5,
 'for': 6,
 'novelty': 7,
 'to': 8,
 'where': 9,
 'bent': 10,
 'in': 11,
 'undergrowth': 12,
 'when': 13,
 'all': 14,
 'who': 15,
 'would': 16,
 'come': 17,
 'seeking': 18,
 'new': 19,
 'hampshire': 20,
 'once': 21,
 'fair': 22,
 'and': 23,
 'stately': 24,
 'palace': 25,
 'surest': 26,
 'thing': 27,
 'there': 28,
 'is': 29,
 'we': 30,
 'are': 31,
 'riders': 32,
 'got': 33,
 'them': 34,
 'off': 35,
 'wild': 36,
 'flowers': 37,
 'backs': 38,
 'but': 39,
 'burden': 40,
 'of': 41,
 'its': 42,
 'body': 43,
 'song': 44,
 'among': 45,
 'unearthed': 46,
 'potatoes': 47,
 'standing': 48,
 'still': 49,
 'nothings': 50,
 'too': 51,
 'good': 52,
 'say': 53,
 'pays': 54,
 'cross': 55,
 'lots': 56,
 'walls': 57,
 'everything': 58,
 'having': 59,
 'interfered': 60,
 'huse': 61,
 'business': 62,
 'then': 63,
 'up': 64,
 'i': 65,
 'think': 66,
 'ill': 67,
 'get': 68,
 'away': 69,
 'fight': 70,
 'a': 71,
 'smother': 72,
 'hog': 73,
 '

In [75]:
len(word2idx)

2526

In [76]:
# converting text to numbers

train_text_int=[]
test_text_int=[]

for text in train_text:
  tokens=text.split()
  line_as_int=[word2idx[token] for token in tokens]
  train_text_int.append(line_as_int)

for text in test_text:
  tokens=text.split()
  line_as_int=[word2idx.get(token,0) for token in tokens]
  test_text_int.append(line_as_int)

In [77]:
train_text_int[100:105]

[[15, 381, 14, 382, 90, 383, 384, 385],
 [23, 386, 11, 387, 23, 388, 389],
 [292, 390, 391, 392, 11, 3, 393, 142],
 [41, 14, 8, 337, 394, 395, 29, 3, 287],
 [396, 269, 23, 397, 198, 292, 398, 399]]

In [78]:
# now we have to generate the A and Pie matixes for the prediction
# for add ones smoothing
V=len(word2idx)

A0=np.ones((V,V))
pi0=np.ones(V)

A1=np.ones((V,V))
pi1=np.ones(V)

In [79]:
# compute the counts for the A and pi

def compute_counts(text_as_int,A,pi):
  for tokens in text_as_int:

    # for finding whether a word is in first place or not
    last_idx=None

    for idx in tokens:

      # if the word is in the first position we will populate the pi 
      if last_idx is None:
        pi[idx]+=1
      else:

        # or else we will populate A matrix
        A[last_idx,idx]+=1

      #updating the last_idx
      last_idx=idx


In [80]:
compute_counts([t for t,y in zip(train_text_int,Ytrain) if y==0],A0,pi0)
compute_counts([t for t,y in zip(train_text_int,Ytrain) if y==1],A1,pi1)

In [81]:
# converting count into the probabilities

A0/=A0.sum(axis=1,keepdims=True)
pi0/=pi0.sum()

A1/=A1.sum(axis=1,keepdims=True)
pi1/=pi1.sum()


In [82]:
# converting them to logarithamic values

logA0=np.log(A0)
logpi0=np.log(pi0)

logA1=np.log(A1)
logpi1=np.log(pi1)

In [83]:
#  computing the probability that a given word is of label 0 or 1

count0=sum(y==0 for y in Ytrain)
count1=sum(y==1 for y in Ytrain)
total=len(Ytrain)
p0=count0/total
p1=count1/total

logp0=np.log(p0)
logp1=np.log(p1)

p0,p1

(0.3331269349845201, 0.6668730650154798)

In [84]:
# Building classifier

class Classifier:
  def __init__(self,logAs,logPis,logPriors):
    self.logAs=logAs
    self.logPis=logPis
    self.logPriors=logPriors
    # for the number of classes
    self.K=len(logPriors)

  def _compute_log_likelihood(self,input_,class_):
     logA=self.logAs[class_]
     logPi=self.logPis[class_]

     last_idx=None
     logProb=0

     for idx in input_:

       if last_idx is None:
         logProb+=logPi[idx]

       else:
         logProb+=logA[last_idx,idx]

  # Giving the last idx the present one
       last_idx=idx

     return logProb

  def predict(self,inputs):
    predictions=np.zeros(len(inputs))
    for i,input_ in enumerate(inputs):
      posteriors=[self._compute_log_likelihood(input_,c)+self.logPriors[c]\
                 for c in range(self.K) ]
      pred=np.argmax(posteriors)
      predictions[i]=pred
    return predictions

In [85]:
# initiating a classifier

clf=Classifier([logA0,logA1],[logpi0,logpi1],[logp0,logp1])

In [86]:
Ptrain=clf.predict(train_text_int)
print(f"Train acc:{np.mean(Ptrain==Ytrain)}")

Train acc:0.9969040247678018


In [87]:
Ptest=clf.predict(test_text_int)
print(f"Test acc:{np.mean(Ptest==Ytest)}")

Test acc:0.8330241187384044


In [88]:
# KNOWING THE CONFUSION MATRIX AND THE F_SCORE

from sklearn.metrics import confusion_matrix,f1_score

In [89]:
cm=confusion_matrix(Ytrain,Ptrain)
cm

array([[ 533,    5],
       [   0, 1077]])

In [91]:
cm=confusion_matrix(Ytest,Ptest)
cm

array([[101,  79],
       [ 11, 348]])

In [92]:
f1_score(Ytrain,Ptrain)

0.9976841130152849

In [93]:
f1_score(Ytest,Ptest)

0.8854961832061069