In [2]:
##importing necessary libraries and the train data
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import utils
from matplotlib import pyplot as plt
import seaborn as sns
import  time
import warnings
warnings.filterwarnings(action = 'ignore')
import gensim
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
nltk.download('brown')
nltk.download('universal_tagset')
from nltk.corpus import brown
brown_tagged =brown.tagged_sents( tagset='universal')


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\bhavana\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\bhavana\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


# Words and tags for train data

In [3]:
brown_tagged_words=[]
brown_tagged_tags=[]
for sentence in brown_tagged:
    word_sentence=[]
    tag_sentence=[]
    for words in sentence:
        word_sentence.append(words[0])
        tag_sentence.append(words[1])
    brown_tagged_words.append(word_sentence)
    brown_tagged_tags.append(tag_sentence)


# Tokenizing

In [4]:
##encoding the words and tags to unique numbers because nn will work with numbers
'''det =5
   noun=1
   adv = 7
   verb =2
   adp=4
   adj=6
   .=3
   conj=9
   pron=8
   prt=10
   num=11
   x=12'''
word_tokenizer = Tokenizer()                      
word_tokenizer.fit_on_texts(brown_tagged_words)                    
brown_tagged_words_encoded = word_tokenizer.texts_to_sequences(brown_tagged_words)

tag_tokenizer = Tokenizer()                      
tag_tokenizer.fit_on_texts(brown_tagged_tags)                    
brown_tagged_tags_encoded = tag_tokenizer.texts_to_sequences(brown_tagged_tags)

In [5]:
dict={5:"DET",1:"NOUN",7:"ADV" ,2:"VERB",4:"ADP",6:"ADJ",3:".",9:"CONJ",8:"PRON" ,10:"PRT",11:"NUM",12:"X"}
   

# Padding

In [6]:
## we will pad now
lengths = [len(seq) for seq in brown_tagged_words_encoded]
print("Length of longest sentence: {}".format(max(lengths)))

MAX_SEQ_LENGTH = 180  
brown_tagged_words_padded = pad_sequences(brown_tagged_words_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")
brown_tagged_tags_padded = pad_sequences(brown_tagged_tags_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")

Length of longest sentence: 180


Implementing word2vector 

In [7]:
path = "GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin"
word2vec= KeyedVectors.load_word2vec_format(path, binary=True)
EMBEDDING_SIZE  = 300 
VOCABULARY_SIZE = len(word_tokenizer.word_index) + 1
embedding_weights = np.zeros((VOCABULARY_SIZE, EMBEDDING_SIZE))
word2id = word_tokenizer.word_index
for word, index in word2id.items():
    try:
        embedding_weights[index, :] = word2vec[word]
    except KeyError:
        pass

In [8]:
print(brown_tagged_tags_padded.shape)

(57340, 180)


One-hot representation for tags since it has less no. of tags

In [9]:
from tensorflow.keras.utils import to_categorical
brown_tagged_vect= to_categorical(brown_tagged_tags_padded)
print(brown_tagged_vect.shape)

(57340, 180, 13)


# Model Implementation

In [10]:
from keras import models
from keras import layers
from tensorflow.keras.layers import Embedding
import numpy as np
import tensorflow as tf
from sklearn.metrics import confusion_matrix
print(len(brown_tagged_words))
len_brown_data=len(brown_tagged_words_padded)/5
avg_loss=0.0
avg_acc=0.0
confusion_mat=[]
per_pos_acc=[]
Y_pred=[]
Y_true=[]
for i in range(5):
    first_part=int(i*(len_brown_data))
    second_part=int((i+1)*len_brown_data)
    X_train=np.concatenate((brown_tagged_words_padded[:first_part],brown_tagged_words_padded[second_part:]),axis=0)
    X_test =brown_tagged_words_padded[first_part:second_part]
    Y_train=np.concatenate((brown_tagged_vect[:first_part],brown_tagged_vect[second_part:]),axis=0)
    Y_test =brown_tagged_vect[first_part:second_part]
    Y_test_enc=brown_tagged_tags_encoded[first_part:second_part]
    X_train,X_validation=train_test_split(X_train,train_size=0.80,test_size=0.20,random_state = 50)
    Y_train,Y_validation=train_test_split(Y_train,train_size=0.80,test_size=0.20,random_state = 50)
    print(X_train.shape,Y_train.shape)
    network = models.Sequential() ## starting the ffnn
    network.add(Embedding(input_dim     = VOCABULARY_SIZE,
                             output_dim    = EMBEDDING_SIZE,
                             input_length  = MAX_SEQ_LENGTH,
                             weights       = [embedding_weights],
                             trainable     = False))
    network.add(layers.Dense(units=100, activation='ReLU'))
    network.add(layers.Dense(units=13, activation='softmax'))
    network.compile(loss='categorical_crossentropy', # Cross-entropy
                optimizer='rmsprop', # Root Mean Square Propagation
                metrics=['accuracy'])
    trainng = network.fit(X_train,Y_train,batch_size=128,epochs=3,validation_data=(X_validation, Y_validation))
    loss,accuracy = network.evaluate(X_test,Y_test,verbose=1)
    avg_loss=avg_loss+loss
    avg_acc=avg_acc+accuracy
    pred=network.predict(X_test)
    indices=np.argmax(pred, axis=2)
    #not_zero=indices[indices != 0]
    Y_pred=indices 
    Y_true=brown_tagged_tags_padded[first_part:second_part]

57340
(36697, 180) (36697, 180, 13)
Epoch 1/3
Epoch 2/3
Epoch 3/3
(36697, 180) (36697, 180, 13)
Epoch 1/3
Epoch 2/3
Epoch 3/3
(36697, 180) (36697, 180, 13)
Epoch 1/3
Epoch 2/3
Epoch 3/3
(36697, 180) (36697, 180, 13)
Epoch 1/3
Epoch 2/3
Epoch 3/3
(36697, 180) (36697, 180, 13)
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [11]:
network.save("my_model_1")
network.summary()



INFO:tensorflow:Assets written to: my_model_1\assets


INFO:tensorflow:Assets written to: my_model_1\assets


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 180, 300)          14944800  
                                                                 
 dense_8 (Dense)             (None, 180, 100)          30100     
                                                                 
 dense_9 (Dense)             (None, 180, 13)           1313      
                                                                 
Total params: 14,976,213
Trainable params: 31,413
Non-trainable params: 14,944,800
_________________________________________________________________


In [14]:
sentence=input()

from gensim import models
import tensorflow as tf
def sen_vec(sent):
    words=sent.split(" ")
    l = []
    for i in range(len(words)):
        x=0
        for j in range(len(brown_tagged_words_encoded)):
            for k in range(len(brown_tagged_words_encoded[j])):
                if(x==0 and words[i]==brown_tagged_words[j][k]):
                    l.append(int(brown_tagged_words_encoded[j][k]))
                    x=1
                    break
            if(x==1):
                break
        if(x==0):
            sim_word = Word2Vec.most_similar(positive=words[i],topn=10)[:][0]
            while(x==0):
                for j in range(len(brown_tagged_words_encoded)):
                    for k in range(len(brown_tagged_words_encoded[j])):
                        if(x==0 and words[i]==brown_tagged_words[j][k]):
                            l.append(int(brown_tagged_words_encoded[j][k]))
                            x=1
                            break
                    if(x==1):
                        break
                
                            
                        
                
    t=tf.convert_to_tensor(l)
    t=tf.reshape(t, [1, t.shape[0]])
    y = pad_sequences(t, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")
    return y

he is a boy


In [15]:
from tensorflow import keras
l=[]
l=sen_vec(sentence)
#print(l.shape,l)
dict={5:"DET",1:"NOUN",7:"ADV" ,2:"VERB",4:"ADP",6:"ADJ",3:".",9:"CONJ",8:"PRON" ,10:"PRT",11:"NUM",12:"X"}
model1 = keras.models.load_model('my_model_1')
pred=model1.predict(l)
ans=[]
x=sentence.split(" ")
start=MAX_SEQ_LENGTH-len(x)
print(pred.shape , start, MAX_SEQ_LENGTH)
for i in range(start,MAX_SEQ_LENGTH):
    indices = tf.argmax(pred[0][i])
    c = tf.keras.backend.eval(indices)
    #print(c)
    if c==0:
        print("***")
    else:
        print(dict[c])
    #ans.append(dict[c])
#print(ans)


(1, 180, 13) 176 180
PRON
VERB
***
NOUN


In [24]:
print(Y_true.shape,Y_pred.shape)
Y_true=np.reshape(Y_true, (2064240,1))
Y_pred=np.reshape(Y_pred, (2064240,1))
cm=confusion_matrix(Y_true, Y_pred)
print(cm.shape)

print(cm[1:,1:])


(11468, 180) (11468, 180)
(13, 13)
[[28781  1050     0    35     0   588   156    18     0     2   166     7]
 [ 1264 30368     0    64     1   262   153     0     0    53     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0     0]
 [   10     6     0 12871     0    53   201     0    22   315     0     0]
 [    3     0     0   569 15733     0     0     6    17     0     2     0]
 [  353   176     0    11     1  8337   498     0     0     3     0     0]
 [   97    48     0   526    93   536  8618     0    19   307     0     0]
 [   15     4     0   232   570     3     1 12939     0     0     0     0]
 [    0     0     0     3    12     0     0     0  1379     0     0     0]
 [  123    37     0   420     2    38   101     1     0  3021     0     0]
 [    0     0     0     0     0     0     0     0     0     0   986     0]
 [   75    10     0     2     0     5     3     0     0     0     0    12]]


In [34]:
tagset=["NOUN","VERB",".","ADP","DET","ADJ","ADV" ,"PRON" ,"CONJ","PRT","NUM","X"]
conf_df = pd.DataFrame(cm[1:,1:], columns = list(tagset),index=list(tagset))
display(conf_df)

Unnamed: 0,NOUN,VERB,.,ADP,DET,ADJ,ADV,PRON,CONJ,PRT,NUM,X
NOUN,28781,1050,0,35,0,588,156,18,0,2,166,7
VERB,1264,30368,0,64,1,262,153,0,0,53,0,0
.,0,0,0,0,0,0,0,0,0,0,0,0
ADP,10,6,0,12871,0,53,201,0,22,315,0,0
DET,3,0,0,569,15733,0,0,6,17,0,2,0
ADJ,353,176,0,11,1,8337,498,0,0,3,0,0
ADV,97,48,0,526,93,536,8618,0,19,307,0,0
PRON,15,4,0,232,570,3,1,12939,0,0,0,0
CONJ,0,0,0,3,12,0,0,0,1379,0,0,0
PRT,123,37,0,420,2,38,101,1,0,3021,0,0


In [25]:
from sklearn.metrics import precision_score
prec_score=precision_score(Y_true, Y_pred, average=None)
print(prec_score)

from sklearn.metrics import recall_score
rec_score=recall_score(Y_true, Y_pred, average=None)
print(rec_score)


[0.97455647 0.93685101 0.95801129 0.         0.87361705 0.95862783
 0.8488088  0.88562327 0.99807158 0.95963814 0.81626587 0.85441941
 0.63157895]
[1.         0.84575375 0.9404478  0.         0.69822068 0.78547179
 0.84596651 0.83678027 0.93869704 0.23083361 0.46384155 0.91977612
 0.06629834]


In [35]:
from sklearn.metrics import classification_report
print(classification_report(np.array(Y_true),np.array(Y_pred),digits=4))

              precision    recall  f1-score   support

           0     0.9746    1.0000    0.9871   1882694
           1     0.9369    0.8458    0.8890     34030
           2     0.9580    0.9404    0.9491     32291
           3     0.0000    0.0000    0.0000     29083
           4     0.8736    0.6982    0.7761     18434
           5     0.9586    0.7855    0.8635     20030
           6     0.8488    0.8460    0.8474      9855
           7     0.8856    0.8368    0.8605     10299
           8     0.9981    0.9387    0.9675     13784
           9     0.9596    0.2308    0.3721      5974
          10     0.8163    0.4638    0.5915      6513
          11     0.8544    0.9198    0.8859      1072
          12     0.6316    0.0663    0.1200       181

    accuracy                         0.9717   2064240
   macro avg     0.8228    0.6594    0.7008   2064240
weighted avg     0.9574    0.9717    0.9633   2064240



In [26]:
def FValues(precision , recall , n):
  # return (1+n*n)*prec*recall
  fval={}
  for i in range(13):
    fval[i]=(1+n*n)*(precision[i]*recall[i])/((n*n)*(precision[i])+recall[i])
    # fval[i]=2*precision[i]*recall[i]/(precision[i]+recall[i])
  return fval

f1=FValues(prec_score,rec_score,1)
f2=FValues(prec_score,rec_score,2)
f05=FValues(prec_score,rec_score,0.5)



print(f1)
print(f2)
print(f05)

prec_score_overall=precision_score(Y_true, Y_pred, average='weighted')
rec_score_overall=recall_score(Y_true, Y_pred, average='weighted')
print(prec_score_overall,rec_score_overall)
f_1=(1+1*1)*(prec_score_overall*rec_score_overall)/((1*1)*(prec_score_overall)+rec_score_overall)
f_2=(1+2*2)*(prec_score_overall*rec_score_overall)/((2*2)*(prec_score_overall)+rec_score_overall)
f_5=(1+0.5*0.5)*(prec_score_overall*rec_score_overall)/((0.5*0.5)*(prec_score_overall)+rec_score_overall)
print(f_1,f_2,f_5)
print(prec_score)
print(rec_score)


{0: 0.9871143081172807, 1: 0.8889746876496116, 2: 0.949148304422566, 3: nan, 4: 0.7761329031869026, 5: 0.8634542560781515, 6: 0.847385272145144, 7: 0.8605092361457813, 8: 0.9674742036787797, 9: 0.3721495074888679, 10: 0.5915410221264931, 11: 0.8858939802336029, 12: 0.11999999999999998}
{0: 0.9948055628973067, 1: 0.862527795925462, 2: 0.9439087919534014, 3: nan, 4: 0.7274299472131481, 5: 0.8149111175568723, 6: 0.84653344705739, 7: 0.8461130637972001, 8: 0.9500000000000001, 9: 0.2721746338767615, 10: 0.5076798978254294, 11: 0.9059169423006246, 12: 0.08075370121130553}
{0: 0.9795410694726642, 1: 0.9170947143021017, 2: 0.9544463092521702, 3: nan, 4: 0.8318253496367913, 5: 0.9181470155699245, 6: 0.8482388132592638, 7: 0.8754037746581883, 8: 0.9856032906764169, 9: 0.5882102030370243, 10: 0.7085893887507623, 11: 0.8667369901547116, 12: 0.23346303501945526}
0.9573675130805994 0.9716597876215944
0.9644607042078566 0.9687672958731742 0.9601922324263614
[0.97455647 0.93685101 0.95801129 0.       