## Extraction and comparison of noun chunks and verbs with manually annotated noun chunks and verbs.

In [None]:
#!pip install textacy
import os
import spacy
##import textacy
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import RegexpParser
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 563 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# Load the dataset.

In [None]:
import pandas as pd
import numpy as np

In [None]:
df1 = pd.read_csv('/content/manually-annotated-scienceDataset1.csv',index_col=False)

In [None]:
df1.head()

Unnamed: 0,no,sentence,n(list),v(list)
0,1,There are six main nutrients in our food : Car...,"There, six main nutrients, our food,Carbohydra...",are
1,2,These nutrients are present in the different f...,"These nutrients, the different food items, we,...","are present, eat"
2,3,Our main need is of energy.,"Our main need , energy",is
3,4,Let us learn more about some of them,"us, some , them","let, learn"
4,5,It is fulfilled by carbohydrates,"It , carbohydrates",is fulfilled


In [None]:
# Rows without any annotated verb are read as NaN; replace them with a string
# placeholder so later string operations on this column do not fail.
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated and does not
# modify df1 when pandas Copy-on-Write is active.
df1['v(list)'] = df1['v(list)'].fillna("not_a_verb")

In [None]:
# Function to extract the verbs
def verbExtractor(text):
    """Return the verb tokens found in *text*, in order of appearance.

    The text is split into sentences and then into word tokens with NLTK;
    each sentence is POS-tagged and every token whose tag is one of
    VB, VBD, VBG, VBN, VBP or VBZ is kept. Tokens are returned exactly as
    tokenized (no lower-casing, no de-duplication).
    """
    verb_tags = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}
    found = []
    for sentence in nltk.sent_tokenize(text):
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
        found.extend(token for token, tag in tagged_tokens if tag in verb_tags)
    return found

In [None]:
# Build, for every sentence in df1['sentence'], the list of spaCy noun-chunk
# texts (listofnouns) and the list of NLTK-tagged verbs (listofverbs).
# Both result lists are index-aligned with the rows of df1.
listofnouns = []
listofverbs = []
nlp = spacy.load("en_core_web_sm")
for sentence_text in df1['sentence']:
    doc = nlp(sentence_text)
    # The chunk variable is deliberately not called `np`: that name is the
    # numpy alias imported earlier in this notebook, and rebinding it here
    # would break any later `np.` call.
    listofnouns.append([chunk.text for chunk in doc.noun_chunks])
    listofverbs.append(verbExtractor(sentence_text))


In [None]:
listofnouns

[['six main nutrients',
  'our food',
  'Carbohydrates',
  'fats',
  'proteins',
  'fibre',
  'vitamins',
  'minerals'],
 ['These nutrients',
  'the different food items',
  'we',
  'differing proportions'],
 ['Our main need', 'energy'],
 ['us', 'some', 'them'],
 ['It', 'carbohydrates'],
 ['our diet',
  'a large proportion',
  'cereals',
  'the form',
  'rice',
  'chapatti',
  'roti',
  'bhakri',
  'bread',
  'which',
  'carbohydrates'],
 ['We',
  'proteins',
  'the purposes',
  'growth',
  'the wear',
  'tear',
  'the body',
  'other life processes'],
 ['We',
  'proteins',
  'the sprouts',
  'milk',
  'milk',
  'products',
  'meat',
  'eggs',
  'our food'],
 ['Some part',
  'our energy requirement',
  'fatty foods',
  'oil',
  'ghee',
  'butter'],
 ['We', 'energy', 'the form', 'heat', 'the food', 'we'],
 ['Heat', 'kilocalories'],
 ['the energy', 'food items', 'kilocalories', 'Calories'],
 ['Minerals',
  'vitamins',
  "the body's resistance",
  'disease',
  'other life',
  'the body',


In [None]:
listofverbs

[['are'],
 ['are', 'eat', 'differing'],
 ['is'],
 ['Let', 'learn'],
 ['is', 'fulfilled'],
 ['includes', 'contain'],
 ['need', 'repairing'],
 ['get'],
 ['is', 'fulfilled'],
 ['get', 'eat'],
 ['is', 'measured'],
 ['is', 'measured'],
 ['improve', 'disease', 'processes', 'needs'],
 ['obtain'],
 ['require', 'lead'],
 ['Let', 'learn'],
 ['needs'],
 ['gives', 'provides', 'caused'],
 ['is', 'strengthen'],
 ['need', 'repairing'],
 ['get'],
 ['improve', 'disease', 'processes'],
 ['obtain'],
 ['is'],
 ['is', 'fulfilled'],
 ['includes'],
 ['are'],
 ['turns', 'comes'],
 ['infer', 'is'],
 ['contain'],
 ['contain'],
 ['obtain', 'flour'],
 ['form'],
 ['get'],
 ['taste'],
 ['get', 'contains', 'called'],
 ['contain'],
 ['give'],
 ['make'],
 ['produce'],
 ['takes'],
 ['is', 'brought'],
 ['is', 'called'],
 ['give'],
 ['Living', 'are', 'made', 'called'],
 ['are', 'brought'],
 ['do', 'have', 'are'],
 ['are', 'called'],
 ['appear'],
 ['Living', 'are', 'made', 'called'],
 ['are', 'brought'],
 ['are', 'made'],

In [None]:
# Column mapping for the result frame: original sentence plus the
# automatically extracted noun chunks and verbs.
data = {
    'sentence': df1['sentence'],
    'n_list': listofnouns,
    'v_list': listofverbs,
}

In [None]:
new_df = pd.DataFrame(data)

In [None]:
new_df

Unnamed: 0,sentence,n_list,v_list
0,There are six main nutrients in our food : Car...,"[six main nutrients, our food, Carbohydrates, ...",[are]
1,These nutrients are present in the different f...,"[These nutrients, the different food items, we...","[are, eat, differing]"
2,Our main need is of energy.,"[Our main need, energy]",[is]
3,Let us learn more about some of them,"[us, some, them]","[Let, learn]"
4,It is fulfilled by carbohydrates,"[It, carbohydrates]","[is, fulfilled]"
...,...,...,...
230,It looks like a field covered with snow.na Fro...,"[It, a field, snow.na, these bolls, cotton, hand]","[looks, covered, is, picked]"
231,Fibres are then separated from the seeds by co...,"[Fibres, the seeds]","[are, separated, combing]"
232,This process is called ginning of cotton.,"[This process, ginning, cotton]","[is, called]"
233,Ginning was traditionally done by hand (Fig.3.7),"[Ginning, hand, (Fig.3.7]","[Ginning, was, done]"


## Getting the count of noun chunks and verbs.

In [None]:
# Number of automatically extracted noun chunks / verbs per sentence.
noofnouns = [len(chunks) for chunks in listofnouns]
noofverbs = [len(verbs) for verbs in listofverbs]

In [None]:
# All cells of the manually annotated noun-chunk column 'n(list)' as a plain Python list.
# NOTE(review): the name `month` does not describe the contents — presumably a leftover; confirm before renaming.
month = df1['n(list)'].tolist()

In [None]:
# Shallow copy of `month` (same elements, same order).
listof = list(month)

In [None]:
# Number of manually annotated items per sentence: each cell is a
# comma-separated string, so the count is the number of comma-split parts.
# NOTE(review): a verb cell holding the "not_a_verb" placeholder contains no
# comma and is therefore counted as one verb — confirm this is intended.
noofnouns_insheet = [len(cell.split(',')) for cell in df1['n(list)'].tolist()]
noofverbs_insheet = [len(cell.split(',')) for cell in df1['v(list)'].tolist()]

In [None]:
# Per-sentence difference: annotated count minus extracted count.
# Positive means the sheet lists more items than were extracted, negative
# means the extractor produced more items than the sheet lists.
noun_nomatch = [
    noofnouns_insheet[row] - extracted for row, extracted in enumerate(noofnouns)
]
verb_nomatch = [
    noofverbs_insheet[row] - extracted for row, extracted in enumerate(noofverbs)
]

In [None]:
len(df1['n(list)'])

235

In [None]:
# Per-sentence "exact match" figure: the smaller of the extracted count and
# the annotated count. This compares counts only, not the items themselves.
row_total = len(df1['n(list)'])
exact_mat_noun = [
    min(noofnouns[row], noofnouns_insheet[row]) for row in range(row_total)
]
exact_mat_verb = [
    min(noofverbs[row], noofverbs_insheet[row]) for row in range(row_total)
]


In [None]:
# Per-sentence count table. "excel" columns hold the manually annotated
# counts, "text file" columns hold the automatically extracted counts.
datacount = {
    'N(excel)': noofnouns_insheet,
    'N(text file)': noofnouns,
    'N(Exact_match)': exact_mat_noun,
    'N(no match)': noun_nomatch,
    'V(excel)': noofverbs_insheet,
    'V(text file)': noofverbs,
    'V(no match)': verb_nomatch,
    'V(Exact_match)': exact_mat_verb,
}

In [None]:
datacount_df = pd.DataFrame(datacount)

In [None]:
datacount_df

Unnamed: 0,N(excel),N(text file),N(Exact_match),N(no match),V(excel),V(text file),V(no match),V(Exact_match)
0,9,8,8,1,1,1,0,1
1,4,4,4,0,2,3,-1,2
2,2,2,2,0,1,1,0,1
3,3,3,3,0,2,2,0,2
4,2,2,2,0,1,2,-1,1
...,...,...,...,...,...,...,...,...
230,6,6,6,0,3,4,-1,3
231,2,2,2,0,2,3,-1,2
232,3,3,3,0,1,2,-1,1
233,2,3,2,-1,1,3,-2,1


In [None]:
# Column sums over all sentences, nouns first and verbs second.
total_N_excel, total_text_file, total_nomatch, total_exa = (
    datacount_df[column].sum()
    for column in ('N(excel)', 'N(text file)', 'N(no match)', 'N(Exact_match)')
)
total_V_excel, total_V_text, total_V_nomatch, total_V_exa = (
    datacount_df[column].sum()
    for column in ('V(excel)', 'V(text file)', 'V(no match)', 'V(Exact_match)')
)

In [None]:
# Fraction of annotated noun chunks covered by the "exact match" count.
per_noun = total_exa / total_N_excel

In [None]:
total_N_excel

1037

In [None]:
total_exa

1012

In [None]:
per_noun*100   # 97.03947368421053 for science-dataset2 # 97.58 for science Dataset1 # 93.6 for Dataset3

97.58919961427193

In [None]:
# Fraction of annotated verbs covered by the "exact match" count.
per_verb = total_V_exa / total_V_excel

In [None]:
per_verb*100 # 91.71597633136095  for science-dataset2 # 96.00 for science Dataset1 # 92.93 for Dataset3


96.0