# Syantactic Processing - Assignment

### Imports and Prepare the Model

In [1]:
import spacy
import sklearn_crfsuite
from sklearn_crfsuite import metrics

model = spacy.load("en_core_web_sm")

In [2]:
import pandas as pd
import numpy as np

## Read the input file contents

In [3]:
base_dir = './'
with open(base_dir+'train_sent', 'r') as train_sentence_file:
  input_train_sent = train_sentence_file.readlines()

with open(base_dir+'train_label', 'r') as train_label_file:
  input_train_labels = train_label_file.readlines()

with open(base_dir+'test_sent', 'r') as test_sentence_file:
  input_test_sent = test_sentence_file.readlines()

with open(base_dir+'test_label', 'r') as test_label_file:
  input_test_labels = test_label_file.readlines()


In [4]:
input_train_sent[0:5]
input_train_labels[0:5]

['O\n', 'O\n', 'O\n', 'O\n', 'O\n']

**The Lines in each file have \n as a trailing character. This needs to be trimmed down before creating the sentences**

In [5]:
input_train_sent = [word.splitlines()[0] for word in input_train_sent]
input_train_labels = [word.splitlines()[0] for word in input_train_labels]

input_test_sent = [word.splitlines()[0] for word in input_test_sent]
input_test_labels = [word.splitlines()[0] for word in input_test_labels]

In [6]:
def get_sentences(data):
    "Returns a list of sentences by joining the input data wherever an empty string value is encountered..."
    return_data = []
    current_line = ''
    for word in data:
        if len(word) == 0 :
            return_data.append(current_line)
            current_line = ""
        else:
            if len(current_line) == 0:
                current_line = word
            else:
                current_line += ' '+ word
    return return_data

In [7]:
train_sentences = get_sentences(input_train_sent)
train_labels = get_sentences(input_train_labels)
test_sentences = get_sentences(input_test_sent)
test_labels = get_sentences(input_test_labels)

In [8]:
print("Total Sentences in the training Corpus: ", len(train_sentences))
print("Total Labels in the training Corpus: ", len(train_labels))
print("Total Sentences in the testing Corpus: ", len(test_sentences))
print("Total Labels in the testing Corpus: ", len(test_labels))


Total Sentences in the training Corpus:  2599
Total Labels in the training Corpus:  2599
Total Sentences in the testing Corpus:  1056
Total Labels in the testing Corpus:  1056


### Task-01: Print 5 sentences

#### Training sentences

In [9]:
for i in range(0,5):
    print("Sentence: ", train_sentences[i])
    print("Labels: ", train_labels[i], end="\n\n")

Sentence:  All live births > or = 23 weeks at the University of Vermont in 1995 ( n = 2395 ) were retrospectively analyzed for delivery route , indication for cesarean , gestational age , parity , and practice group ( to reflect risk status )
Labels:  O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O

Sentence:  The total cesarean rate was 14.4 % ( 344 of 2395 ) , and the primary rate was 11.4 % ( 244 of 2144 )
Labels:  O O O O O O O O O O O O O O O O O O O O O O O O O

Sentence:  Abnormal presentation was the most common indication ( 25.6 % , 88 of 344 )
Labels:  O O O O O O O O O O O O O O O

Sentence:  The `` corrected '' cesarean rate ( maternal-fetal medicine and transported patients excluded ) was 12.4 % ( 273 of 2194 ) , and the `` corrected '' primary rate was 9.6 % ( 190 of 1975 )
Labels:  O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O

Sentence:  Arrest of dilation was the most common indication in both `` c

#### Testing sentences

In [10]:
for i in range(0,5):
    print("Sentence: ", test_sentences[i])
    print("Labels: ", test_labels[i], end="\n\n")

Sentence:  Furthermore , when all deliveries were analyzed , regardless of risk status but limited to gestational age > or = 36 weeks , the rates did not change ( 12.6 % , 280 of 2214 ; primary 9.2 % , 183 of 1994 )
Labels:  O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O

Sentence:  As the ambient temperature increases , there is an increase in insensible fluid loss and the potential for dehydration
Labels:  O O O O O O O O O O O O O O O O O O O

Sentence:  The daily high temperature ranged from 71 to 104 degrees F and AFI values ranged from 1.7 to 24.7 cm during the study period
Labels:  O O O O O O O O O O O O O O O O O O O O O O O O

Sentence:  There was a significant correlation between the 2- , 3- , and 4-day mean temperature and AFI , with the 4-day mean being the most significant ( r = 0.31 , p & # 60 ; 0.001 )
Labels:  O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O

Sentence:  Fluctuations in ambient temperat

### Task-02: Extract the POS - Frequency of NOUN OR PROPN in train & test corpus

In [11]:
train_data = pd.DataFrame(columns=['sent', 'Text', 'LEMMA', 'POS', 'DEP'])
test_data = pd.DataFrame(columns=['sent', 'Text', 'LEMMA', 'POS', 'DEP'])

In [12]:
for index, each_sent in enumerate(train_sentences):
    sent = model(each_sent)
    for token in sent:
        train_data.loc[len(train_data)] = {'sent': index, 'Text': token.text, 'LEMMA': token.lemma_, 'POS' : token.pos_, 'DEP': token.dep_}

In [13]:
for index, each_sent in enumerate(test_sentences):
    sent = model(each_sent)
    for token in sent:
        test_data.loc[len(test_data)] = {'sent': index, 'Text': token.text, 'LEMMA': token.lemma_, 'POS' : token.pos_, 'DEP': token.dep_}

In [14]:
nouns_in_train_data = train_data[(train_data.POS == 'NOUN') | (train_data.POS == 'PROPN')].loc[:, ['Text', 'POS']]
nouns_in_test_data = test_data[(test_data.POS == 'NOUN') | (test_data.POS == 'PROPN')].loc[:, ['Text', 'POS']]

final_nouns_in_data= pd.concat([nouns_in_train_data, nouns_in_test_data])

In [15]:
final_nouns_in_data.groupby(by='Text').agg('count').sort_values(by='POS', ascending=False)

Unnamed: 0_level_0,POS
Text,Unnamed: 1_level_1
patients,492
treatment,281
%,247
cancer,200
therapy,175
...,...
fungicide,1
Minimal,1
Mikrozirkulation,1
fÃƒ1/4r,1


In [16]:
grouped_final_nouns_in_data = final_nouns_in_data.groupby(by='Text').agg('count').sort_values(by='POS', ascending=False).reset_index()

**Top 25 Most frequent words in the corpus (training+testing)**

In [17]:
grouped_final_nouns_in_data.head(25)

Unnamed: 0,Text,POS
0,patients,492
1,treatment,281
2,%,247
3,cancer,200
4,therapy,175
5,study,154
6,disease,142
7,cell,140
8,lung,116
9,group,94
