# Syantactic Processing - Assignment

### Imports and Prepare the Model

In [1]:
!pip install sklearn-crfsuite



In [2]:
import spacy
import sklearn_crfsuite
from sklearn_crfsuite import metrics

model = spacy.load("en_core_web_sm")

In [3]:
import pandas as pd
import numpy as np

## Read the input file contents

In [4]:
base_dir = './'
with open(base_dir+'train_sent', 'r') as train_sentence_file:
  input_train_sent = train_sentence_file.readlines()

with open(base_dir+'train_label', 'r') as train_label_file:
  input_train_labels = train_label_file.readlines()

with open(base_dir+'test_sent', 'r') as test_sentence_file:
  input_test_sent = test_sentence_file.readlines()

with open(base_dir+'test_label', 'r') as test_label_file:
  input_test_labels = test_label_file.readlines()


In [5]:
input_train_sent[0:5]
input_train_labels[0:5]

['O\n', 'O\n', 'O\n', 'O\n', 'O\n']

**The Lines in each file have \n as a trailing character. This needs to be trimmed down before creating the sentences**

In [6]:
input_train_sent = [word.splitlines()[0].lower() for word in input_train_sent]
input_train_labels = [word.splitlines()[0] for word in input_train_labels]

input_test_sent = [word.splitlines()[0].lower() for word in input_test_sent]
input_test_labels = [word.splitlines()[0] for word in input_test_labels]

In [7]:
def get_sentences(data):
    "Returns a list of sentences by joining the input data wherever an empty string value is encountered..."
    return_data = []
    current_line = ''
    for word in data:
        if len(word) == 0 :
            return_data.append(current_line)
            current_line = ""
        else:
            if len(current_line) == 0:
                current_line = word
            else:
                current_line += ' '+ word
    return return_data

In [8]:
train_sentences = get_sentences(input_train_sent)
train_labels = get_sentences(input_train_labels)
test_sentences = get_sentences(input_test_sent)
test_labels = get_sentences(input_test_labels)

In [9]:
print("Total Sentences in the training Corpus: ", len(train_sentences))
print("Total Labels in the training Corpus: ", len(train_labels))
print("Total Sentences in the testing Corpus: ", len(test_sentences))
print("Total Labels in the testing Corpus: ", len(test_labels))


Total Sentences in the training Corpus:  2599
Total Labels in the training Corpus:  2599
Total Sentences in the testing Corpus:  1056
Total Labels in the testing Corpus:  1056


### Task-01: Print 5 sentences

#### Training sentences

In [10]:
for i in range(0,5):
    print("Sentence: ", train_sentences[i])
    print("Labels: ", train_labels[i], end="\n\n")

Sentence:  all live births > or = 23 weeks at the university of vermont in 1995 ( n = 2395 ) were retrospectively analyzed for delivery route , indication for cesarean , gestational age , parity , and practice group ( to reflect risk status )
Labels:  O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O

Sentence:  the total cesarean rate was 14.4 % ( 344 of 2395 ) , and the primary rate was 11.4 % ( 244 of 2144 )
Labels:  O O O O O O O O O O O O O O O O O O O O O O O O O

Sentence:  abnormal presentation was the most common indication ( 25.6 % , 88 of 344 )
Labels:  O O O O O O O O O O O O O O O

Sentence:  the `` corrected '' cesarean rate ( maternal-fetal medicine and transported patients excluded ) was 12.4 % ( 273 of 2194 ) , and the `` corrected '' primary rate was 9.6 % ( 190 of 1975 )
Labels:  O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O

Sentence:  arrest of dilation was the most common indication in both `` c

#### Testing sentences

In [11]:
for i in range(0,5):
    print("Sentence: ", test_sentences[i])
    print("Labels: ", test_labels[i], end="\n\n")

Sentence:  furthermore , when all deliveries were analyzed , regardless of risk status but limited to gestational age > or = 36 weeks , the rates did not change ( 12.6 % , 280 of 2214 ; primary 9.2 % , 183 of 1994 )
Labels:  O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O

Sentence:  as the ambient temperature increases , there is an increase in insensible fluid loss and the potential for dehydration
Labels:  O O O O O O O O O O O O O O O O O O O

Sentence:  the daily high temperature ranged from 71 to 104 degrees f and afi values ranged from 1.7 to 24.7 cm during the study period
Labels:  O O O O O O O O O O O O O O O O O O O O O O O O

Sentence:  there was a significant correlation between the 2- , 3- , and 4-day mean temperature and afi , with the 4-day mean being the most significant ( r = 0.31 , p & # 60 ; 0.001 )
Labels:  O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O

Sentence:  fluctuations in ambient temperat

### Task-02: Extract the POS - Frequency of NOUN OR PROPN in train & test corpus

In [12]:
train_data = pd.DataFrame(columns=['sent', 'Text', 'LEMMA', 'POS', 'DEP'])
test_data = pd.DataFrame(columns=['sent', 'Text', 'LEMMA', 'POS', 'DEP'])

In [13]:
for index, each_sent in enumerate(train_sentences):
    sent = model(each_sent)
    for token in sent:
        train_data.loc[len(train_data)] = {'sent': index, 'Text': token.text, 'LEMMA': token.lemma_, 'POS' : token.pos_, 'DEP': token.dep_}

In [14]:
for index, each_sent in enumerate(test_sentences):
    sent = model(each_sent)
    for token in sent:
        test_data.loc[len(test_data)] = {'sent': index, 'Text': token.text, 'LEMMA': token.lemma_, 'POS' : token.pos_, 'DEP': token.dep_}

In [15]:
nouns_in_train_data = train_data[(train_data.POS == 'NOUN') | (train_data.POS == 'PROPN')].loc[:, ['Text', 'POS']]
nouns_in_test_data = test_data[(test_data.POS == 'NOUN') | (test_data.POS == 'PROPN')].loc[:, ['Text', 'POS']]

final_nouns_in_data= pd.concat([nouns_in_train_data, nouns_in_test_data])

In [16]:
final_nouns_in_data.groupby(by='Text').agg('count').sort_values(by='POS', ascending=False)

Unnamed: 0_level_0,POS
Text,Unnamed: 1_level_1
patients,507
treatment,304
%,247
cancer,211
therapy,177
...,...
midline,1
midwives,1
communications,1
communication,1


In [17]:
grouped_final_nouns_in_data = final_nouns_in_data.groupby(by='Text').agg('count').sort_values(by='POS', ascending=False).reset_index()

**Top 25 Most frequent words in the corpus (training+testing)**

In [18]:
grouped_final_nouns_in_data.head(25)

Unnamed: 0,Text,POS
0,patients,507
1,treatment,304
2,%,247
3,cancer,211
4,therapy,177
5,study,174
6,disease,151
7,cell,142
8,lung,118
9,results,117


### Task-03: Define the CRF Features

#### Functions to Process the Sentences

In [19]:
def features_for_word(sentence, position, pos_tags):
    word = sentence[position]
    features = [
        'word.lower=' + word.lower(), # serves as word id
        'word[-3:]=' + word[-3:],     # last three characters
        'word[-2:]=' + word[-2:],     # last two characters
        'word.isupper=%s' % word.isupper(),  # is the word in all uppercase
        'word.isdigit=%s' % word.isdigit(),  # is the word a number
        'word.startsWithCapital=%s' % word[0].isupper(), # is the word starting with a capital letter
        'word.pos=' + pos_tags[position]
    ]
    if position > 0:
        prev_word = sentence[position-1]
        previous_word_features = [
        'prev_word.lower=' + prev_word.lower(),
        'prev_word.isupper=%s' % prev_word.isupper(),
        'prev_word.isdigit=%s' % prev_word.isdigit(),
        'prev_word.startsWithCapital=%s' % prev_word[0].isupper(),
        'prev_word.pos=' + pos_tags[position-1]]
        features.extend(previous_word_features)
    else:
        features.append('BEG')

    if (position == len(sentence)-1):
        features.append('END')

    return features

### Task-04: Compute the Features of a Sentence

In [20]:
def features_for_sentence(sentence):
    spacy_sentence = model(sentence)

    words = sentence.split()
    pos_tags = [word.pos_ for word in spacy_sentence]

    return [features_for_word(sentence, i, pos_tags) for i in range(0, len(words))]

In [21]:
features_for_sentence(train_sentences[0])[0]

['word.lower=a',
 'word[-3:]=a',
 'word[-2:]=a',
 'word.isupper=False',
 'word.isdigit=False',
 'word.startsWithCapital=False',
 'word.pos=DET',
 'BEG']

#### Functions to process the Labels

In [22]:
def labels_for_sentence(labels):
  return labels.split()

### Task-05: Extract Features' Values for the Sentence

In [23]:
X_train = [features_for_sentence(sentence) for sentence in train_sentences]
Y_train = [labels_for_sentence(labels) for labels in train_labels]

In [24]:
X_test = [features_for_sentence(sentence) for sentence in test_sentences]
Y_test = [labels_for_sentence(labels) for labels in test_labels]

In [25]:
print(X_train[0][0])

['word.lower=a', 'word[-3:]=a', 'word[-2:]=a', 'word.isupper=False', 'word.isdigit=False', 'word.startsWithCapital=False', 'word.pos=DET', 'BEG']


In [26]:
print(Y_train[0])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


### Task-06: CRF model for a custom NER application

In [27]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics

In [28]:
crf = sklearn_crfsuite.CRF(max_iterations=100, keep_tempfiles=False)

try:
    crf.fit(X_train, Y_train)
except AttributeError:
    pass

**Print Transition Details**

In [29]:
crf.transition_features_

{('O', 'O'): 0.79306,
 ('O', 'D'): 0.427092,
 ('O', 'T'): -1.201872,
 ('D', 'O'): -0.88395,
 ('D', 'D'): 3.591458,
 ('D', 'T'): -2.139198,
 ('T', 'O'): 0.258317,
 ('T', 'T'): 3.423218}

In [30]:
crf.classes_

['O', 'D', 'T']

In [31]:
crf.state_features_

{('word.lower=a', 'O'): 0.020241,
 ('word.lower=a', 'D'): -0.02736,
 ('word.lower=a', 'T'): 0.007119,
 ('word[-3:]=a', 'O'): 0.020241,
 ('word[-3:]=a', 'D'): -0.02736,
 ('word[-3:]=a', 'T'): 0.007119,
 ('word[-2:]=a', 'O'): 0.020241,
 ('word[-2:]=a', 'D'): -0.02736,
 ('word[-2:]=a', 'T'): 0.007119,
 ('word.isupper=False', 'O'): 0.77731,
 ('word.isupper=False', 'D'): -0.32984,
 ('word.isupper=False', 'T'): -0.44747,
 ('word.isdigit=False', 'O'): 0.450665,
 ('word.isdigit=False', 'D'): -0.31765,
 ('word.isdigit=False', 'T'): -0.133014,
 ('word.startsWithCapital=False', 'O'): 0.77731,
 ('word.startsWithCapital=False', 'D'): -0.32984,
 ('word.startsWithCapital=False', 'T'): -0.44747,
 ('word.pos=DET', 'O'): 0.761333,
 ('word.pos=DET', 'D'): -0.228608,
 ('word.pos=DET', 'T'): -0.532724,
 ('BEG', 'O'): 0.609883,
 ('BEG', 'D'): -0.080265,
 ('BEG', 'T'): -0.529618,
 ('word.lower=l', 'O'): -0.061103,
 ('word.lower=l', 'D'): 0.020057,
 ('word.lower=l', 'T'): 0.041046,
 ('word[-3:]=l', 'O'): -0.0

In [32]:
X_test = [features_for_sentence(sentence) for sentence in test_sentences]
Y_test = [labels_for_sentence(labels) for labels in test_labels]

In [33]:
y_pred = crf.predict(X_test)

In [34]:
pred_label=[]
for index, i in enumerate(y_pred):
    pred_label.extend(i)
    if 'T' in i:
        print(index)

### Task: 07: Calculate the F1 score

In [35]:
metrics.flat_f1_score(Y_test, y_pred, average='weighted')

0.806645227359197

In [36]:
print(len(y_pred), len(test_sentences))

1056 1056


In [37]:
pd.DataFrame({'Predicted': y_pred[45], 'Sentence': test_sentences[45].split()})

Unnamed: 0,Predicted,Sentence
0,O,these
1,O,initial
2,O,observations
3,O,are
4,O,an
5,O,encouraging
6,O,step
7,O,toward
8,O,the
9,O,description


### Task-08: Get all predicted Treatment labels
***and corresponding to each Disease label D in the test Dataaset***

In [38]:
predicted_data = pd.DataFrame(columns=['Predicted'])

In [39]:
for pred_labels in y_pred:
    predicted_data = pd.concat([predicted_data, pd.DataFrame({'Predicted': pred_labels})], axis=0)

In [40]:
overall_response = {}

In [41]:
for sent_index in range(len(y_pred)):
    current_disease = ''
    current_treatment = ''
    for label_index in range(len(y_pred[sent_index])):
        label = y_pred[sent_index][label_index]
        if label=='D':
            current_disease += test_sentences[sent_index].split()[label_index] + ' '
            #print('>>D:', current_disease)
        elif label=='T':
            current_treatment += test_sentences[sent_index].split()[label_index] + ' '
            #print('>>T:', current_treatment)

    current_disease = current_disease.strip()
    current_treatment = current_treatment.strip()

    if (len(current_disease) == 0 | len(current_treatment) == 0) :
        continue
    #print('>>D:', current_disease)
    #print('>>T:', current_treatment)
    #print('Disease Exists:', (current_disease in overall_response))
    #print('Disease:', overall_response[current_disease] )
    if current_disease in overall_response:
        overall_response[current_disease].append(current_treatment)
        #print(overall_response)
    else:
        overall_response[current_disease] = [current_treatment]
        #print(overall_response)

In [42]:
overall_response

{'nonhereditary cases': [''],
 'non-hla genetic susceptibility to ms': [''],
 'non-pregnant women were made': [''],
 'intermediate hosts': [''],
 'female stress urinary incontinence': [''],
 'patients with non-obstructive azoospermia , tefna': [''],
 'araneus diadematus cl': [''],
 'congenital adrenal hyperplasia': [''],
 'ml ) cells': [''],
 'coronary angioplasty': [''],
 'british dental surveys': [''],
 'malignant soft tissue sarcomas': [''],
 'bilateral cortical lesions': [''],
 'recovery after thrombolysis': [''],
 'early bos': [''],
 'spinal adhesive arachnoiditis': [''],
 'primary uveal melanoma': [''],
 'atrial fibrillation': [''],
 'temporomandibular joint arthropathy': [''],
 'severe secondary peritonitis': ['']}

k = pd.concat([test_data, predicted_data], axis=0)

test_data.sent.max()

t = 0
for d in Y_test:
    t += len(d)
print(t)

len(Y_test)

Y_test[-1]

len(test_sentences)

len(Y_test)

len(y_pred)

min(test_data.sent)

test_data.head()

len(test_data.Text) <3

In [43]:
response = {}
for pred_labels in y_pred:

    for each_label in pred_labels:
        match each_label:
            case 'D': #Disease
                
            case 'T': #Treatment
            case  _ :


SyntaxError: invalid syntax (73835126.py, line 5)