## Notebook Imports

In [15]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score, precision_score, f1_score, classification_report


## File Paths

In [3]:
TRAINING_DATA_JSON_FILE = 'Training_Data/Training_Data.json'
TESTING_DATA_PATH = 'Testing_Data/Testing_Data.csv'


## Constants

In [4]:
Preamble_classification_id = 0
Lender_defaulting_classification_id = 1
Governing_law_classification_id = 2
Indemnification_classification_id = 3
Other_classification_id = 4

## Load Data

In [5]:
data = pd.read_json(TRAINING_DATA_JSON_FILE)
data

Unnamed: 0,STATEMENT,CATEGORY
0,This CREDIT AGREEMENT is entered into as of No...,0
1,This agreement is dated 22 December 2014 and m...,0
2,"This TERM LOAN AGREEMENT, dated as of April 24...",0
3,This CREDIT AGREEMENT is entered into as of De...,0
4,Defaulting Lender’s right to approve or disapp...,1
5,Lender during the defaulting Period with respe...,1
6,"Any Lender defaulting, as reasonably determine...",1
7,Lender shall be deemed to be a Defaulting Lend...,1
8,This Agreement and shall be governed by and in...,2
9,The choice of South African law as the governi...,2


In [6]:
data_features = data.loc[:,data.columns != 'CATEGORY']  
data_features

Unnamed: 0,STATEMENT
0,This CREDIT AGREEMENT is entered into as of No...
1,This agreement is dated 22 December 2014 and m...
2,"This TERM LOAN AGREEMENT, dated as of April 24..."
3,This CREDIT AGREEMENT is entered into as of De...
4,Defaulting Lender’s right to approve or disapp...
5,Lender during the defaulting Period with respe...
6,"Any Lender defaulting, as reasonably determine..."
7,Lender shall be deemed to be a Defaulting Lend...
8,This Agreement and shall be governed by and in...
9,The choice of South African law as the governi...


In [7]:
data_targets = data.loc[:,data.columns != 'STATEMENT']  
data_targets

Unnamed: 0,CATEGORY
0,0
1,0
2,0
3,0
4,1
5,1
6,1
7,1
8,2
9,2


## Creating a Vocabulary

In [8]:
vectorizer = CountVectorizer(stop_words='english')

In [9]:
all_features = vectorizer.fit_transform(data.STATEMENT)
vectorizer.vocabulary_

{'credit': 64,
 'agreement': 21,
 'entered': 95,
 'november': 166,
 '23': 6,
 '2010': 1,
 'dkin': 84,
 'finance': 104,
 'corp': 60,
 'delaware': 76,
 'corporation': 61,
 'borrower': 35,
 'effectiveness': 91,
 'joinder': 136,
 'dkn': 85,
 'brands': 37,
 'holdings': 120,
 'assumption': 31,
 'lender': 143,
 'time': 232,
 'party': 173,
 'hereto': 117,
 'clays': 47,
 'bank': 32,
 'plc': 180,
 'administrative': 16,
 'agent': 19,
 'swingline': 224,
 'issuer': 135,
 'dated': 67,
 '22': 5,
 'december': 70,
 '2014': 2,
 'parties': 172,
 'xyz': 242,
 'gold': 112,
 'mining': 163,
 'company': 50,
 'limited': 149,
 'subsidiaries': 222,
 'listed': 151,
 'schedule': 212,
 'original': 170,
 'guarantors': 115,
 'absa': 9,
 'nedbank': 164,
 'coordinators': 59,
 'financial': 105,
 'institutions': 132,
 'ii': 121,
 'mandated': 157,
 'lead': 141,
 'arrangers': 29,
 'lenders': 144,
 'term': 229,
 'loan': 152,
 'april': 27,
 '24': 7,
 '2020': 4,
 'pepo': 174,
 'defined': 74,
 'jpc': 137,
 'scotia': 213,
 'doc

In [13]:
x_train, x_test, y_train, y_test = train_test_split(
                        all_features, data.CATEGORY,
                test_size = 0.30, random_state = 101)

## Training and Testing the Model

In [14]:
classifier = MultinomialNB()
classifier.fit(x_train, y_train)

MultinomialNB()

In [16]:
predictions = classifier.predict(x_test)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       0.50      1.00      0.67         1
           3       1.00      1.00      1.00         1
           4       1.00      0.50      0.67         2

    accuracy                           0.86         7
   macro avg       0.88      0.88      0.83         7
weighted avg       0.93      0.86      0.86         7



## HyperParameter Optimization Using Grid Search

In [17]:
param_grid ={
    "alpha":[1.0,10.0,100.0,1000.0], "class_prior":[None], "fit_prior":[True]
} 

grid = GridSearchCV(MultinomialNB(), param_grid, refit = True, verbose = 3, cv=2)
 
grid.fit(x_train, y_train)



Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV 1/2] END ....alpha=1.0, class_prior=None, fit_prior=True; total time=   0.0s
[CV 2/2] END ....alpha=1.0, class_prior=None, fit_prior=True; total time=   0.0s
[CV 1/2] END ...alpha=10.0, class_prior=None, fit_prior=True; total time=   0.0s
[CV 2/2] END ...alpha=10.0, class_prior=None, fit_prior=True; total time=   0.0s
[CV 1/2] END ..alpha=100.0, class_prior=None, fit_prior=True; total time=   0.0s
[CV 2/2] END ..alpha=100.0, class_prior=None, fit_prior=True; total time=   0.0s
[CV 1/2] END .alpha=1000.0, class_prior=None, fit_prior=True; total time=   0.0s
[CV 2/2] END .alpha=1000.0, class_prior=None, fit_prior=True; total time=   0.0s




GridSearchCV(cv=2, estimator=MultinomialNB(),
             param_grid={'alpha': [1.0, 10.0, 100.0, 1000.0],
                         'class_prior': [None], 'fit_prior': [True]},
             verbose=3)

In [18]:

grid_predictions = grid.predict(x_test)
print(classification_report(y_test, grid_predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       0.50      1.00      0.67         1
           3       1.00      1.00      1.00         1
           4       1.00      0.50      0.67         2

    accuracy                           0.86         7
   macro avg       0.88      0.88      0.83         7
weighted avg       0.93      0.86      0.86         7

