# Training The Model

### Objectives

#### 1. Read and Convert Training Data Features from .csv file to NumPy Array
#### 2. Create a Full Matrix to Represent the Data from Sparse Matrix
#### 3. Training the Naive Bayes Model
#### 4. Calculating the Probability of Each Word Belonging to a Particular Class
#### 5. Saving the Trained Model
#### 6. Prepare the Test Data
#### 7. Prepare the Validation Data

## Notebook Imports

In [45]:
import pandas as pd
import numpy as np

## File Paths

In [46]:
#------- Files Added in Previous Step
# Inputs read by this notebook. The *_SPARSE_MATRIX_* files are loaded with
# np.loadtxt and the *_WORD_ID_* files with pd.read_csv.

TRAINING_SPARSE_MATRIX_TXT_PATH = 'Training_Data/training_sparse_matrix.txt'
TESTING_SPARSE_MATRIX_TXT_PATH = 'Testing_Data/testing_sparse_matrix.txt'
# NOTE(review): the testing vocabulary path points into Training_Data/ while
# the testing sparse matrix lives in Testing_Data/ - confirm this is intended.
TESTING_WORD_ID_FILE_PATH = 'Training_Data/testing_vocab_ids.csv'
VALIDATION_SPARSE_MATRIX_TXT_PATH = 'Validation_Data/Validation_Data_Sparse_Matrix.txt'
# Not read anywhere in the visible part of this notebook.
VALIDATION_CLASS_SPARSE_MATRIX_TXT_PATH = 'Validation_Data/Validation_Class_Sparse_Matrix.txt'
WORD_ID_FILE_PATH = 'Training_Data/vocab_ids.csv'


#------ Files Added in Current Step
# Outputs written by this notebook with np.savetxt: the dense test and
# validation matrices, the per-class word probabilities P(word | class),
# the overall word probabilities P(word), and the class priors.

TESTING_DATA_FULL_MATRIX = 'Testing_Data/testing_full_matrix.txt'
VALIDATION_DATA_FULL_MATRIX =  'Validation_Data/Validation_Data_Full_Matrix.txt'
PREAMBLE_WORD_PROB = 'Trained_Model/preamble_word_prob.txt'
LENDER_DEFAULTING_WORD_PROB = 'Trained_Model/lender_defaulting_word_prob.txt' 
GOVERNING_LAW_WORD_PROB = 'Trained_Model/governing_law_word_prob.txt' 
INDEMNIFICATION_WORD_PROB = 'Trained_Model/indemnification_word_prob.txt'
OTHER_WORD_PROB = 'Trained_Model/other_word_prob.txt'
WORD_PROB = 'Trained_Model/word_prob.txt'
PROB_CLASS_PATH = 'Trained_Model/prob_class.txt'

## Constants

In [47]:
# Integer label of each statement class. These are the values the CATEGORY
# column of the training matrix is compared against when selecting a class.
(
    Preamble_classification_id,
    Lender_defaulting_classification_id,
    Governing_law_classification_id,
    Indemnification_classification_id,
    Other_classification_id,
) = range(5)

## Read and Convert Training Data Features from .csv file to NumPy Array

In [48]:
# Load the training sparse matrix. Each row is
# (doc id, word id, category id, occurrence count), the column order that the
# default indices of make_training_full_matrix expect.
# ndmin=2 keeps the result two-dimensional even when the file holds a single
# row; without it np.loadtxt returns a 1-D array and column slicing such as
# sparse_matrix[:, 0] fails.
training_data_sparse_matrix = np.loadtxt(
    TRAINING_SPARSE_MATRIX_TXT_PATH, delimiter=' ', dtype=int, ndmin=2
)
training_data_sparse_matrix

array([[  0,   0,   0,   2],
       [  0,   2,   0,   2],
       [  0,   3,   0,   1],
       ...,
       [ 20, 167,   4,   1],
       [ 20, 186,   4,   1],
       [ 20, 204,   4,   1]])

In [49]:
# Load the testing sparse matrix. Each row is
# (doc id, word id, occurrence count), the column order that the default
# indices of make_testing_full_matrix expect.
# ndmin=2 keeps the result two-dimensional even when the file holds a single
# row; without it np.loadtxt returns a 1-D array and column slicing fails.
testing_data_sparse_matrix = np.loadtxt(
    TESTING_SPARSE_MATRIX_TXT_PATH, delimiter=' ', dtype=int, ndmin=2
)
testing_data_sparse_matrix[:10]     # ------ Displaying first 10 rows

array([[ 0,  0,  3],
       [ 0,  2,  2],
       [ 0,  3,  4],
       [ 0,  4,  1],
       [ 0,  5,  1],
       [ 0,  6,  2],
       [ 0, 14,  1],
       [ 0, 22,  2],
       [ 0, 24,  1],
       [ 0, 40,  1]])

In [50]:
# Load the validation sparse matrix. Each row is
# (doc id, word id, occurrence count), the column order that the default
# indices of make_testing_full_matrix expect.
# ndmin=2 keeps the result two-dimensional even when the file holds a single
# row; without it np.loadtxt returns a 1-D array and column slicing fails.
validation_data_sparse_matrix = np.loadtxt(
    VALIDATION_SPARSE_MATRIX_TXT_PATH, delimiter=' ', dtype=int, ndmin=2
)
validation_data_sparse_matrix[:10]     # ------ Displaying first 10 rows

array([[ 0,  0,  1],
       [ 0,  2,  1],
       [ 0,  3,  2],
       [ 0,  5,  1],
       [ 0,  6,  1],
       [ 0, 15,  2],
       [ 0, 23,  1],
       [ 0, 27,  1],
       [ 0, 40,  1],
       [ 0, 63,  1]])

In [51]:
# Read the word-id tables for the training and the testing vocabulary
# (training file first, then testing file).
training_vocab, testing_vocab = (
    pd.read_csv(vocab_path)
    for vocab_path in (WORD_ID_FILE_PATH, TESTING_WORD_ID_FILE_PATH)
)

## Function to create a Full Matrix from Sparse Matrix

In [52]:
def make_training_full_matrix(sparse_matrix, nr_words, doc_idx=0, word_idx=1, cat_idx=2, freq_idx=3):
    """Expand a labelled sparse matrix into a dense document-by-word table.

    Args:
        sparse_matrix: Integer array with one row per (document, word) pair.
            The columns named by the ``*_idx`` arguments hold the document
            id, word id, category label and occurrence count. A 1-D array is
            treated as a single row; an empty array yields an empty table.
        nr_words: Vocabulary size. The result gets one column for every word
            id in ``range(nr_words)``.
        doc_idx: Column of ``sparse_matrix`` holding the document id.
        word_idx: Column of ``sparse_matrix`` holding the word id.
        cat_idx: Column of ``sparse_matrix`` holding the category label.
        freq_idx: Column of ``sparse_matrix`` holding the occurrence count.

    Returns:
        A DataFrame indexed by ``DOC_ID`` (the sorted unique document ids)
        with a ``CATEGORY`` column followed by one column per word id.
        Words that never occur in a document have a count of 0.
    """
    # np.loadtxt yields a 1-D array for a one-line file; normalise to 2-D so
    # the column slicing below works for that case too.
    rows = np.atleast_2d(sparse_matrix)
    has_rows = rows.size > 0

    doc_ids = np.unique(rows[:, doc_idx]) if has_rows else np.array([], dtype=int)
    column_names = ['DOC_ID', 'CATEGORY'] + list(range(nr_words))

    # Build the zero-filled table directly instead of creating an all-NaN
    # frame and relying on fillna to downcast it back to integers.
    full_matrix = pd.DataFrame(0, index=doc_ids, columns=column_names)
    full_matrix['DOC_ID'] = doc_ids

    if has_rows:
        for row in rows:
            doc_nr = row[doc_idx]
            full_matrix.at[doc_nr, 'CATEGORY'] = row[cat_idx]
            full_matrix.at[doc_nr, row[word_idx]] = row[freq_idx]

    full_matrix.set_index('DOC_ID', inplace=True)
    return full_matrix
    

In [53]:
# Dense training table: one row per statement (DOC_ID), a CATEGORY column and
# one column per vocabulary word whose value is that word's occurrence count
# in the statement.
training_data_full_matrix = make_training_full_matrix(
    sparse_matrix=training_data_sparse_matrix,
    nr_words=training_vocab.shape[0],
)
training_data_full_matrix.head()

Unnamed: 0_level_0,CATEGORY,0,1,2,3,4,5,6,7,8,...,200,201,202,203,204,205,206,207,208,209
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2,0,2,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,1,0,1,2,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,2,1,1,0,0,0,0,0,1,...,0,0,0,1,0,0,1,0,0,0


In [54]:
# Word counts only: the full matrix with the CATEGORY column removed.
training_data_word_occurence = training_data_full_matrix.drop(columns='CATEGORY', errors='ignore')
training_data_word_occurence.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,200,201,202,203,204,205,206,207,208,209
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2,0,2,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1,0,1,2,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,0,1,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,1,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0,1,0,0,0


## Calculating the Total Number of Words in Each Class to Use in the Bayes Probability Formula

In [55]:
# Total number of words in each statement: sum every word column per row.
statement_lengths = training_data_word_occurence.sum(axis='columns')
statement_lengths.head()

DOC_ID
0    38
1    38
2    20
3    20
4    17
dtype: int64

In [56]:
# Grand total of word occurrences over all training statements.
total_no_of_words = statement_lengths.to_numpy().sum()
print(f'Total Number of Words in all the Statements = {total_no_of_words}')

Total Number of Words in all the Statements = 432


In [57]:
# Lengths of the statements labelled Preamble, their combined word count, and
# the prior P(Preamble) = Preamble statements / all statements.
preamble_lengths = statement_lengths.loc[
    training_data_full_matrix['CATEGORY'] == Preamble_classification_id
]
preamble_word_count = preamble_lengths.sum()
prob_preamble = preamble_lengths.shape[0] / statement_lengths.shape[0]
print(f'The Total Number of Words in Preamble = {preamble_word_count}')
print(f'The Probablity of a Statement to be a Preamble = {prob_preamble}')
preamble_lengths


The Total Number of Words in Preamble = 116
The Probablity of a Statement to be a Preamble = 0.19047619047619047


DOC_ID
0    38
1    38
2    20
3    20
dtype: int64

In [58]:
# Lengths of the statements labelled Lender Defaulting, their combined word
# count, and the prior P(Lender Defaulting) = class statements / all statements.
lender_defaulting_lengths = statement_lengths.loc[
    training_data_full_matrix['CATEGORY'] == Lender_defaulting_classification_id
]
lender_defaulting_word_count = lender_defaulting_lengths.sum()
prob_lender_defaulting = lender_defaulting_lengths.shape[0] / statement_lengths.shape[0]
print(f'The Total Number of Words in Lender Defaulting = {lender_defaulting_word_count}')
print(f'The Probablity of a Statement to be Lender Defaulting = {prob_lender_defaulting}')
lender_defaulting_lengths

The Total Number of Words in Lender Defaulting = 71
The Probablity of a Statement to be Lender Defaulting = 0.19047619047619047


DOC_ID
4    17
5    14
6    25
7    15
dtype: int64

In [59]:
# Lengths of the statements labelled Governing Law, their combined word
# count, and the prior P(Governing Law) = class statements / all statements.
governing_law_lengths = statement_lengths.loc[
    training_data_full_matrix['CATEGORY'] == Governing_law_classification_id
]
governing_law_word_count = governing_law_lengths.sum()
prob_governing_law = governing_law_lengths.shape[0] / statement_lengths.shape[0]
print(f'The Probablity of a Statement to be a Governing Law = {prob_governing_law}')
print(f'The Total Number of Words in Governing Law = {governing_law_word_count}')
governing_law_lengths

The Probablity of a Statement to be a Governing Law = 0.19047619047619047
The Total Number of Words in Governing Law = 88


DOC_ID
8      9
9     17
10    25
11    37
dtype: int64

In [60]:
# Lengths of the statements labelled Indemnification, their combined word
# count, and the prior P(Indemnification) = class statements / all statements.
indemnification_lengths = statement_lengths.loc[
    training_data_full_matrix['CATEGORY'] == Indemnification_classification_id
]
indemnification_word_count = indemnification_lengths.sum()
prob_indemnification = indemnification_lengths.shape[0] / statement_lengths.shape[0]
print(f'The Probablity of a Statement to be an Indemnification = {prob_indemnification}')
print(f'The Total Number of Words in Indemnification = {indemnification_word_count}')
indemnification_lengths

The Probablity of a Statement to be an Indemnification = 0.23809523809523808
The Total Number of Words in Indemnification = 77


DOC_ID
12     8
13    20
14    11
15    12
16    26
dtype: int64

In [61]:
# Lengths of the statements labelled Other, their combined word count, and
# the prior P(Other) = class statements / all statements.
other_lengths = statement_lengths.loc[
    training_data_full_matrix['CATEGORY'] == Other_classification_id
]
other_word_count = other_lengths.sum()
prob_other = other_lengths.shape[0] / statement_lengths.shape[0]
print(f'The Probablity of a Statement to be Other = {prob_other}')
print(f'The Total Number of Words in Other = {other_word_count}')
other_lengths

The Probablity of a Statement to be Other = 0.19047619047619047
The Total Number of Words in Other = 80


DOC_ID
17    22
18    20
19    20
20    18
dtype: int64

## Calculating and Storing the Probability of Each Class

In [62]:
# Class priors in classification-id order (Preamble, Lender Defaulting,
# Governing Law, Indemnification, Other), saved to PROB_CLASS_PATH.
prob_list = [
    prob_preamble,
    prob_lender_defaulting,
    prob_governing_law,
    prob_indemnification,
    prob_other,
]

np.savetxt(PROB_CLASS_PATH, prob_list)

## Calculating the Count of Each Word in Preamble

In [63]:
# Occurrence total of every word over the Preamble statements, indexed by
# word id. 1 is added to each total as the Laplace correction.
preamble_count_each_word = (
    training_data_word_occurence[training_data_full_matrix['CATEGORY'] == Preamble_classification_id]
    .sum(axis='index')
    .add(1)
)
preamble_count_each_word

0      6
1      1
2      6
3      6
4      1
      ..
205    2
206    1
207    1
208    1
209    1
Length: 210, dtype: int64

## P( Word | Preamble ) - Probability that a Word Occurs given the Statement is Preamble

In [64]:
# P(word | Preamble). The vocabulary size is added to the denominator as the
# Laplace-correction counterpart of the +1 in every word count.
prob_word_preamble = preamble_count_each_word.div(preamble_word_count + training_vocab.shape[0])
prob_word_preamble

0      0.018405
1      0.003067
2      0.018405
3      0.018405
4      0.003067
         ...   
205    0.006135
206    0.003067
207    0.003067
208    0.003067
209    0.003067
Length: 210, dtype: float64

## Calculating the Count of Each Word in Lender Defaulting

In [65]:
# Occurrence total of every word over the Lender Defaulting statements,
# indexed by word id. 1 is added to each total as the Laplace correction.
lender_defaulting_count_each_word = (
    training_data_word_occurence[training_data_full_matrix['CATEGORY'] == Lender_defaulting_classification_id]
    .sum(axis='index')
    .add(1)
)
lender_defaulting_count_each_word

0      10
1       4
2       2
3       3
4       2
       ..
205     1
206     2
207     1
208     1
209     1
Length: 210, dtype: int64

## P( Word | Lender Defaulting ) - Probability that a Word Occurs given the Statement is Lender Defaulting

In [66]:
# P(word | Lender Defaulting). The vocabulary size is added to the
# denominator as the Laplace-correction counterpart of the +1 in every count.
prob_word_lender_defaulting = lender_defaulting_count_each_word.div(
    lender_defaulting_word_count + training_vocab.shape[0]
)
prob_word_lender_defaulting

0      0.035587
1      0.014235
2      0.007117
3      0.010676
4      0.007117
         ...   
205    0.003559
206    0.007117
207    0.003559
208    0.003559
209    0.003559
Length: 210, dtype: float64

## Calculating the Count of Each Word in Governing Law

In [67]:
# Occurrence total of every word over the Governing Law statements, indexed
# by word id. 1 is added to each total as the Laplace correction.
governing_law_count_each_word = (
    training_data_word_occurence[training_data_full_matrix['CATEGORY'] == Governing_law_classification_id]
    .sum(axis='index')
    .add(1)
)
governing_law_count_each_word

0      1
1      7
2      8
3      1
4      8
      ..
205    1
206    1
207    1
208    2
209    2
Length: 210, dtype: int64

## P( Word | Governing Law ) - Probability that a Word Occurs given the Statement is Governing Law

In [68]:
# P(word | Governing Law). The vocabulary size is added to the denominator
# as the Laplace-correction counterpart of the +1 in every word count.
prob_word_governing_law = governing_law_count_each_word.div(
    governing_law_word_count + training_vocab.shape[0]
)
prob_word_governing_law

0      0.003356
1      0.023490
2      0.026846
3      0.003356
4      0.026846
         ...   
205    0.003356
206    0.003356
207    0.003356
208    0.006711
209    0.006711
Length: 210, dtype: float64

## Calculating the Count of Each Word in Indemnification

In [69]:
# Occurrence total of every word over the Indemnification statements,
# indexed by word id. 1 is added to each total as the Laplace correction.
indemnification_count_each_word = (
    training_data_word_occurence[training_data_full_matrix['CATEGORY'] == Indemnification_classification_id]
    .sum(axis='index')
    .add(1)
)
indemnification_count_each_word

0      3
1      4
2      1
3      5
4      1
      ..
205    1
206    1
207    1
208    1
209    1
Length: 210, dtype: int64

## P( Word | Indemnification ) - Probability that a Word Occurs given the Statement is Indemnification

In [70]:
# P(word | Indemnification). The vocabulary size is added to the denominator
# as the Laplace-correction counterpart of the +1 in every word count.
prob_word_indemnification = indemnification_count_each_word.div(
    indemnification_word_count + training_vocab.shape[0]
)
prob_word_indemnification

0      0.010453
1      0.013937
2      0.003484
3      0.017422
4      0.003484
         ...   
205    0.003484
206    0.003484
207    0.003484
208    0.003484
209    0.003484
Length: 210, dtype: float64

## Calculating the Count of Each Word in Other

In [71]:
# Occurrence total of every word over the Other statements, indexed by word
# id. 1 is added to each total as the Laplace correction.
other_count_each_word = (
    training_data_word_occurence[training_data_full_matrix['CATEGORY'] == Other_classification_id]
    .sum(axis='index')
    .add(1)
)
other_count_each_word

0      1
1      3
2      1
3      2
4      2
      ..
205    1
206    1
207    2
208    1
209    1
Length: 210, dtype: int64

## P( Word | Other ) - Probability that a Word Occurs given the Statement is Other

In [72]:
# P(word | Other). The vocabulary size is added to the denominator as the
# Laplace-correction counterpart of the +1 in every word count.
prob_word_other = other_count_each_word.div(other_word_count + training_vocab.shape[0])
prob_word_other

0      0.003448
1      0.010345
2      0.003448
3      0.006897
4      0.006897
         ...   
205    0.003448
206    0.003448
207    0.006897
208    0.003448
209    0.003448
Length: 210, dtype: float64

## P(Word) - Probability that Word Occurs

In [73]:
# P(word): each word's share of all word occurrences in the training data.
prob_words = training_data_word_occurence.sum(axis='index').div(total_no_of_words)
prob_words

0      0.037037
1      0.032407
2      0.030093
3      0.027778
4      0.020833
         ...   
205    0.002315
206    0.002315
207    0.002315
208    0.002315
209    0.002315
Length: 210, dtype: float64

## Save the Trained Model

In [74]:
# Persist the trained model: each probability vector goes to its own file,
# written in the order listed.
for model_path, word_probabilities in (
    (PREAMBLE_WORD_PROB, prob_word_preamble),
    (LENDER_DEFAULTING_WORD_PROB, prob_word_lender_defaulting),
    (GOVERNING_LAW_WORD_PROB, prob_word_governing_law),
    (INDEMNIFICATION_WORD_PROB, prob_word_indemnification),
    (OTHER_WORD_PROB, prob_word_other),
    (WORD_PROB, prob_words),
):
    np.savetxt(model_path, word_probabilities)

## Test Data Preparation

In [75]:
def make_testing_full_matrix(sparse_matrix, nr_words, doc_idx=0, word_idx=1, freq_idx=2):
    """Expand an unlabelled sparse matrix into a dense document-by-word table.

    Args:
        sparse_matrix: Integer array with one row per (document, word) pair.
            The columns named by the ``*_idx`` arguments hold the document
            id, word id and occurrence count. A 1-D array is treated as a
            single row; an empty array yields an empty table.
        nr_words: Vocabulary size. The result gets one column for every word
            id in ``range(nr_words)``.
        doc_idx: Column of ``sparse_matrix`` holding the document id.
        word_idx: Column of ``sparse_matrix`` holding the word id.
        freq_idx: Column of ``sparse_matrix`` holding the occurrence count.

    Returns:
        A DataFrame indexed by ``DOC_ID`` (the sorted unique document ids)
        with one column per word id. Words that never occur in a document
        have a count of 0.
    """
    # np.loadtxt yields a 1-D array for a one-line file; normalise to 2-D so
    # the column slicing below works for that case too.
    rows = np.atleast_2d(sparse_matrix)
    has_rows = rows.size > 0

    doc_ids = np.unique(rows[:, doc_idx]) if has_rows else np.array([], dtype=int)
    column_names = ['DOC_ID'] + list(range(nr_words))

    # Build the zero-filled table directly instead of creating an all-NaN
    # frame and relying on fillna to downcast it back to integers.
    full_matrix = pd.DataFrame(0, index=doc_ids, columns=column_names)
    full_matrix['DOC_ID'] = doc_ids

    if has_rows:
        for row in rows:
            full_matrix.at[row[doc_idx], row[word_idx]] = row[freq_idx]

    full_matrix.set_index('DOC_ID', inplace=True)
    return full_matrix
    

In [76]:
# Dense testing table: one row per test statement, one column per word id.
testing_data_full_matrix = make_testing_full_matrix(
    sparse_matrix=testing_data_sparse_matrix,
    nr_words=training_vocab.shape[0],
)

In [77]:
# Display the dense testing matrix built in the previous cell.
testing_data_full_matrix

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,200,201,202,203,204,205,206,207,208,209
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3,0,2,4,1,1,2,0,0,0,...,0,0,0,3,0,0,0,0,0,0
1,0,0,0,0,3,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,0,0,2,0,0,2,0,0,1,...,1,0,0,0,0,0,0,0,0,0
4,1,0,0,2,0,1,2,0,0,2,...,0,0,0,0,0,0,0,0,0,0
5,0,1,1,0,1,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,1
6,0,1,0,2,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1,0,4,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,4,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [78]:
# Write the dense testing matrix to disk (values only).
np.savetxt(fname=TESTING_DATA_FULL_MATRIX, X=testing_data_full_matrix)

## Validation Data Preparation

In [79]:
# Dense validation table: one row per validation statement, one column per
# word id. The validation sparse matrix has the same three-column layout as
# the testing one, so the testing builder is reused.
validation_data_full_matrix = make_testing_full_matrix(
    sparse_matrix=validation_data_sparse_matrix,
    nr_words=training_vocab.shape[0],
)
validation_data_full_matrix

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,200,201,202,203,204,205,206,207,208,209
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,0,1,2,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0,1,0,0,0
2,0,1,0,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,0
3,0,1,1,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,1,0,1,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0


In [80]:
# Write the dense validation matrix to disk (values only).
np.savetxt(fname=VALIDATION_DATA_FULL_MATRIX, X=validation_data_full_matrix)