## Welcome to the Training_Model notebook

Goals of this notebook:
* Train a Model to classify suspicious firms as such, given their data

###### The Auditor Office of India, officially known as the Comptroller and Auditor General of India (CAG), is a Central Government Agency established by the Indian Constitution with the express purpose of ensuring public funds are being used properly (making sure the gov't isn't being scammed). It does this by auditing Central and State Government accounts.

## Import necessary libraries

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import numpy as np

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression


## Load the Dataset

The dataset is the 'Audit Data' dataset from the UCI Machine Learning Repository

In [9]:
# Both CSVs are read from paths relative to the notebook's working directory.
AUDIT_RISK_CSV = '../audit_data/audit_risk.csv'
TRIAL_CSV = '../audit_data/trial.csv'

ad = pd.read_csv(AUDIT_RISK_CSV)  # 'ad' for 'audit data'
td = pd.read_csv(TRIAL_CSV)  # 'td' for 'trial data'

In [10]:
# Basic Visualization

# Print the column labels of both frames so their schemas can be compared
print(ad.columns)
print(td.columns)

# Preview 10 random rows of 'ad'. The random_state is fixed so that the same rows
# are shown on every Restart & Run All (an unseeded sample changes on each run).
ad.sample(10, random_state=0)


Index(['Sector_score', 'LOCATION_ID', 'PARA_A', 'Score_A', 'Risk_A', 'PARA_B',
       'Score_B', 'Risk_B', 'TOTAL', 'numbers', 'Score_B.1', 'Risk_C',
       'Money_Value', 'Score_MV', 'Risk_D', 'District_Loss', 'PROB', 'RiSk_E',
       'History', 'Prob', 'Risk_F', 'Score', 'Inherent_Risk', 'CONTROL_RISK',
       'Detection_Risk', 'Audit_Risk', 'Risk'],
      dtype='object')
Index(['Sector_score', 'LOCATION_ID', 'PARA_A', 'SCORE_A', 'PARA_B', 'SCORE_B',
       'TOTAL', 'numbers', 'Marks', 'Money_Value', 'MONEY_Marks', 'District',
       'Loss', 'LOSS_SCORE', 'History', 'History_score', 'Score', 'Risk'],
      dtype='object')


Unnamed: 0,Sector_score,LOCATION_ID,PARA_A,Score_A,Risk_A,PARA_B,Score_B,Risk_B,TOTAL,numbers,...,RiSk_E,History,Prob,Risk_F,Score,Inherent_Risk,CONTROL_RISK,Detection_Risk,Audit_Risk,Risk
578,55.57,6,2.24,0.6,1.344,7.26,0.6,4.356,9.5,5.0,...,0.4,0,0.2,0.0,3.6,7.148,0.4,0.5,1.4296,1
219,2.72,22,3.22,0.6,1.932,55.64,0.6,33.384,58.86,5.0,...,0.8,0,0.2,0.0,4.2,85.596,0.8,0.5,34.2384,1
688,55.57,11,0.74,0.2,0.148,0.0,0.2,0.0,0.74,5.0,...,0.4,0,0.2,0.0,2.0,1.69,0.4,0.5,0.338,0
773,55.57,14,0.24,0.2,0.048,0.04,0.2,0.008,0.28,5.0,...,0.4,0,0.2,0.0,2.0,1.456,0.4,0.5,0.2912,0
399,1.85,8,0.0,0.2,0.0,0.0,0.2,0.0,0.0,5.0,...,0.8,0,0.2,0.0,2.6,8.118,0.8,0.5,3.2472,1
99,3.89,16,6.47,0.6,3.882,12.18,0.6,7.308,18.65,5.0,...,1.2,0,0.2,0.0,4.4,28.552,1.2,0.5,17.1312,1
162,3.41,13,5.61,0.6,3.366,1.56,0.2,0.312,7.17,5.0,...,0.4,0,0.2,0.0,2.4,5.884,0.4,0.5,1.1768,1
586,55.57,4,0.26,0.2,0.052,0.21,0.2,0.042,0.47,5.0,...,0.4,0,0.2,0.0,2.0,1.504,0.4,0.5,0.3008,0
273,2.37,11,1.63,0.4,0.652,5.86,0.6,3.516,7.49,5.0,...,0.4,0,0.2,0.0,3.4,6.306,0.4,0.5,1.2612,1
756,55.57,25,0.0,0.2,0.0,0.0,0.2,0.0,0.0,5.0,...,0.4,0,0.2,0.0,2.0,1.4,0.4,0.5,0.28,0


## Prepare 'ad' and 'td' for PCA

Removing incompatible values

In [None]:
# Clean 'ad' so it can be fed to PCA / logistic regression:
#   1. remove the rows whose LOCATION_ID is one of the known text values
#   2. show, then remove, any row that contains a NaN

from IPython.display import display  # force Jupyter's rich table rendering from the middle of a cell

# INVESTIGATING 'ad' FOR NaN values / READYING IT FOR PCA
print('Investigating \'ad\' for NaN values and readying it for pca:\n')

# Locate the offending rows with a boolean mask instead of hard-coding their index
# labels, so the cell keeps working if the CSV is re-exported or re-ordered.
text_location_ids = ['LOHARU', 'NUH', 'SAFIDON']
has_text_location = ad.LOCATION_ID.isin(text_location_ids)

ad_stringless = ad.loc[~has_text_location].copy()
print(f'Shape of the original ad is {ad.shape}')
print(f'Shape after removing the {has_text_location.sum()} rows with Loharu Nuh and Safidon is {ad_stringless.shape}')

# Display the rows with at least one NaN, showing every column. option_context
# restores the previous display setting on exit, even if display() raises.
with pd.option_context('display.max_columns', None):
    display(ad_stringless[ad_stringless.isna().any(axis=1)])

ad_stringless = ad_stringless.dropna()
print(f'Shape after dropping na is {ad_stringless.shape}')

Investigating 'ad' for NaN values and readying it for pca:

Shape of the original ad is (776, 27)
Shape after removing the 3 rows with Loharu Nuh and Safidon is (773, 27)


Unnamed: 0,Sector_score,LOCATION_ID,PARA_A,Score_A,Risk_A,PARA_B,Score_B,Risk_B,TOTAL,numbers,Score_B.1,Risk_C,Money_Value,Score_MV,Risk_D,District_Loss,PROB,RiSk_E,History,Prob,Risk_F,Score,Inherent_Risk,CONTROL_RISK,Detection_Risk,Audit_Risk,Risk
642,55.57,4,0.23,0.2,0.046,0.0,0.2,0.0,0.23,5.0,0.2,1.0,,0.2,0.0,2,0.2,0.4,0,0.2,0.0,2.0,1.446,0.4,0.5,0.2892,0


Shape after dropping na is (772, 27)


"\n# PCA SECTION - USING PCA TO FIND HOW IMPORTANT EACH COLUMN IS\nprint('Pca section - using pca to find the relative importance(explained variance) of each column:\n')\n\n# PCA and Pandas are already imported\n\npca = PCA()\n\npca.fit(ad_stringless)\nad_explained_variance_ratio = pca.explained_variance_ratio_ # must be ratio, otherwise it's just absolute values that don't really mean much to us\n\nprint('Audit Data (ad) columns and explained variance:\n')\nprint(ad.columns + '\n')\nnp.set_printoptions(precision=4, suppress=True) \nprint(ad_explained_variance_ratio)\n"

In [None]:
# Clean 'td' so it can be fed to PCA / logistic regression:
#   1. remove the rows whose LOCATION_ID is one of the known text values
#   2. show, then remove, any row that contains a NaN

from IPython.display import display  # force Jupyter's rich table rendering from the middle of a cell

# INVESTIGATING 'td' FOR NaN values / READYING IT FOR PCA
print('Investigating \'td\' for NaN values and readying it for pca:\n')

# Locate the offending rows with a boolean mask instead of hard-coding their index
# labels, so the cell keeps working if the CSV is re-exported or re-ordered.
text_location_ids = ['LOHARU', 'NUH', 'SAFIDON']
has_text_location = td.LOCATION_ID.isin(text_location_ids)

td_stringless = td.loc[~has_text_location].copy()
print(f'Shape of the original td is {td.shape}')
print(f'Shape after removing the {has_text_location.sum()} rows with Loharu Nuh and Safidon is {td_stringless.shape}')

# Display the rows with at least one NaN, showing every column. option_context
# restores the previous display setting on exit, even if display() raises.
with pd.option_context('display.max_columns', None):
    display(td_stringless[td_stringless.isna().any(axis=1)])

td_stringless = td_stringless.dropna()
print(f'Shape after dropping na is {td_stringless.shape}')

Investigating 'td' for NaN values and readying it for pca:

Shape of the original td is (776, 18)
Shape after removing the 3 rows with Loharu Nuh and Safidon is (773, 18)


Unnamed: 0,Sector_score,LOCATION_ID,PARA_A,SCORE_A,PARA_B,SCORE_B,TOTAL,numbers,Marks,Money_Value,MONEY_Marks,District,Loss,LOSS_SCORE,History,History_score,Score,Risk
642,55.57,4,0.23,2,0.0,2,0.23,5.0,2,,2,2,0,2,0,2,2.0,0


Shape after dropping na is (772, 18)


"\n# PCA SECTION - USING PCA TO FIND HOW IMPORTANT EACH COLUMN IS\nprint('Pca section - using pca to find the relative importance(explained variance) of each column:\n')\n\n# PCA and Pandas are already imported\n\npca = PCA()\n\npca.fit(td_stringless)\ntd_explained_variance_ratio = pca.explained_variance_ratio_ # must be ratio, otherwise it's just absolute values that don't really mean much to us\n\nprint('Trial Data (td) columns and explained variance:\n')\nprint(td.columns + '\n')\nnp.set_printoptions(precision=4, suppress=True) \nprint(td_explained_variance_ratio)\n"

## Notes about PCA preparation above, done to ad and td

1. In both 'ad' and 'td', there are 3 rows where LOCATION_ID is a string (LOHARU, NUH, SAFIDON) rather than a number, which makes them incompatible with PCA. Hence, those rows were removed.
2. In both 'ad' and 'td', row 642 has a NaN in 'Money_Value' (reason unknown), so that row was removed as well.

## Train logistic regression models

On ad and td both, separately (the columns in td are not a proper subset of ad's columns, so both must be modeled separately)

In [13]:
# Logistic regression on 'ad': first on all standardized features, then on PCA components

from sklearn.metrics import f1_score

# Features are every column except the 'Risk' label; the split is seeded for repeatability.
# NOTE(review): columns such as 'Audit_Risk' and 'Inherent_Risk' stay in the feature set;
# they may be closely tied to the 'Risk' label (possible target leakage) - confirm before
# relying on the scores below.
ad_input_data = ad_stringless.drop(columns='Risk')
X_train, X_test, Y_train, Y_test = train_test_split(
    ad_input_data, ad_stringless.Risk, random_state=0
)

# Standardize with statistics learned from the training split only, then apply to both splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# --- Baseline: logistic regression without PCA ---
lg_ad = LogisticRegression().fit(X_train, Y_train)
Y_pred = lg_ad.predict(X_test)
f1 = f1_score(Y_test, Y_pred)

print('Pre-PCA Metrics')
print(f'Classification accuracy: {lg_ad.score(X_test, Y_test)}')
print('F1 Score:', f1)


# --- Logistic regression on PCA components (n_components=0.9 -> keep 90% of the variance) ---
pca = PCA(n_components=0.9, random_state=0).fit(X_train)
X_train_PCA = pca.transform(X_train)
X_test_PCA = pca.transform(X_test)

pca_lg_ad = LogisticRegression().fit(X_train_PCA, Y_train)
Y_pred = pca_lg_ad.predict(X_test_PCA)
f1 = f1_score(Y_test, Y_pred)

print('\nPost-PCA Metrics')
print(f'Number of dimensions before PCA: {len(ad_input_data.columns)}')
print(f'Number of dimensions after PCA: {pca.n_components_}')
print(f'Classification accuracy after PCA: {pca_lg_ad.score(X_test_PCA, Y_test)}')
print('F1 Score after PCA:', f1)

Pre-PCA Metrics
Classification accuracy: 0.9844559585492227
F1 Score: 0.9795918367346939

Post-PCA Metrics
Number of dimensions before PCA: 26
Number of dimensions after PCA: 9
Classification accuracy after PCA: 0.9896373056994818
F1 Score after PCA: 0.9864864864864865


In [14]:
# Logistic regression on 'td' using scikit-learn pipelines (baseline vs. PCA)

from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline  # chains scaling / PCA / classifier into a single estimator

# Features are every column except the 'Risk' label; the split is seeded for repeatability
td_input_data = td_stringless.drop(columns='Risk')
X_train, X_test, Y_train, Y_test = train_test_split(
    td_input_data, td_stringless.Risk, random_state=0
)

# --- Baseline pipeline: standardize, then logistic regression ---
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression()),
])
pipe.fit(X_train, Y_train)

Y_pred = pipe.predict(X_test)
f1 = f1_score(Y_test, Y_pred)

print('Pre-PCA Metrics')
print(f'Classification accuracy: {pipe.score(X_test, Y_test)}')
print('F1 Score:', f1)


# --- PCA pipeline: standardize, keep 90% of the variance, then logistic regression ---
pca_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.9, random_state=0)),
    ('logistic', LogisticRegression()),
])
pca_pipe.fit(X_train, Y_train)

Y_pred = pca_pipe.predict(X_test)
f1 = f1_score(Y_test, Y_pred)

print('\nPost-PCA Metrics')
print(f'Number of dimensions before PCA: {len(td_input_data.columns)}')
print(f"Number of dimensions after PCA: {pca_pipe.named_steps['pca'].n_components_}")
print(f'Classification accuracy after PCA: {pca_pipe.score(X_test, Y_test)}')
print('F1 Score after PCA:', f1)


Pre-PCA Metrics
Classification accuracy: 0.9896373056994818
F1 Score: 0.9915966386554622

Post-PCA Metrics
Number of dimensions before PCA: 17
Number of dimensions after PCA: 9
Classification accuracy after PCA: 0.9844559585492227
F1 Score after PCA: 0.9873417721518988


## Final Results

Results of Logistic Regression Model Training:
1. PCA reduces features to 9 components in both
2. For 'ad', PCA improves F1 score and classification accuracy
3. For 'td', PCA does not improve F1 score and classification accuracy
4. The effect of PCA with 90% of the variance retained seems to be marginal either way (accuracy goes up slightly for 'ad' and down slightly for 'td')
5. Removing standardization improves Pre-PCA metrics slightly, but significantly hurts Post-PCA metrics (for both ad and td)

Steps were:
1. Get the data PCA-ready (done in the previous step)
2. Standardize the data
3. Apply PCA (or not) to the data, retaining 90% of the variance
4. Train a logistic regression model
5. Metrics
* Accuracy and F1 score
* How many dimensions PCA removed, if PCA was done