In [1]:
import pandas as pd  #: Importing pandas library for data manipulation and analysis
import numpy as np  #: Importing numpy library for numerical operations
from sklearn import model_selection  #: Importing model_selection module from scikit-learn for model selection
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score  #: Importing evaluation metrics from scikit-learn
from sklearn.ensemble import RandomForestClassifier  #: Importing RandomForestClassifier from scikit-learn for random forest algorithm
from sklearn.model_selection import RandomizedSearchCV  #: Importing RandomizedSearchCV for hyperparameter tuning
from sklearn.preprocessing import StandardScaler  #: Importing StandardScaler for data scaling
import matplotlib.pyplot as plt  #: Importing matplotlib.pyplot for data visualization
import seaborn as sns  #: Importing seaborn for advanced visualization
from scipy import stats  #: Importing stats module from scipy for statistical operations
from scipy.stats import randint  #: Importing randint for generating random integers
from sklearn.ensemble import ExtraTreesClassifier  #: Importing ExtraTreesClassifier from scikit-learn for extra trees algorithm
from sklearn.ensemble import AdaBoostClassifier  #: Importing AdaBoostClassifier from scikit-learn for AdaBoost algorithm
from sklearn import svm  #: Importing svm module from scikit-learn for support vector machines
from sklearn.svm import SVC  #: Importing SVC for support vector classification
from sklearn.neural_network import MLPClassifier  #: Importing MLPClassifier from scikit-learn for multi-layer perceptron algorithm
from sklearn.naive_bayes import GaussianNB  #: Importing GaussianNB from scikit-learn for Naive Bayes algorithm
from sklearn.model_selection import train_test_split  #: Importing train_test_split for splitting the dataset
from sklearn.neighbors import KNeighborsClassifier  #: Importing KNeighborsClassifier from scikit-learn for k-nearest neighbors algorithm
from imblearn import under_sampling, over_sampling  #: Importing under_sampling and over_sampling modules from imblearn for handling imbalanced data
from imblearn.over_sampling import SMOTE  #: Importing SMOTE for oversampling the minority class
from sklearn.linear_model import LogisticRegression



In [2]:
# Downloading the data
DataIMCO_N = pd.read_csv("IMCO_N.csv", low_memory=False)  #: Loading data from "IMCO_N.csv" into a pandas DataFrame
DataIMCO_PR = pd.read_csv("IMCO_PreRed.csv", low_memory=False)  #: Loading data from "IMCO_PreRed.csv" into a pandas DataFrame


In [3]:
# Downloading the data
DataIMCO_N = pd.read_csv("IMCO_N.csv", low_memory=False)  #: Loading data from "IMCO_N.csv" into a pandas DataFrame
DataIMCO_PR = pd.read_csv("IMCO_PreRed.csv", low_memory=False)  #: Loading data from "IMCO_PreRed.csv" into a pandas DataFrame


(667316, 47)
(667316, 61)


In [None]:
# Remove the column 'Unnamed: 0' from DataIMCO_N
DataIMCO_N = DataIMCO_N.drop(['Unnamed: 0'], axis=1)

# Remove the column 'Unnamed: 0' from DataIMCO_PR
DataIMCO_PR = DataIMCO_PR.drop(['Unnamed: 0'], axis=1)


In [None]:
# Prepare the data for the non-profit dataset
# Drop the 'Status' column from DataIMCO_N to create the feature matrix X_n
X_n = DataIMCO_N.drop('Status', axis=1)

# Extract the 'Status' column from DataIMCO_N as the target variable y_n
y_n = DataIMCO_N['Status']

# Split the non-profit dataset into training and testing sets
X_trainn, X_testn, y_trainn, y_testn = train_test_split(X_n, y_n, test_size=0.3)

# Apply SMOTE (Synthetic Minority Over-sampling Technique) to balance the non-profit dataset
smote = SMOTE(random_state=17)
X_trainn, y_trainn = smote.fit_resample(X_trainn, y_trainn)



In [4]:
# Prepare the data for the for-profit dataset
# Drop the 'Status' column from DataIMCO_PR to create the feature matrix X_r
X_r = DataIMCO_PR.drop('Status', axis=1)

# Extract the 'Status' column from DataIMCO_PR as the target variable y_r
y_r = DataIMCO_PR['Status']

# Split the for-profit dataset into training and testing sets
X_trainp, X_testp, y_trainp, y_testp = train_test_split(X_r, y_r, test_size=0.3)

# Apply SMOTE (Synthetic Minority Over-sampling Technique) to balance the for-profit dataset
smote = SMOTE(random_state=17)
X_trainp, y_trainp = smote.fit_resample(X_trainp, y_trainp)

In [None]:
# Print shape and distribution of target variables
print(DataRPS_N.shape) # Print the shape of DataRPS_N dataframe
print(DataRPS_RedFlags.shape) # Print the shape of DataRPS_RedFlags dataframe
print(y_trainp.shape) # Print the shape of y_trainp
print(y_testp.shape) # Print the shape of y_testp
print(y_trainn.shape) # Print the shape of y_trainn
print(y_testn.shape) # Print the shape of y_testn
print(y_trainn.value_counts()) # Print the value counts of y_trainn
print(y_testn.value_counts()) # Print the value counts of y_testn
print(y_trainp.value_counts()) # Print the value counts of y_trainp
print(y_testp.value_counts()) # Print the value counts of y_testp


## Random Forest

### IMCO

In [26]:
# Define the parameters for the Random Forest classifier
params = {
    'n_estimators': 4000,
    'min_samples_split': 10,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'max_depth': 80,
    'bootstrap': False
}

# Create an instance of the Random Forest classifier with the specified parameters
IMCO_rf_norm = RandomForestClassifier(
    n_estimators=params['n_estimators'],
    min_samples_split=params['min_samples_split'],
    min_samples_leaf=params['min_samples_leaf'],
    max_features=params['max_features'],
    max_depth=params['max_depth'],
    bootstrap=params['bootstrap']
)

# Fit the Random Forest classifier to the training data
IMCO_rf_norm.fit(X_trainn, y_trainn)

# Make predictions on the test data using the trained classifier
IMCOy_pred_rand_norm = IMCO_rf_norm.predict(X_testn)

# Print the results
print('RANDOM FOREST NORMAL DATA')
print('TEST PERFORMANCE')
print('------------------------------------------------------------')

# Print the classification report
print(classification_report(y_testn, IMCOy_pred_rand_norm))

# Print the confusion matrix
print('CONFUSION MATRIX')
print(confusion_matrix(y_testn, IMCOy_pred_rand_norm))
print('------------------------------------------------------------')

# Calculate and print the ROC AUC score
print('ROC_AUC_SCORE')
print('------------------------------------------------------------')
print(roc_auc_score(y_testn, IMCOy_pred_rand_norm))



RANDOM FOREST NORMAL DATA
TEST PERFORMANCE
------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    199882
           1       0.13      0.09      0.11       313

    accuracy                           1.00    200195
   macro avg       0.56      0.54      0.55    200195
weighted avg       1.00      1.00      1.00    200195

CONFUSION MATRIX
[[199691    191]
 [   285     28]]
------------------------------------------------------------
ROC_AUC_SCORE
------------------------------------------------------------
0.5442506526134765


In [None]:
# Calculate feature importances using the trained Random Forest classifier
feat_importances_norm = pd.Series(IMCO_rf_norm.feature_importances_, index=X_trainn.columns)

# Select the top 10 most important features and plot them in a horizontal bar chart
feat_importances_norm.nlargest(10).plot(kind='barh')

In [27]:
# Define the Random Forest classifier with specified hyperparameters
# {'n_estimators': 2500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': False}
IMCO_rf_red = RandomForestClassifier(n_estimators=4000,
                                     min_samples_split=5,
                                     min_samples_leaf=2,
                                     max_features='sqrt',
                                     max_depth=50,
                                     bootstrap=False)

# Train the Random Forest classifier using the training data
IMCO_rf_red.fit(X_trainp, y_trainp)

# Make predictions on the test data using the trained classifier
IMCOy_pred_rand_red = IMCO_rf_red.predict(X_testp)

# Print the performance metrics for the Random Forest classifier on the red flag data
print('RANDOM FOREST RED FLAG DATA')
print('TEST PERFORMANCE')
print('------------------------------------------------------------')

# Print the classification report, including precision, recall, F1-score, and support
print(classification_report(y_testp, IMCOy_pred_rand_red))

# Print the confusion matrix
print('CONFUSION MATRIX')
print(confusion_matrix(y_testp, IMCOy_pred_rand_red))
print('------------------------------------------------------------')

# Print the ROC AUC score
print('ROC_AUC_SCORE')
print('------------------------------------------------------------')
print(roc_auc_score(y_testp, IMCOy_pred_rand_red))

RANDOM FOREST RED FLAG DATA
TEST PERFORMANCE
------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    199877
           1       1.00      0.70      0.82       318

    accuracy                           1.00    200195
   macro avg       1.00      0.85      0.91    200195
weighted avg       1.00      1.00      1.00    200195

CONFUSION MATRIX
[[199877      0]
 [    96    222]]
------------------------------------------------------------
ROC_AUC_SCORE
------------------------------------------------------------
0.8490566037735849


In [None]:
# Calculate feature importances using the trained Random Forest classifier for red flag data
feat_importances_red = pd.Series(IMCO_rf_red.feature_importances_, index=X_trainp.columns)

# Select the top 10 most important features
top_10_features_red = feat_importances_red.nlargest(10)

# Plot the top 10 features in a horizontal bar plot
top_10_features_red.plot(kind='barh')


# XGBOOSTCLASSIFIER

In [37]:
import sys

!{sys.executable} -m pip install xgboost
from xgboost import XGBClassifier



In [38]:
# Create an XGBoost classifier with specified hyperparameters for normal data
IMCO_XGBC_norm = XGBClassifier(subsample=0.9,
                               n_estimators=300,
                               max_depth=11,
                               learning_rate=0.1)

# Fit the XGBoost classifier on the training data
IMCO_XGBC_norm.fit(X_trainn, y_trainn)

# Make predictions on the test data using the trained XGBoost classifier
IMCO_pred_xgbc_norm = IMCO_XGBC_norm.predict(X_testn)

# Print the performance metrics for the XGBoost classifier on the test data
print('XGBOOST CLASSIFIER NORMAL DATA')
print('TEST PERFORMANCE')
print('------------------------------------------------------------')
print(classification_report(y_testn, IMCO_pred_xgbc_norm))
print('CONFUSION MATRIX')
print(confusion_matrix(y_testn, IMCO_pred_xgbc_norm))
print('------------------------------------------------------------')
print('ROC_AUC_SCORE')
print('------------------------------------------------------------')
print(roc_auc_score(y_testn, IMCO_pred_xgbc_norm))

XGBOOST CLASSIFIER NORMAL DATA
TEST PERFORMANCE
------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    199882
           1       0.11      0.05      0.07       313

    accuracy                           1.00    200195
   macro avg       0.55      0.52      0.53    200195
weighted avg       1.00      1.00      1.00    200195

CONFUSION MATRIX
[[199762    120]
 [   298     15]]
------------------------------------------------------------
ROC_AUC_SCORE
------------------------------------------------------------
0.5236614842373615


In [39]:
# Create an XGBoost classifier with specified hyperparameters for preprocessed and red data
IMCO_XGBC_red = XGBClassifier(subsample=0.8,
                              n_estimators=300,
                              max_depth=11,
                              learning_rate=0.1)

# Fit the XGBoost classifier on the preprocessed and red data
IMCO_XGBC_red.fit(X_trainp, y_trainp)

# Make predictions on the test data using the trained XGBoost classifier
IMCO_pred_xgbc_red = IMCO_XGBC_red.predict(X_testp)

# Print the performance metrics for the XGBoost classifier on the test data
print('XGBOOST CLASSIFIER PREPROCESSED & RED DATA')
print('TEST PERFORMANCE')
print('------------------------------------------------------------')
print(classification_report(y_testp, IMCO_pred_xgbc_red))
print('CONFUSION MATRIX')
print(confusion_matrix(y_testp, IMCO_pred_xgbc_red))
print('------------------------------------------------------------')
print('ROC_AUC_SCORE')
print('------------------------------------------------------------')
print(roc_auc_score(y_testp, IMCO_pred_xgbc_red))


XGBOOST CLASSIFIER PREPROCESSED & RED DATA
TEST PERFORMANCE
------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    199877
           1       1.00      0.86      0.92       318

    accuracy                           1.00    200195
   macro avg       1.00      0.93      0.96    200195
weighted avg       1.00      1.00      1.00    200195

CONFUSION MATRIX
[[199876      1]
 [    44    274]]
------------------------------------------------------------
ROC_AUC_SCORE
------------------------------------------------------------
0.9308151085244469


#  EXTRATREESCLASSIFIER

In [40]:
# {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None}
IMCO_ETC_norm = ExtraTreesClassifier(n_estimators=500,
                                     min_samples_split=2,
                                     min_samples_leaf=1,
                                     max_depth=None)

IMCO_ETC_norm.fit(X_trainn, y_trainn)

IMCO_pred_etc_norm = IMCO_ETC_norm.predict(X_testn)

print('EXTRATREESCLASSIFIER NORMAL DATA')
print('TEST PERFORMANCE')
print('------------------------------------------------------------')
print(classification_report(y_testn, IMCO_pred_etc_norm))
print('CONFUSION MATRIX')
print(confusion_matrix(y_testn, IMCO_pred_etc_norm))
print('------------------------------------------------------------')
print('ROC_AUC_SCORE')
print('------------------------------------------------------------')
print(roc_auc_score(y_testn, IMCO_pred_etc_norm))

EXTRATREESCLASSIFIER NORMAL DATA
TEST PERFORMANCE
------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    199882
           1       0.08      0.09      0.09       313

    accuracy                           1.00    200195
   macro avg       0.54      0.55      0.54    200195
weighted avg       1.00      1.00      1.00    200195

CONFUSION MATRIX
[[199545    337]
 [   284     29]]
------------------------------------------------------------
ROC_AUC_SCORE
------------------------------------------------------------
0.5454828812258018


In [41]:
# Create an ExtraTreesClassifier with specified hyperparameters for red flags data
IMCO_ETC_red = ExtraTreesClassifier(n_estimators=500,
                                    min_samples_split=10,
                                    min_samples_leaf=1,
                                    max_depth=None)

# Fit the ExtraTreesClassifier on the red flags data
IMCO_ETC_red.fit(X_trainp, y_trainp)

# Make predictions on the test data using the trained ExtraTreesClassifier
IMCO_pred_etc_red = IMCO_ETC_red.predict(X_testp)

# Print the performance metrics for the ExtraTreesClassifier on the test data
print('EXTRATREESCLASSIFIER REDFLAGS DATA')
print('TEST PERFORMANCE')
print('------------------------------------------------------------')
print(classification_report(y_testp, IMCO_pred_etc_red))
print('CONFUSION MATRIX')
print(confusion_matrix(y_testp, IMCO_pred_etc_red))
print('------------------------------------------------------------')
print('ROC_AUC_SCORE')
print('------------------------------------------------------------')
print(roc_auc_score(y_testp, IMCO_pred_etc_red))


EXTRATREESCLASSIFIER REDFLAGS DATA
TEST PERFORMANCE
------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    199877
           1       1.00      0.64      0.78       318

    accuracy                           1.00    200195
   macro avg       1.00      0.82      0.89    200195
weighted avg       1.00      1.00      1.00    200195

CONFUSION MATRIX
[[199877      0]
 [   116    202]]
------------------------------------------------------------
ROC_AUC_SCORE
------------------------------------------------------------
0.8176100628930818


# MULTILAYER PERCEPTRON

In [42]:
# Create an MLPClassifier with specified hyperparameters for normal data
IMCO_MLP_norm = MLPClassifier(solver='adam',
                              max_iter=300,
                              learning_rate='adaptive',
                              hidden_layer_sizes=(100,),
                              alpha=0.001,
                              activation='relu')

# Fit the MLPClassifier on the normal data
IMCO_MLP_norm.fit(X_trainn, y_trainn)

# Make predictions on the test data using the trained MLPClassifier
IMCO_pred_mlpc_norm = IMCO_MLP_norm.predict(X_testn)

# Print the performance metrics for the MLPClassifier on the test data
print('MLP NORMAL DATA')
print('TEST PERFORMANCE')
print('------------------------------------------------------------')
print(classification_report(y_testn, IMCO_pred_mlpc_norm))
print('CONFUSION MATRIX')
print(confusion_matrix(y_testn, IMCO_pred_mlpc_norm))
print('------------------------------------------------------------')
print('ROC_AUC_SCORE')
print('------------------------------------------------------------')
print(roc_auc_score(y_testn, IMCO_pred_mlpc_norm))


MLP NORMAL DATA
TEST PERFORMANCE
------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.79      0.88    199882
           1       0.00      0.61      0.01       313

    accuracy                           0.79    200195
   macro avg       0.50      0.70      0.44    200195
weighted avg       1.00      0.79      0.88    200195

CONFUSION MATRIX
[[157092  42790]
 [   122    191]]
------------------------------------------------------------
ROC_AUC_SCORE
------------------------------------------------------------
0.6980736685762811


In [43]:
# Create an MLPClassifier with specified hyperparameters for red flags data
IMCO_MLP_red = MLPClassifier(solver='adam',
                             max_iter=300,
                             learning_rate='adaptive',
                             hidden_layer_sizes=(100, 50),
                             alpha=0.001,
                             activation='logistic')

# Fit the MLPClassifier on the red flags data
IMCO_MLP_red.fit(X_trainp, y_trainp)

# Make predictions on the test data using the trained MLPClassifier
IMCO_pred_mlpc_red = IMCO_MLP_red.predict(X_testp)

# Print the performance metrics for the MLPClassifier on the test data
print('MLP REDFLAGS DATA')
print('TEST PERFORMANCE')
print('------------------------------------------------------------')
print(classification_report(y_testp, IMCO_pred_mlpc_red))
print('CONFUSION MATRIX')
print(confusion_matrix(y_testp, IMCO_pred_mlpc_red))
print('------------------------------------------------------------')
print('ROC_AUC_SCORE')
print('------------------------------------------------------------')
print(roc_auc_score(y_testp, IMCO_pred_mlpc_red))


MLP REDFLAGS DATA
TEST PERFORMANCE
------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.27      0.43    199877
           1       0.00      0.87      0.00       318

    accuracy                           0.27    200195
   macro avg       0.50      0.57      0.22    200195
weighted avg       1.00      0.27      0.43    200195

CONFUSION MATRIX
[[ 54617 145260]
 [    40    278]]
------------------------------------------------------------
ROC_AUC_SCORE
------------------------------------------------------------
0.5737334435520612


# NAIVE BAYES

In [44]:
# Create a GaussianNB classifier with specified hyperparameters for normal data
IMCO_NBclass_norm = GaussianNB(var_smoothing=8.111308307896872e-05)

# Fit the GaussianNB classifier on the normal data
IMCO_NBclass_norm.fit(X_trainn, y_trainn)

# Make predictions on the test data using the trained GaussianNB classifier
IMCO_pred_nbclass_norm = IMCO_NBclass_norm.predict(X_testn)

# Print the performance metrics for the GaussianNB classifier on the test data
print('NAIVE BAYES CLASSIFIER NORMAL DATA')
print('TEST PERFORMANCE')
print('------------------------------------------------------------')
print(classification_report(y_testn, IMCO_pred_nbclass_norm))
print('CONFUSION MATRIX')
print(confusion_matrix(y_testn, IMCO_pred_nbclass_norm))
print('------------------------------------------------------------')
print('ROC_AUC_SCORE')
print('------------------------------------------------------------')
print(roc_auc_score(y_testn, IMCO_pred_nbclass_norm))


NAIVE BAYES CLASSIFIER NORMAL DATA
TEST PERFORMANCE
------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.01      0.02    199882
           1       0.00      0.98      0.00       313

    accuracy                           0.01    200195
   macro avg       0.50      0.50      0.01    200195
weighted avg       1.00      0.01      0.02    200195

CONFUSION MATRIX
[[  2398 197484]
 [     6    307]]
------------------------------------------------------------
ROC_AUC_SCORE
------------------------------------------------------------
0.4964138746013502


In [45]:
# Create a GaussianNB classifier with specified hyperparameters for red flag data
IMCO_NBclass_red = GaussianNB(var_smoothing=2.848035868435799e-08)

# Fit the GaussianNB classifier on the red flag data
IMCO_NBclass_red.fit(X_trainp, y_trainp)

# Make predictions on the test data using the trained GaussianNB classifier
IMCO_pred_nbclass_red = IMCO_NBclass_red.predict(X_testp)

# Print the performance metrics for the GaussianNB classifier on the test data
print('NAIVE BAYES CLASSIFIER NORMAL DATA')
print('TEST PERFORMANCE')
print('------------------------------------------------------------')
print(classification_report(y_testp, IMCO_pred_nbclass_red))
print('CONFUSION MATRIX')
print(confusion_matrix(y_testp, IMCO_pred_nbclass_red))
print('------------------------------------------------------------')
print('ROC_AUC_SCORE')
print('------------------------------------------------------------')
print(roc_auc_score(y_testp, IMCO_pred_nbclass_red))


NAIVE BAYES CLASSIFIER NORMAL DATA
TEST PERFORMANCE
------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.10      0.18    199877
           1       0.00      0.97      0.00       318

    accuracy                           0.10    200195
   macro avg       0.50      0.54      0.09    200195
weighted avg       1.00      0.10      0.18    200195

CONFUSION MATRIX
[[ 20219 179658]
 [     8    310]]
------------------------------------------------------------
ROC_AUC_SCORE
------------------------------------------------------------
0.537999989490392


# KNeighborsClassifier

In [47]:
# Create a KNeighborsClassifier with specified hyperparameters for normal data
IMCO_knc_norm = KNeighborsClassifier(weights='uniform',
                                n_neighbors=1,
                                metric='euclidean')

# Fit the KNeighborsClassifier on the normal data
IMCO_knc_norm.fit(X_trainn, y_trainn)

# Make predictions on the test data using the trained KNeighborsClassifier
IMCO_pred_knc_norm = IMCO_knc_norm.predict(X_testn)

# Print the performance metrics for the KNeighborsClassifier on the test data
print('NEAREST NEIGHBOR CLASSIFIER NORMAL DATA')
print('TEST PERFORMANCE')
print('------------------------------------------------------------')
print(classification_report(y_testn, IMCO_pred_knc_norm))
print('CONFUSION MATRIX')
print(confusion_matrix(y_testn, IMCO_pred_knc_norm))
print('------------------------------------------------------------')
print('ROC_AUC_SCORE')
print('------------------------------------------------------------')
print(roc_auc_score(y_testn, IMCO_pred_knc_norm))


NEAREST NEIGHBOR CLASSIFIER NORMAL DATA
TEST PERFORMANCE
------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.83      0.91    199882
           1       0.00      0.34      0.01       313

    accuracy                           0.83    200195
   macro avg       0.50      0.59      0.46    200195
weighted avg       1.00      0.83      0.91    200195

CONFUSION MATRIX
[[166706  33176]
 [   206    107]]
------------------------------------------------------------
ROC_AUC_SCORE
------------------------------------------------------------
0.5879375540834268


In [48]:
# Create a KNeighborsClassifier with specified hyperparameters for red flags data
IMCO_knc_red = KNeighborsClassifier(weights='uniform',
                                n_neighbors=1,
                                metric='euclidean')

# Fit the KNeighborsClassifier on the red flags data
IMCO_knc_red.fit(X_trainp, y_trainp)

# Make predictions on the test data using the trained KNeighborsClassifier
IMCO_pred_knc_red = IMCO_knc_red.predict(X_testp)

# Print the performance metrics for the KNeighborsClassifier on the test data
print('NEAREST NEIGHBOR CLASSIFIER RED FLAGS DATA')
print('TEST PERFORMANCE')
print('------------------------------------------------------------')
print(classification_report(y_testp, IMCO_pred_knc_red))
print('CONFUSION MATRIX')
print(confusion_matrix(y_testp, IMCO_pred_knc_red))
print('------------------------------------------------------------')
print('ROC_AUC_SCORE')
print('------------------------------------------------------------')
print(roc_auc_score(y_testp, IMCO_pred_knc_red))


NEAREST NEIGHBOR CLASSIFIER NORMAL DATA
TEST PERFORMANCE
------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.99      1.00    199877
           1       0.15      0.81      0.25       318

    accuracy                           0.99    200195
   macro avg       0.57      0.90      0.62    200195
weighted avg       1.00      0.99      0.99    200195

CONFUSION MATRIX
[[198416   1461]
 [    61    257]]
------------------------------------------------------------
ROC_AUC_SCORE
------------------------------------------------------------
0.9004333026446486


# LOGISTIC REGRESSION

In [7]:
# {'solver': 'newton-cg', 'penalty': 'l2', 'max_iter': 500, 'C': 100}

IMCO_LR_norm = LogisticRegression(solver='newton-cg',
                        penalty='l2',
                        max_iter=500,
                        C=100)

# Fit the LogisticRegression on the normal data
IMCO_LR_norm.fit(X_trainn, y_trainn)

# Predict the target variable for the test data using the trained model
IMCO_pred_lr_norm = IMCO_LR_norm.predict(X_testn)

# Print the performance metrics for the LogisticRegression on the normal data
print('MLPC RED FLAGS DATA')
print('TEST PERFORMANCE')
print('------------------------------------------------------------')

# Print the classification report
print(classification_report(y_testn, IMCO_pred_lr_norm))

# Print the confusion matrix
print('CONFUSION MATRIX')
print(confusion_matrix(y_testn, IMCO_pred_lr_norm))
print('------------------------------------------------------------')

# Print the ROC AUC score
print('ROC_AUC_SCORE')
print('------------------------------------------------------------')
print(roc_auc_score(y_testn, IMCO_pred_lr_norm))

Fitting 3 folds for each of 5 candidates, totalling 15 fits




              precision    recall  f1-score   support

           0       0.99      1.00      0.99    198255
           1       0.24      0.04      0.07      1940

    accuracy                           0.99    200195
   macro avg       0.62      0.52      0.53    200195
weighted avg       0.98      0.99      0.99    200195

MLPC RED FLAGS DATA
{'solver': 'newton-cg', 'penalty': 'l2', 'max_iter': 500, 'C': 100}
TEST PERFORMANCE
------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    199846
           1       0.04      0.24      0.07       349

    accuracy                           0.99    200195
   macro avg       0.52      0.62      0.53    200195
weighted avg       1.00      0.99      0.99    200195

CONFUSION MATRIX
[[197991   1855]
 [   264     85]]
------------------------------------------------------------
ROC_AUC_SCORE
-----------------------------------------------------

In [8]:
# {'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 500, 'C': 10}

IMCO_LR_red = LogisticRegression(solver='liblinear',
                        penalty='l1',
                        max_iter=500,
                        C=10)

IMCO_LR_red.fit(X_trainp, y_trainp)

IMCO_pred_rand_rf = IMCO_LR_red.predict(X_testp)


print('MLPC RED FLAGS DATA')
print('TEST PERFORMANCE')
print('------------------------------------------------------------')
print(classification_report(y_testp, IMCO_pred_rand_rf))
print('CONFUSION MATRIX')
print(confusion_matrix(y_testp, IMCO_pred_rand_rf))
print('------------------------------------------------------------')
print('ROC_AUC_SCORE')
print('------------------------------------------------------------')
print(roc_auc_score(y_testp, IMCO_pred_rand_rf))

Fitting 3 folds for each of 5 candidates, totalling 15 fits
              precision    recall  f1-score   support

           0       0.95      1.00      0.98    190945
           1       0.50      0.02      0.04      9250

    accuracy                           0.95    200195
   macro avg       0.73      0.51      0.51    200195
weighted avg       0.93      0.95      0.93    200195

MLPC RED FLAGS DATA
{'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 500, 'C': 10}
TEST PERFORMANCE
------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.95      0.98    199833
           1       0.02      0.50      0.04       362

    accuracy                           0.95    200195
   macro avg       0.51      0.73      0.51    200195
weighted avg       1.00      0.95      0.97    200195

CONFUSION MATRIX
[[190765   9068]
 [   180    182]]
------------------------------------------------------------
ROC_AUC_