<a href="https://colab.research.google.com/github/Titashmkhrj/Credit-card-fraud-detection/blob/master/Credit_card_fraud_HPO_MS_Testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# importing the required libraries
import numpy as np
import pandas as pd
import joblib

import imblearn 
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from sklearn.linear_model import (LogisticRegression, PassiveAggressiveClassifier, RidgeClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (RandomizedSearchCV, train_test_split, cross_val_score)

from sklearn.metrics import accuracy_score


# One unfitted estimator per entry of the parameter grid. The search loop pairs
# this list with the grid positionally, so the order here must follow the
# order of the grid's keys (noted on the right of each entry).
model_objects = [
    LogisticRegression(),           # 'LR_l1'
    LogisticRegression(),           # 'LR_l2'
    LogisticRegression(),           # 'LR_ElNet'
    PassiveAggressiveClassifier(),  # 'Pass_Agg_clif'
    RidgeClassifier(),              # 'Ridge_clif'
    KNeighborsClassifier(),         # 'KN_classif'
    SVC(),                          # 'SVC'
    DecisionTreeClassifier(),       # 'DT_clif'
    RandomForestClassifier(),       # 'RF_clif'
]



# Hyper-parameter search spaces for tuning the models.
#
# Every key is prefixed with 'model__' because the estimator is the pipeline
# step named 'model'. The part after the prefix must be an exact constructor
# argument of the corresponding estimator (parameter names are case-sensitive),
# and boolean options must be real booleans: the strings 'True' and 'False'
# are both truthy.
parameter_grid = {
    'LR_l1': {
        'model__penalty': ['l1'],
        'model__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'model__random_state': [42],
        'model__solver': ['liblinear', 'saga'],
        'model__max_iter': [100000],
    },

    'LR_l2': {
        'model__penalty': ['l2'],
        'model__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'model__random_state': [42],
        'model__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
        'model__max_iter': [100000],
    },

    'LR_ElNet': {
        'model__penalty': ['elasticnet'],
        'model__l1_ratio': [0.3, 0.5, 0.7],
        'model__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'model__random_state': [42],
        'model__solver': ['saga'],
        'model__max_iter': [100000],
    },

    'Pass_Agg_clif': {
        # The regularization argument is upper-case 'C'.
        'model__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'model__fit_intercept': [True, False],
        'model__random_state': [42],
        'model__loss': ['hinge', 'squared_hinge'],
        'model__class_weight': ['balanced', None],
    },

    'Ridge_clif': {
        'model__alpha': [500.0, 50.0, 5.0, 0.5, 0.05, 0.005],
        'model__fit_intercept': [True, False],
        # 'normalize' is intentionally not searched: it was removed from
        # RidgeClassifier in scikit-learn 1.2, and the pipeline already
        # standardizes the features before they reach the model.
        'model__class_weight': ['balanced', None],
        'model__solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg'],
    },

    'KN_classif': {
        'model__n_neighbors': [2, 4, 6, 8, 10],
        'model__p': [2, 3, 5],
    },

    'SVC': {
        # The regularization argument is upper-case 'C'.
        'model__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'model__gamma': ['scale', 'auto'],
        'model__class_weight': ['balanced', None],
    },

    'DT_clif': {
        'model__criterion': ['gini', 'entropy'],
        'model__max_features': ['sqrt', 'log2', None],
        'model__min_samples_leaf': [1, 2, 5, 10],
        # An integer min_samples_split must be >= 2, so 1 is not a candidate.
        'model__min_samples_split': [2, 5, 10, 15, 100],
        'model__max_depth': [5, 8, 15, 25, 30, None],
    },

    'RF_clif': {
        'model__n_estimators': [120, 300, 500, 800, 1200],
        'model__max_features': ['sqrt', 'log2', None],
        'model__min_samples_leaf': [1, 2, 5, 10],
        # An integer min_samples_split must be >= 2, so 1 is not a candidate.
        'model__min_samples_split': [2, 5, 10, 15, 100],
        'model__max_depth': [5, 8, 15, 25, 30, None],
    },
}



# Read the feature and target tables for the project.
x_data = pd.read_csv('/content/drive/My Drive/data_for_HPO&MS/credit_card_fraud/feature_space.csv')
y_data = pd.read_csv('/content/drive/My Drive/data_for_HPO&MS/credit_card_fraud/target_space.csv')
# Drop the unnecessary 'Unnamed: 0' column from the target table.
y_data = y_data.drop(columns='Unnamed: 0')


# Split the data in two stages, both with a fixed seed for reproducibility:
#   1. 70% train / 30% test (the test part is only used for the final score);
#   2. the train part is split again 70% / 30% into an optimization set
#      (hyper-parameter search) and a validation set (model selection).
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=42)
x_optimization, x_validation, y_optimization, y_validation = train_test_split(
    x_train, y_train, test_size=0.3, random_state=42
)


print('Tunning the hyper-parameter...............')

# Collects the best pipeline found for each candidate model, in grid order.
hyper_parameter_optimized_models = []

# Pre-processing steps placed in front of every candidate model.
over_sampler = SMOTE(random_state=42)
scaler = StandardScaler()

# Run one randomized search per (search space, model) pair. The two
# collections are paired by position.
for grid, model in zip(parameter_grid.values(), model_objects):
    # The pipeline must be assigned without a trailing comma: a trailing comma
    # would wrap it in a one-element tuple, which RandomizedSearchCV rejects
    # with a TypeError because a tuple is not an estimator.
    classif_model = Pipeline([('resampler', over_sampler), ('scaler', scaler), ('model', model)])
    optimizer = RandomizedSearchCV(
        estimator=classif_model,
        param_distributions=grid,
        random_state=42,
        cv=3,
        error_score=-1,  # a candidate whose fit fails is scored -1 instead of aborting the search
        verbose=10,
    )
    # Flatten the targets to 1-D, the shape the estimators expect
    # (assumes the target table holds a single column -- TODO confirm).
    optimizer.fit(x_optimization, np.ravel(y_optimization))
    # Keep the best pipeline found for this model.
    hyper_parameter_optimized_models.append(optimizer.best_estimator_)

print('Hyper parameter tunning is finished.')

# Model selection: score every tuned pipeline on the validation split.
print('Model selection .........')

# Mean cross-validation score of each optimized model, in the same order as
# hyper_parameter_optimized_models.
optimized_model_validation_scores = []

for optimized_model in hyper_parameter_optimized_models:
    # Each optimized model is the best estimator of a search over a
    # resampler -> scaler -> model pipeline, so it already contains the
    # pre-processing. Cross-validate it directly; wrapping it in another
    # resampler/scaler pipeline would apply SMOTE and scaling twice.
    # np.ravel flattens the targets to 1-D (a DataFrame has no .ravel()).
    model_validation_scores = cross_val_score(optimized_model, x_validation, np.ravel(y_validation), cv=3)
    optimized_model_validation_scores.append(np.mean(model_validation_scores))

# Collect the results of the hyper-parameter optimization and model selection.
results_dict = {
    'optimized_model': hyper_parameter_optimized_models,
    'validation_score': optimized_model_validation_scores,
}

optimized_model_results = pd.DataFrame(results_dict)
# Save the optimization / model-selection report as a CSV file.
optimized_model_results.to_csv('/content/drive/My Drive/data_for_HPO&MS/credit_card_fraud/model_optimizaion_report.csv')
print('Model selection is finished')


print('Initiating the process of our final phase to judge the average out-of-sample performance of our best found optimized model.')
# Select the model with the highest validation score. idxmax() returns an
# index label, so the row is looked up by label with .loc.
best_model_idx = optimized_model_results['validation_score'].idxmax()
best_model = optimized_model_results.loc[best_model_idx, 'optimized_model']

print('The best model to our finding is ', best_model)

# -------------------------------
#        FINAL PREDICTION
# -------------------------------
# The whole training set is used to re-fit the best model. The stored model is
# the complete resampler -> scaler -> model pipeline, so it is fitted on the
# raw training data: during fit the pipeline over-samples and scales
# internally, and during predict only the scaler is applied to the test
# features. Resampling or scaling by hand first would apply both steps twice.
best_model.fit(x_train, np.ravel(y_train))
out_of_sample_predictions = best_model.predict(x_test)

# NOTE(review): plain accuracy can look high on a heavily imbalanced target
# even when the minority class is mostly missed -- consider also reporting a
# class-sensitive metric.
final_score = accuracy_score(y_test, out_of_sample_predictions)

print('The final average out-of-sample performance score of our best optimized model is', final_score)

# Persist the best optimized model for this data as a pickle file.
joblib.dump(best_model, '/content/drive/My Drive/data_for_HPO&MS/credit_card_fraud/best_model.pkl')

Tunning the hyper-parameter...............


TypeError: ignored