In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score,roc_curve,auc, roc_auc_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import precision_score, recall_score, precision_recall_fscore_support
from sklearn.metrics import f1_score,precision_recall_curve
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
credit_data= pd.read_csv("creditcard.csv")
credit_data.shape

(52605, 31)

In [3]:
X= credit_data.drop('Class', axis= 1)
y= credit_data['Class']

In [4]:
#Feature EngineeringPowerTransformer
from sklearn.preprocessing import PowerTransformer
power = PowerTransformer(method='yeo-johnson', standardize=True)
df = power.fit_transform(X)

In [5]:
df= pd.DataFrame(data= df, columns= X.columns)
from imblearn.over_sampling import SMOTE
from collections import Counter
smt = SMOTE(random_state=2425, n_jobs=-1,sampling_strategy='auto', k_neighbors=5)
X_sm, y_sm = smt.fit_resample(df, y)
print('Resampled dataset shape {}'.format(Counter(y_sm)))
print('Before Resampled dataset shape {}'.format(Counter(y)))

Resampled dataset shape Counter({0: 52452, 1: 52452})
Before Resampled dataset shape Counter({0: 52452, 1: 153})


In [6]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size = 0.30, random_state = 440)

After using Hyperparameter tunning we found best parameter for Random forest and using on it

In [7]:
RF_model=Pipeline([('scalar3',StandardScaler()),
                     ('pca3',PCA()),
                     ('rf_classifier',RandomForestClassifier(random_state=42, max_features='auto', n_estimators= 100, max_depth=8, criterion='gini', min_samples_leaf= 15))])

In [8]:
RF_model.fit(x_train, y_train)

In [9]:
pred=RF_model.predict(x_test)

In [10]:
print("Accuracy for Random Forest on CV data: ",accuracy_score(y_test,pred))

Accuracy for Random Forest on CV data:  0.989991103202847


In [11]:
# Save the Modle to file in the current working directory
import pickle
Pkl_Filename = "Model.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(RF_model, file)
print("RF_model Saved")

RF_model Saved


# Now we are testing the data

In [12]:
# Load the Model back from file
Pkl_Filename = "Model.pkl"  
with open(Pkl_Filename, 'rb') as file:  
  Pickled_RF_model= pickle.load(file)

Pickled_RF_model

In [13]:
# Use the Reloaded Model to 
# Calculate the accuracy score and predict target values
# Calculate the Score 
score = Pickled_RF_model.score(x_test, y_test)  
# Print the Score
print("Test score: {0:.2f} %".format(100 * score))  

# Predict the Labels using the reloaded Model
Ypredict = Pickled_RF_model.predict(x_test)  

Ypredict

Test score: 99.00 %


array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [14]:
#predict the data using only one row

features= np.array([-0.39953817, -0.97712131,  1.24544234,  0.94905287,  1.7934207 ,
       -0.72105241,  0.71068923, -0.77640272,  1.36614126, -1.42028025,
        0.790728  ,  0.89664203, -0.02611889, -0.61222435,  1.12123451,
        1.2181234 ,  0.33202657,  0.30275497,  0.838198  ,  0.55880226,
        0.35222467,  0.51204671,  1.14737082, -0.18496812, -0.09257469,
       -0.5097014 ,  0.79919997,  0.73473953,  0.42037289, -0.29209222])

In [15]:
Ypredict = Pickled_RF_model.predict([features])  

Ypredict

array([0], dtype=int64)