### Importing dataset

In [1]:
import pandas as pd
df = pd.read_csv('ccFraud.csv')
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
1,50000,1,1,2,37,0,0,0,0,0,...,19394,19619,20024,2500,1815,657,1000,1000,800,0
2,500000,1,1,2,29,0,0,0,0,0,...,542653,483003,473944,55000,40000,38000,20239,13750,13770,0
3,100000,2,2,2,23,0,-1,-1,0,0,...,221,-159,567,380,601,0,581,1687,1542,0
4,140000,2,3,1,28,0,0,2,0,0,...,12211,11793,3719,3329,0,432,1000,1000,1000,0


### EDA

In [2]:
print(df.shape)
print(df.isnull().sum())
df.info()

(1001, 24)
LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default payment next month    0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      -------------

No null values and all columns are in numerical data types

### PreProcessing

In [3]:
# Splitting the datsets
from sklearn.model_selection import train_test_split
x = df.drop(labels=['default payment next month'], axis=1)
y = df['default payment next month']
X_train, X_test, y_train, y_test=train_test_split(x,y,test_size=0.25,random_state=52)

In [4]:
# Scaling the training set
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
x_train = ss.fit_transform(X_train)
x_test = ss.transform(X_test)

### Model Training

In [5]:
from sklearn.metrics import accuracy_score
def score(test, pred):
    score = accuracy_score(test,pred)*100
    return print('The accuracy of the model is ',score)

from sklearn.model_selection import GridSearchCV
def ht(model, pg, x, y):
    gs=GridSearchCV(model, param_grid=pg, cv=10)
    gs.fit(x,y)
    print('The best parameters are:\n',gs.best_estimator_)

In [6]:
# Using GaussianNB
from sklearn.naive_bayes import GaussianNB
m1 = GaussianNB()
m1.fit(x_train,y_train)
y_pred = m1.predict(x_test)
score(y_test,y_pred)

param_grid = {"var_smoothing": [1e-9,0.1, 0.001, 0.5,0.05,0.01,1e-8,1e-7,1e-6,1e-10,1e-11]}
ht(model=m1, pg=param_grid, x=x_train, y=y_train)

The accuracy of the model is  41.03585657370518
The best parameters are:
 GaussianNB(var_smoothing=0.5)


In [7]:
model1 = GaussianNB(var_smoothing=0.5)
model1.fit(x_train, y_train)
y_pred = model1.predict(x_test)
score(y_test, y_pred)

The accuracy of the model is  75.2988047808765


In [8]:
# Using xgboost
from xgboost import XGBClassifier
m2 = XGBClassifier()
m2.fit(x_train, y_train)
y_pred = m2.predict(x_test)
score(y_test,y_pred)

param_grid = { "n_estimators": [5,10,20,40,80], "max_depth": range(2,5,1), 'learning_rate': [0.1, 0.2, 0.4, 0.8]}
ht(model=m2, pg=param_grid, x=x_train, y=y_train)

The accuracy of the model is  80.0796812749004
The best parameters are:
 XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.8, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=2, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=5, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)


In [10]:
model2 = XGBClassifier(n_estimators=5,learning_rate=0.8, max_depth=2)
model2.fit(x_train, y_train)
y_pred = model2.predict(x_test)
score(y_test, y_pred)

The accuracy of the model is  80.0796812749004


In [11]:
# Using RandomForest
from sklearn.ensemble import RandomForestClassifier
m3 = RandomForestClassifier()
m3.fit(x_train, y_train)
y_pred = m3.predict(x_test)
score(y_test,y_pred)

param_grid = {"n_estimators": [5,10,20,40,80],
              'criterion': ['gini', 'entropy'],
              'max_depth': [None, 1, 2, 4, 8, 16, 32],
              'min_samples_split': [2, 5, 10, 15],
              'min_samples_leaf': [1, 2, 4, 8]}
ht(model=m3, pg=param_grid, x=x_train, y=y_train)

The accuracy of the model is  81.67330677290838
The best parameters are:
 RandomForestClassifier(max_depth=8, min_samples_leaf=2, min_samples_split=5,
                       n_estimators=80)


In [13]:
model3 = RandomForestClassifier(n_estimators=80, max_depth=8, min_samples_split=5, min_samples_leaf=2)
model3.fit(x_train, y_train)
y_pred = model3.predict(x_test)
score(y_test, y_pred)

The accuracy of the model is  81.67330677290838


So, we are going to use Random Forest Classifier will be used