In [7]:
pip install xgboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
import os 
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.preprocessing import  PolynomialFeatures, KBinsDiscretizer, FunctionTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer, OrdinalEncoder
import statsmodels.formula.api as smf
import statsmodels.tsa as tsa
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import BaggingClassifier, BaggingRegressor,RandomForestClassifier,RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor, AdaBoostClassifier, AdaBoostRegressor 
from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

<Figure size 600x300 with 0 Axes>

In [10]:
path = "/content/drive/MyDrive/data/creditcard.csv"
data = pd.read_csv(path)

In [11]:
Total_transactions = len(data)
normal = len(data[data.Class == 0])
fraudulent = len(data[data.Class == 1])
fraud_percentage = round(fraudulent/normal*100, 2)
print('Total number of Trnsactions are {}'.format(Total_transactions))
print('Number of Normal Transactions are {}'.format(normal))
print('Number of fraudulent Transactions are {}'.format(fraudulent))
print('Percentage of fraud Transactions is {}'.format(fraud_percentage))

Total number of Trnsactions are 284807
Number of Normal Transactions are 284315
Number of fraudulent Transactions are 492
Percentage of fraud Transactions is 0.17


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [13]:
print(min(data.Amount))
print(max(data.Amount))


0.0
25691.16


In [14]:
#Scaling the amount column
sc = StandardScaler()
amount = data['Amount'].values
data['Amount'] = sc.fit_transform(amount.reshape(-1, 1))
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.244964,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.342475,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,1.160686,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.140534,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,-0.073403,0


In [15]:
data.drop(['Time'], axis=1, inplace=True)

In [16]:
data.shape

(284807, 30)

In [17]:
data.drop_duplicates(inplace=True)

In [18]:
data.shape

(275663, 30)

In [19]:
X = data.drop('Class', axis = 1).values
y = data['Class'].values

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1)

In [21]:
DT = DecisionTreeClassifier(max_depth = 4, criterion = 'entropy')
DT.fit(X_train, y_train)
dt_yhat = DT.predict(X_test)

In [24]:
print('Decision Tree Accuracy score {}'.format(accuracy_score(y_test, dt_yhat)))
print('F1 score of the Decision Tree {}'.format(f1_score(y_test, dt_yhat)))
confusion_matrix(y_test, dt_yhat, labels = [0, 1])

Decision Tree Accuracy score 0.9991583957281328
F1 score of the Decision Tree 0.7521367521367521


array([[68770,    18],
       [   40,    88]])

In [25]:
n = 7
KNN = KNeighborsClassifier(n_neighbors = n)
KNN.fit(X_train, y_train)
knn_yhat = KNN.predict(X_test)

In [27]:
print('Accuracy score of KNN {}'.format(accuracy_score(y_test, knn_yhat)))
print('F1 score of KNN {}'.format(f1_score(y_test, knn_yhat)))

Accuracy score of KNN 0.999288989494457
F1 score of KNN 0.7949790794979079


In [28]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_yhat = lr.predict(X_test)

In [29]:
print('Accuracy score of Logistic Regression {}'.format(accuracy_score(y_test, lr_yhat)))
print('F1 score of Logistic Regression {}'.format(f1_score(y_test, lr_yhat)))

Accuracy score of Logistic Regression 0.9989552498694062
F1 score of Logistic Regression 0.6666666666666666


In [30]:
svm = SVC()
svm.fit(X_train, y_train)
svm_yhat = svm.predict(X_test)

In [33]:
print('Accuracy score of SVM {}'.format(accuracy_score(y_test, svm_yhat)))
print('F1 score of SVM {}'.format(f1_score(y_test, svm_yhat)))

Accuracy score of SVM 0.999318010331418
F1 score of SVM 0.7813953488372093


In [31]:
rf = RandomForestClassifier(max_depth = 4)
rf.fit(X_train, y_train)
rf_yhat = rf.predict(X_test)

In [34]:
print('Accuracy score of Random Forest {}'.format(accuracy_score(y_test, rf_yhat)))
print('F1 score of Random Forest {}'.format(f1_score(y_test, rf_yhat)))

Accuracy score of Random Forest 0.9991293748911718
F1 score of Random Forest 0.7222222222222223


In [32]:
xgb = XGBClassifier(max_depth = 4)
xgb.fit(X_train, y_train)
xgb_yhat = xgb.predict(X_test)

In [35]:
print('Accuracy score of XGBoost {}'.format(accuracy_score(y_test, xgb_yhat)))
print('F1 score of XGBoost {}'.format(f1_score(y_test, xgb_yhat)))


Accuracy score of XGBoost 0.999506645771664
F1 score of XGBoost 0.8495575221238937


In [36]:
print("XGBoost Model is the most accurate for this case ")

XGBoost Model is the most accurate for this case 
