#Import Required **libraries**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score
from IPython.display import display

#Read the CSV file using Pandas

In [2]:
credit_data = pd.read_csv('/content/drive/MyDrive/Project/Credit card fraud detection Kaggle/creditcard.csv')

# Performing Exploratory Data Analysis (EDA) to understand data structure and quality


Note: The dataset contains only numerical features, primarily principal components (V1–V28) derived through PCA for confidentiality reasons. The 'Time' feature represents seconds since the first transaction, while 'Amount' indicates the transaction value and can be used for cost-sensitive learning.

In [3]:
credit_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
credit_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [5]:
credit_data.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


# checking the value that refers 0 as Legit and 1 as fraud in the dataset

In [6]:
credit_data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,284315
1,492


The dataset is highly unbalanced

So seperating data to analysis

In [7]:
legit = credit_data[credit_data.Class == 0]
fraud = credit_data[credit_data.Class == 1]

In [8]:
print(legit.shape)
print(fraud.shape)

(284315, 31)
(492, 31)


In [9]:
legit.Amount.describe()

Unnamed: 0,Amount
count,284315.0
mean,88.291022
std,250.105092
min,0.0
25%,5.65
50%,22.0
75%,77.05
max,25691.16


In [10]:
fraud.Amount.describe()

Unnamed: 0,Amount
count,492.0
mean,122.211321
std,256.683288
min,0.0
25%,1.0
50%,9.25
75%,105.89
max,2125.87


In [11]:
credit_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


As we can see there is huge different between the mean of legit data and fraudulent data so by  doing unsampling by distributing randomly the dataset into equal number for both the transaction


In [12]:
legit_sample = legit.sample(n=492)

Now we will concatente both the transaction into one by using concat command from pandas to create new dataset with same number of data

In [13]:
new_dataset = pd.concat([legit_sample, fraud], axis=0) # axis = 0 means row and 1 means column and we want to add it row wise not column wise

In [14]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
19197,30088.0,-0.485407,0.605942,1.089066,0.990843,-0.266805,-0.550272,1.302104,-0.207974,-0.698755,...,0.24992,0.484853,0.201529,0.375183,-0.197307,-0.339259,0.118162,0.175782,160.0,0
29023,35282.0,-1.093214,0.237255,0.691071,-0.918406,-0.497243,-0.093236,0.478624,0.073893,-1.900617,...,-0.743779,-1.775584,-0.04361,-0.539446,-0.129629,0.845815,-0.506367,-0.275396,103.36,0
251557,155394.0,-0.203154,1.176678,-0.759595,-0.518472,0.629649,-0.721675,0.638893,0.243377,-0.157488,...,0.384629,1.206754,-0.082753,0.508386,-0.710906,-0.23451,0.37964,0.261351,1.94,0
140863,83987.0,1.223101,-0.104798,0.490538,-0.080799,-0.565259,-0.470726,-0.285186,0.002968,0.082704,...,-0.092094,-0.330723,0.027652,0.045086,0.129418,0.874249,-0.075947,-0.000218,19.95,0
70766,54007.0,1.309675,-0.649868,0.786873,-0.751086,-1.196539,-0.25122,-0.868871,0.032659,-0.887516,...,-0.429221,-0.809005,0.127678,0.032551,-0.058609,0.857632,-0.032463,0.012131,23.9,0


In [15]:
new_dataset['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,492
1,492


In [16]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,93297.443089,0.112233,0.003661,0.057048,0.045374,0.112683,0.101576,0.060089,-0.037168,0.072861,...,0.023382,0.016294,-0.005928,-0.029882,0.023901,0.001996,0.040777,-0.02946,0.000475,91.119919
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


By divide dataset into X and Y for classifications by dropping 'class' section from dataset in which we have fraud or legit trasncation details.

In [17]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [18]:
print(X)

            Time        V1        V2        V3        V4        V5        V6  \
19197    30088.0 -0.485407  0.605942  1.089066  0.990843 -0.266805 -0.550272   
29023    35282.0 -1.093214  0.237255  0.691071 -0.918406 -0.497243 -0.093236   
251557  155394.0 -0.203154  1.176678 -0.759595 -0.518472  0.629649 -0.721675   
140863   83987.0  1.223101 -0.104798  0.490538 -0.080799 -0.565259 -0.470726   
70766    54007.0  1.309675 -0.649868  0.786873 -0.751086 -1.196539 -0.251220   
...          ...       ...       ...       ...       ...       ...       ...   
279863  169142.0 -1.927883  1.125653 -4.518331  1.749293 -1.566487 -2.010494   
280143  169347.0  1.378559  1.289381 -5.004247  1.411850  0.442581 -1.326536   
280149  169351.0 -0.676143  1.126366 -2.213700  0.468308 -1.120541 -0.003346   
281144  169966.0 -3.113832  0.585864 -5.399730  1.817092 -0.840618 -2.943548   
281674  170348.0  1.991976  0.158476 -2.583441  0.408670  1.151147 -0.096695   

              V7        V8        V9  .

In [19]:
print(Y)

19197     0
29023     0
251557    0
140863    0
70766     0
         ..
279863    1
280143    1
280149    1
281144    1
281674    1
Name: Class, Length: 984, dtype: int64


 By appling train-test split which was imported from SK-learn model

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# Logistic Regression Model for Classification.

In [21]:
lr_model = LogisticRegression()

In [22]:
lr_model.fit(X_train,y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluation

In [23]:
Accuracy_score_train_LR = accuracy_score(lr_model.predict(X_train),y_train)

In [24]:
print('Accuracy on training data', Accuracy_score_train_LR)

Accuracy on training data 0.9390088945362135


In [25]:
Accuracy_score_test_LR = accuracy_score(lr_model.predict(X_test),y_test)

In [26]:
print('Accuracy on Testing data', Accuracy_score_test_LR)

Accuracy on Testing data 0.934010152284264


In [27]:
print('Accuracy on Training data', Accuracy_score_train_LR)
print('Accuracy on Testing data', Accuracy_score_test_LR)

Accuracy on Training data 0.9390088945362135
Accuracy on Testing data 0.934010152284264


# Random Forest Model

In [28]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)


In [29]:
rf_model.fit(X_train, y_train)

In [30]:
Accuracy_score_train_RF = accuracy_score(rf_model.predict(X_train),y_train)

In [31]:
print('Accuracy on training data', Accuracy_score_train_RF)

Accuracy on training data 1.0


In [32]:
Accuracy_score_test_RF = accuracy_score(rf_model.predict(X_test),y_test)

In [33]:
print('Accuracy on Testing data', Accuracy_score_test_RF)

Accuracy on Testing data 0.9238578680203046


In [34]:
print('Accuracy on training data', Accuracy_score_train_RF)
print('Accuracy on Testing data', Accuracy_score_test_RF)

Accuracy on training data 1.0
Accuracy on Testing data 0.9238578680203046


# XGBoost for CLassification

In [35]:
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

In [36]:
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [37]:
xgb_preds = xgb_model.predict(X_test)

In [38]:
Accuracy_score_train_XG = accuracy_score(xgb_model.predict(X_train),y_train)

In [39]:
print('Accuracy on training data', Accuracy_score_train_XG)

Accuracy on training data 1.0


In [40]:
Accuracy_score_test_XG = accuracy_score(xgb_model.predict(X_test),y_test)

In [41]:
print('Accuracy on Testing data', Accuracy_score_test_XG)

Accuracy on Testing data 0.9187817258883249


In [42]:
print('Accuracy on training data', Accuracy_score_train_XG)
print('Accuracy on Testing data', Accuracy_score_test_XG)

Accuracy on training data 1.0
Accuracy on Testing data 0.9187817258883249


# Comparing all the models and all the value and making it into one dataframe

In [43]:
models = {
    'Logistic Regression': lr_model,
    'Random Forest': rf_model,
    'XGBoost': xgb_model,

}

results = []

for name, model in models.items():
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    y_test_proba = model.predict_proba(X_test)[:, 1]

    results.append({
        'Model': name,
        'Train Acc': accuracy_score(y_train, y_train_pred),
        'Test Acc': accuracy_score(y_test, y_test_pred),
        'ROC AUC': roc_auc_score(y_test, y_test_proba),
        'F1 Score': f1_score(y_test, y_test_pred),
        'Precision': precision_score(y_test, y_test_pred),
        'Recall': recall_score(y_test, y_test_pred)
    })

In [44]:
df_results = pd.DataFrame(results)
display(df_results)

Unnamed: 0,Model,Train Acc,Test Acc,ROC AUC,F1 Score,Precision,Recall
0,Logistic Regression,0.939009,0.93401,0.982684,0.92973,0.988506,0.877551
1,Random Forest,1.0,0.923858,0.979025,0.917127,1.0,0.846939
2,XGBoost,1.0,0.918782,0.972686,0.913043,0.976744,0.857143
