# 🔍 Detecting Fraud in Credit Card Transactions

The objective of this notebook is to use supervised learning for fraud detection in financial transactions. We work with an anonymized dataset and apply logistic regression to see how well it handles the significant class imbalance.


# Importing The Dependencies

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Data Collection And Processing

In [None]:
# Read the credit-card transactions CSV into a pandas DataFrame.
credit_df = pd.read_csv('/content/creditcard.csv')

In [None]:
# Preview the first five transactions to check the column layout.
credit_df.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [None]:
# Dimensions of the full dataset as a (rows, columns) tuple.
credit_df.shape

(284807, 31)

In [None]:
# Per-column summary of the dataset: column names, non-null counts, and dtypes.
credit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [None]:
# Number of transactions per Class label (0 = legit, 1 = fraud).
credit_df['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,284315
1,492


This is a highly imbalanced dataset.

In [None]:
# Split the transactions by label so each group can be analysed on its own:
# Class 0 -> legit, Class 1 -> fraud.
legit = credit_df.loc[credit_df['Class'] == 0]
fraud = credit_df.loc[credit_df['Class'] == 1]

In [None]:
# First five legit (Class 0) transactions.
legit.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [None]:
# First five fraud (Class 1) transactions.
fraud.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
541,406.0,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,-2.772272,3.202033,-2.899907,-0.595222,-4.289254,0.389724,-1.140747,-2.830056,-0.016822,0.416956,0.126911,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.17784,0.261145,-0.143276,0.0,1
623,472.0,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,-0.838587,-0.414575,-0.503141,0.676502,-1.692029,2.000635,0.66678,0.599717,1.725321,0.283345,2.102339,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,529.0,1
4920,4462.0,-2.30335,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.56232,-0.399147,-0.238253,-1.525412,2.032912,-6.560124,0.022937,-1.470102,-0.698826,-2.282194,-4.781831,-2.615665,-1.334441,-0.430022,-0.294166,-0.932391,0.172726,-0.08733,-0.156114,-0.542628,0.039566,-0.153029,239.93,1
6108,6986.0,-4.397974,1.358367,-2.592844,2.679787,-1.128131,-1.706536,-3.496197,-0.248778,-0.247768,-4.801637,4.895844,-10.912819,0.184372,-6.771097,-0.007326,-7.358083,-12.598419,-5.131549,0.308334,-0.171608,0.573574,0.176968,-0.436207,-0.053502,0.252405,-0.657488,-0.827136,0.849573,59.0,1
6329,7519.0,1.234235,3.01974,-4.304597,4.732795,3.624201,-1.357746,1.713445,-0.496358,-1.282858,-2.447469,2.101344,-4.609628,1.464378,-6.079337,-0.339237,2.581851,6.739384,3.042493,-2.721853,0.009061,-0.379068,-0.704181,-0.656805,-1.632653,1.488901,0.566797,-0.010016,0.146793,1.0,1


In [None]:
# (rows, columns) of the legit and fraud subsets, shown side by side.
(legit.shape, fraud.shape)

((284315, 31), (492, 31))

In [None]:
# Descriptive statistics of the transaction Amount for legit transactions.
legit['Amount'].describe()

Unnamed: 0,Amount
count,284315.0
mean,88.291022
std,250.105092
min,0.0
25%,5.65
50%,22.0
75%,77.05
max,25691.16


In [None]:
# Descriptive statistics of the transaction Amount for fraud transactions.
fraud['Amount'].describe()

Unnamed: 0,Amount
count,492.0
mean,122.211321
std,256.683288
min,0.0
25%,1.0
50%,9.25
75%,105.89
max,2125.87


In [None]:
# Per-class mean of every column, to compare legit and fraud transactions.
credit_df.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,0.009824,-0.006576,0.010832,0.000189,0.012064,0.000161,0.007164,0.011535,0.003887,-0.001178,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,-5.676883,3.800173,-6.259393,-0.109334,-6.971723,-0.092929,-4.139946,-6.665836,-2.246308,0.680659,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


# Under-Sampling

Constructing a balanced dataset by under-sampling the majority class.

The original dataset contains only 492 fraudulent transactions, so we reduce the number of legitimate transactions to match this count and create a balanced distribution between the two classes.


In [None]:
# Draw as many legit transactions as there are fraud ones so both classes end
# up the same size; len(fraud) replaces the hard-coded 492 so the cell stays
# correct if the data changes.
# random_state pins the draw so a Restart & Run All reproduces the same sample
# (the seed value 2 mirrors the one passed to train_test_split).
legit_sample = legit.sample(n=len(fraud), random_state=2)

# Concatenating Two Dataframes

In [None]:
# Stack the sampled legit rows on top of the fraud rows (row-wise concatenation).
new_df = pd.concat([legit_sample, fraud], axis='index')

In [None]:
# Display the combined frame (pandas elides the middle rows in the rendered output).
new_df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
281728,170384.0,-0.295037,0.163593,-0.443831,-2.524225,0.955884,-0.665331,1.258229,-0.272247,-1.788700,-0.012051,-0.285573,-0.868725,-0.671864,0.321315,-1.917508,0.883825,-0.292574,-0.826676,0.826753,0.076442,0.397135,0.980553,-0.530945,-0.917538,0.727205,0.053354,0.009563,0.082040,57.44,0
68524,53031.0,1.280807,0.301482,0.142972,0.614915,-0.249380,-0.881444,0.039313,-0.147336,0.146467,-0.264901,-0.488336,-0.197526,-0.301199,-0.191588,1.185011,0.611096,-0.102256,-0.109441,-0.058968,-0.106519,-0.317913,-0.944568,0.091817,-0.031077,0.244044,0.118668,-0.027439,0.025943,1.98,0
36876,38718.0,-1.046619,-0.548485,3.009854,-1.067561,-1.345360,0.208897,-0.638473,0.376024,0.198752,-0.345650,-1.726508,0.173611,0.021259,-1.446214,-2.226191,-1.884597,0.576316,0.801752,-0.563082,-0.191846,-0.502118,-0.613148,-0.077657,0.445501,0.240237,1.016324,0.264524,0.132209,50.00,0
37623,39039.0,1.256676,-0.199248,-0.204121,-0.710269,-0.553007,-1.405383,0.254957,-0.303143,1.301704,-0.988290,-0.530441,0.305010,-0.965967,0.482216,0.774708,-1.080619,0.197060,-0.191691,0.822058,-0.136377,-0.010340,0.125848,-0.210246,0.416885,0.921413,-0.550777,0.025254,0.014966,31.94,0
156408,108066.0,-1.674036,1.295257,3.280993,4.676067,-0.789438,2.650711,-0.363253,0.298512,2.398090,2.535371,-1.107487,-3.842904,0.318216,-0.436659,-1.401686,-0.563025,1.156020,0.826617,1.314060,0.766523,-0.481816,0.409417,-0.321379,0.602981,0.285975,0.528924,0.119989,-0.522400,83.23,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279863,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.882850,0.697211,-2.064945,-5.587794,2.115795,-5.417424,-1.235123,-6.665177,0.401701,-2.897825,-4.570529,-1.315147,0.391167,1.252967,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.292680,0.147968,390.00,1
280143,169347.0,1.378559,1.289381,-5.004247,1.411850,0.442581,-1.326536,-1.413170,0.248525,-1.127396,-3.232153,2.858466,-3.096915,-0.792532,-5.210141,-0.613803,-2.155297,-3.267116,-0.688505,0.737657,0.226138,0.370612,0.028234,-0.145640,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76,1
280149,169351.0,-0.676143,1.126366,-2.213700,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.652250,-3.463891,1.794969,-2.775022,-0.418950,-4.057162,-0.712616,-1.603015,-5.035326,-0.507000,0.266272,0.247968,0.751826,0.834108,0.190944,0.032070,-0.739695,0.471111,0.385107,0.194361,77.89,1
281144,169966.0,-3.113832,0.585864,-5.399730,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,-5.245984,1.933520,-5.030465,-1.127455,-6.416628,0.141237,-2.549498,-4.614717,-1.478138,-0.035480,0.306271,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.253700,245.00,1


In [None]:
# Confirm that both classes now have the same number of rows.
new_df['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,492
1,492


In [None]:
# Per-class mean of every column in the balanced (under-sampled) frame.
new_df.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
0,95980.611789,-0.004127,0.023639,-0.069878,0.033258,-0.037846,0.060472,0.041062,0.087349,0.05467,0.081525,-0.103607,-0.002256,-0.042605,0.013391,-0.000735,-0.018879,-0.007763,0.014521,-0.100467,-0.001525,0.007629,-0.003841,0.02107,0.006389,0.006502,0.009931,0.010269,-0.004206,92.710528
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,-5.676883,3.800173,-6.259393,-0.109334,-6.971723,-0.092929,-4.139946,-6.665836,-2.246308,0.680659,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


# Splitting The Data Into Features & Targets

In [None]:
# Features: every column except the label. Target: the Class column.
X = new_df.drop(columns='Class')
Y = new_df['Class']

In [None]:
# Show the feature frame X and the target series Y together as a tuple.
X,Y

(            Time        V1        V2  ...       V27       V28  Amount
 281728  170384.0 -0.295037  0.163593  ...  0.009563  0.082040   57.44
 68524    53031.0  1.280807  0.301482  ... -0.027439  0.025943    1.98
 36876    38718.0 -1.046619 -0.548485  ...  0.264524  0.132209   50.00
 37623    39039.0  1.256676 -0.199248  ...  0.025254  0.014966   31.94
 156408  108066.0 -1.674036  1.295257  ...  0.119989 -0.522400   83.23
 ...          ...       ...       ...  ...       ...       ...     ...
 279863  169142.0 -1.927883  1.125653  ...  0.292680  0.147968  390.00
 280143  169347.0  1.378559  1.289381  ...  0.389152  0.186637    0.76
 280149  169351.0 -0.676143  1.126366  ...  0.385107  0.194361   77.89
 281144  169966.0 -3.113832  0.585864  ...  0.884876 -0.253700  245.00
 281674  170348.0  1.991976  0.158476  ...  0.002988 -0.015309   42.53
 
 [984 rows x 30 columns],
 281728    0
 68524     0
 36876     0
 37623     0
 156408    0
          ..
 279863    1
 280143    1
 280149    1
 28

# Splitting The Dataset Into Train And Test Data

In [None]:
# Hold out 10% of the balanced data for testing. Stratifying on Y keeps the
# class ratio the same in both splits, and random_state fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=2,
    stratify=Y,
)

In [None]:
# (rows, columns) of the full, training, and test feature sets.
(X.shape, X_train.shape, X_test.shape)

((984, 30), (885, 30), (99, 30))

# Model Training:

## Logistic Regression

In [None]:
# Logistic-regression classifier. max_iter caps the solver at 8000 iterations
# (presumably raised so the solver converges on the unscaled features - confirm).
model = LogisticRegression(max_iter=8000)

In [None]:
# Fit the classifier on the training features and labels.
model.fit(X_train, Y_train)

# Model Evaluation

In [None]:
#accuracy score on training data
X_train_prediction=model.predict(X_train)
accuracy=accuracy_score(X_train_prediction,Y_train)
print("Accuracy Score On Training Data: ",accuracy)

Accuracy Score On Training Data:  0.9559322033898305


In [None]:
#accuracy score on test data
X_test_prediction=model.predict(X_test)
accuracy=accuracy_score(X_test_prediction,Y_test)
print("Accuracy Score On Test Data: ",accuracy)

Accuracy Score On Test Data:  0.9292929292929293


## 📌 Summary

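Logistic regression reached about 96% accuracy on the training split and about 93% on the test split of the balanced, under-sampled dataset. Because both splits were drawn from that balanced sample, these scores do not show how the model would perform on the original, highly imbalanced data, where accuracy alone can be misleading; precision and recall on the fraud class should also be reported. Further steps could include handling imbalance using SMOTE, using ensemble models, and tuning with cross-validation for robust fraud detection.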
