In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
# loading and reading the dataset
df = pd.read_csv('/content/creditcard.csv')

In [None]:
# checking the first 5 rows of the data
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [None]:
# here checking the last 5 rows of the data

df.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.01448,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.05508,2.03503,-0.738589,0.868229,1.058415,0.02433,0.294869,0.5848,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.24964,-0.557828,2.630515,3.03126,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.24044,0.530483,0.70251,0.689799,-0.377961,0.623708,-0.68618,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.0,0
284806,172792.0,-0.533413,-0.189733,0.703337,-0.506271,-0.012546,-0.649617,1.577006,-0.41465,0.48618,...,0.261057,0.643078,0.376777,0.008797,-0.473649,-0.818267,-0.002415,0.013649,217.0,0


In [None]:

df.shape

(284807, 31)

In [None]:
# getting the general information about the data like data type of each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [None]:
# checking if there are any null or NaN values in the dataset or not
df.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


In [None]:

# now let's check the distribution of the normal transactions and fraud trandaction

df['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,284315
1,492


In our dataset, we have two types of target variables: 0 represents normal transactions, while 1 represents fraudulent transactions. We can observe that this is an imbalanced dataset, with over 98% to 99% of the data belonging to the normal transactions (0). To address this imbalance, we can employ techniques such as undersampling or oversampling.

Since we have the right dataset for training and testing, we can now proceed to store the independent and dependent variables in new variables.

In [None]:
# storing the independent and dependent features

X = df.drop('Class', axis=1)

y = df['Class']

In [None]:
X.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99


In [None]:
y.head()

Unnamed: 0,Class
0,0
1,0
2,0
3,0
4,0


Now, here as we have stored the indepedent and dependent variable into the new variable so now we will split the dataset into training and testing



In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


**Logistic** **Regression**

In [None]:
lr = LogisticRegression()

In [None]:
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
y_predlr = lr.predict(X_test)

In [None]:
# checking the accuracy on training data
Y_train_prediction = lr.predict(X_train)
training_data_accuracy = accuracy_score(Y_train_prediction,y_train)
print('Training Dataset Accuracy', training_data_accuracy)

Training Dataset Accuracy 0.9989566822495536


In [None]:
accuracy_score(y_test, y_predlr)

0.9988998513628969

In [None]:
print(classification_report(y_test, y_predlr))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.65      0.66      0.66       136

    accuracy                           1.00     85443
   macro avg       0.83      0.83      0.83     85443
weighted avg       1.00      1.00      1.00     85443



In [None]:
confusion_matrix(y_test, y_predlr)

array([[85259,    48],
       [   46,    90]])

After analyzing the confusion matrix, we can observe that the True Positive (TP) records are high, while the True Negative (TN) records are low. Additionally, both the False Negative (FN) and False Positive (FP) records are relatively high. Our primary objective is to reduce the values of FN and FP.

To achieve this, we can apply various techniques, particularly because our dataset is imbalanced. We will use cross-validation (CV) and explore hyperparameter tuning to improve our model's performance. After implementing these strategies, I will evaluate the new score of the model.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV

In [None]:
log_class = LogisticRegression()
grid = {'C':10.0**np.arange(-2,3), 'penalty':['l1','l2']}
cv = KFold(n_splits=5,random_state=None, shuffle=False)

In [None]:
clf = GridSearchCV(log_class, grid,cv=cv,n_jobs=-1,scoring='f1_macro')

In [None]:
clf.fit(X_train, y_train)

25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 56, in _check_solver
    

In [None]:
y_predlrgrid = clf.predict(X_test)
print(confusion_matrix(y_test,y_predlrgrid))
print(accuracy_score(y_test,y_predlrgrid))
print(classification_report(y_test,y_predlrgrid))

[[85267    40]
 [   58    78]]
0.9988530365272755
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.66      0.57      0.61       136

    accuracy                           1.00     85443
   macro avg       0.83      0.79      0.81     85443
weighted avg       1.00      1.00      1.00     85443



**RandomForest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfclassifier = RandomForestClassifier()
rfclassifier.fit(X_train, y_train)

In [None]:
# checking the accuracy on training data
Y_train_rfprediction = rfclassifier.predict(X_train)
training_data_accuracy = accuracy_score(Y_train_rfprediction,y_train)
print('Training Dataset Accuracy', training_data_accuracy)

In [None]:
# accuracy score and classification report of the Random Forest
y_predrfclassifier = rfclassifier.predict(X_test)
print(confusion_matrix(y_test,y_predrfclassifier))
print(accuracy_score(y_test,y_predrfclassifier))
print(classification_report(y_test,y_predrfclassifier))

Now, I will use oversampling and then use the Random Forest Classifier model to check the score again and I will check again the FN and FP

In [None]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

In [None]:
os = RandomOverSampler(0.75)
X_train_ns, y_train_ns = os.fit_resample(X_train, y_train)
print("The number of classes before fit{}".format(Counter(y_train)))
print("The number of classes after fit{}".format(Counter(y_train_ns)))

In [None]:
from sklearn.ensemble import RandomForestClassifier
osrfclassifier = RandomForestClassifier()
osrfclassifier.fit(X_train_ns, y_train_ns)

In [None]:
y_predosrf = osrfclassifier.predict(X_test)
print(confusion_matrix(y_test,y_predosrf))
print(accuracy_score(y_test,y_predosrf))
print(classification_report(y_test,y_predosrf))

**AUC ROC Curve**

Logistic Classifier

In [None]:
from sklearn.linear_model import LogisticRegression
model_logistic = LogisticRegression()
model_logistic.fit(X_train, y_train)
y_pred_logistic = model_logistic.decision_function(X_test)

In [None]:
from sklearn.metrics import auc, roc_curve

In [None]:
logistic_fpr, logistic_tpr, threshold = roc_curve(y_test, y_pred_logistic)
auc_logistic = auc(logistic_fpr, logistic_tpr)

plt.figure(figsize=(5, 5), dpi=100)

plt.plot(logistic_fpr, logistic_tpr, marker='.', label='Logistic (auc = %0.3f)' % auc_logistic)

plt.xlabel('False Positive Rate -->')
plt.ylabel('True Positive Rate -->')

plt.legend()

plt.show()