In [1]:

import pandas as pd 
import numpy as np 

# Load the raw transactions from the CSV next to the notebook into a DataFrame,
# then preview the first five rows as the cell output.
credit_card_data = pd.read_csv(filepath_or_buffer='creditcard.csv')
credit_card_data.head(5)

ModuleNotFoundError: No module named 'pandas'



### Steps for the Analysis
1. **Shuffle the Data**  
   Randomly shuffle the dataset to remove any potential order bias and ensure a fair distribution of classes during splitting.

2. **One-Hot Encoding**  
   Transform categorical variables (e.g., the `Class` column) into binary columns to make the data suitable for machine learning algorithms.

3. **Normalize the Data**  
   Scale the feature values to fall within the range [0, 1], ensuring that all features contribute equally to the model.

4. **Split Features and Labels (X and y)**  
   Separate the dataset into independent variables (X) and target variables (y) for model training.

5. **Convert to NumPy Arrays**  
   Convert the dataframes to NumPy arrays for efficient numerical computation and compatibility with machine learning libraries.

6. **Split Data into Training and Testing Sets**  
   Divide the data into training and testing subsets to evaluate the model's performance on unseen data.



In [None]:
# Shuffle every row (frac=1 keeps the full dataset) so that the positional
# train/test split done later is not biased by the file's original ordering.
# A fixed random_state makes the shuffle -- and therefore the split and every
# downstream metric -- reproducible after a kernel restart.
shuffled_data= credit_card_data.sample(frac=1, random_state=42)

In [None]:
# One-hot encode the label: `Class` is replaced by two indicator columns,
# Class_0 (set for legitimate rows) and Class_1 (set for fraudulent rows).
# Everything is cast to float so the arithmetic below is uniform.
one_hote_data = pd.get_dummies(shuffled_data, columns=['Class']).astype(float)

# Min-max scale each column to [0, 1]. The indicator columns already span
# {0, 1}, so they pass through unchanged.
# NOTE(review): the min/max are taken over the whole dataset, i.e. before the
# train/test split further down -- confirm this is acceptable.
normalized_data = one_hote_data.pipe(
    lambda frame: (frame - frame.min()) / (frame.max() - frame.min())
)

# Features = every column except the two label indicators; labels = the two
# indicator columns, in (Class_0, Class_1) order.
df_X = normalized_data.drop(columns=['Class_0', 'Class_1'])
df_y = normalized_data.loc[:, ["Class_0", "Class_1"]]

In [None]:
# Materialise the feature and label frames as float32 NumPy arrays.
ar_X = np.asarray(df_X.values, dtype=np.float32)
ar_y = np.asarray(df_y.values, dtype=np.float32)


### Important Note
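The dataset is highly imbalanced, with significantly fewer instances of fraudulent transactions compared to legitimate ones. To address this imbalance, we can apply **logit weighting**: the fraud entry of each training label is multiplied by the inverse of the fraud ratio, so that errors on fraudulent rows contribute proportionally more to the loss. This technique adjusts the importance of the minority class, ensuring the model pays more attention to underrepresented data during training.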


In [None]:
# Positional 80% / 20% train/test split (the rows were shuffled earlier).
# np.split cuts each array once at train_size and returns the two pieces in
# order: everything before the cut (train) and everything from it on (test).
train_size = int(0.8 * len(ar_X))
raw_X_train, raw_X_test = np.split(ar_X, [train_size])
raw_y_tarin, raw_y_test = np.split(ar_y, [train_size])

In [None]:
# Measure the class imbalance. np.unique returns the labels in sorted order, so
# the counts unpack as (Class 0, Class 1), i.e. (legitimate, fraudulent).
# fraud_ratio is a FRACTION in [0, 1], not a percentage: a value around 0.0017
# means roughly 0.17% of transactions are fraudulent (confirm against the
# printed value). It is kept as a fraction because the class weight used for
# training is computed as 1 / fraud_ratio.
count_legit, count_fraud=  np.unique(credit_card_data['Class'],return_counts=True)[1]
fraud_ratio= count_fraud/(count_legit+count_fraud)
# Print a real percentage so the label and the number agree.
print('fraud ratio: {:.4%}'.format(fraud_ratio))


In [None]:
# Up-weight the minority class: the fraud column of the TRAINING labels is
# scaled by the inverse fraud ratio so each fraud row counts more in the loss.
weighting= 1/fraud_ratio
# Rebuild the weighted labels from a copy of the training slice of ar_y rather
# than multiplying raw_y_tarin in place. raw_y_tarin was a view into ar_y, so
# the in-place version also rewrote ar_y and compounded the weight every time
# this cell was re-run; starting from a fresh copy makes the cell idempotent.
raw_y_tarin= ar_y[:train_size].copy()
raw_y_tarin[:,1]*= weighting

In [None]:
# Sanity check: display the fraud column of the training labels after weighting.
raw_y_tarin[:,1]

In [None]:

# Build the computation model with the TensorFlow 1.x-style API: the compat.v1
# module is imported as `tf` and v2 behavior is switched off, which the
# placeholder/Session code in the following cells relies on.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

In [None]:
# Layer widths for the network.
# Input and output sizes are read from the data (the notebook expects 30
# feature columns and 2 label columns); the hidden layers use 100 and 150 cells.
input_dimensions, output_dimensions = ar_X.shape[1], ar_y.shape[1]
num_layer_1_cells, num_layer_2_cells = 100, 150

# Echo the data-derived sizes so a mismatch is visible immediately.
print(input_dimensions)
print(output_dimensions)

In [None]:
# Placeholders for the training batch; concrete values are supplied through
# feed_dict when the graph is run. The leading None leaves the batch size open.
X_train_node= tf.placeholder(tf.float32, [None, input_dimensions], name="X_train" )
y_train_node= tf.placeholder(tf.float32, [None, output_dimensions], name="y_train")

# The test split is embedded in the graph as constants, so it needs no feeding.
X_test_node= tf.constant(raw_X_test, name="X_test")
y_test_node= tf.constant(raw_y_test, name="y_test")

# Weight matrices start from small random values rather than zeros. With
# all-zero weights every unit in a layer computes the same output and receives
# the same gradient, so the units can never learn different features. Biases
# carry no such symmetry and can safely start at zero.

# Layer 1: input -> first hidden layer.
weight_1_node= tf.Variable(tf.truncated_normal([input_dimensions,num_layer_1_cells], stddev=0.1), name="weight_1")
biases_1_node= tf.Variable(tf.zeros([num_layer_1_cells]), name="biases_1")

# Layer 2: first hidden layer -> second hidden layer.
weight_2_node= tf.Variable(tf.truncated_normal([num_layer_1_cells,num_layer_2_cells], stddev=0.1), name="weight_2")
biases_2_node= tf.Variable(tf.zeros([num_layer_2_cells]), name="biases_2")

# Layer 3: second hidden layer -> output, one cell per class
# ([1,0] for legitimate, [0,1] for fraudulent).
weight_3_node= tf.Variable(tf.truncated_normal([num_layer_2_cells,output_dimensions], stddev=0.1), name="weight_3")
biases_3_node= tf.Variable(tf.zeros([output_dimensions]), name="biases_3")




### Create a Neural Network Function
Develop a function that processes an input tensor through three layers and outputs a tensor indicating whether a transaction is fraudulent or legitimate. The two hidden layers use sigmoid activations (with dropout applied after the second), and the output layer uses softmax to turn the result into class probabilities for the input tensor.

In [None]:
def network (input_tensor, keep_prob=0.85):
    """Build the three-layer classifier on top of `input_tensor`.

    The layers use the module-level weight_*_node / biases_*_node variables:
    sigmoid -> sigmoid with dropout -> softmax.

    Args:
        input_tensor: 2-D float tensor whose second dimension matches the first
            dimension of weight_1_node.
        keep_prob: probability of keeping each second-layer activation in the
            dropout step. Defaults to 0.85, the value that used to be
            hard-coded. Pass 1.0 to build a graph without dropout, e.g. for
            evaluation.

    Returns:
        Tensor with one row per input row and one column per class, holding
        softmax probabilities (not logits).
    """
    layer1= tf.nn.sigmoid(tf.matmul(input_tensor,weight_1_node)+biases_1_node)
    layer2= tf.nn.sigmoid(tf.matmul(layer1,weight_2_node)+biases_2_node)
    # Dropout randomly zeroes activations so the model cannot lean on a few units.
    # keep_prob is passed by keyword so it is never mistaken for a drop rate.
    layer2= tf.nn.dropout(layer2, keep_prob=keep_prob)
    # Softmax pairs naturally with the one-hot label encoding.
    layer3= tf.nn.softmax(tf.matmul(layer2,weight_3_node)+biases_3_node)
    return layer3


### Create a Prediction Function
This step builds the prediction nodes for the training and testing data by passing each input through the network. It's important to note that `X_train_node` serves as a placeholder, with actual values being provided dynamically at runtime.


In [None]:
# Training-time prediction node: X_train_node is a placeholder, so this only
# defines the computation; values arrive via feed_dict when the graph is run.
y_train_prediction= network(X_train_node)
# Prints the symbolic tensor itself, not numeric predictions.
print(y_train_prediction)
# Test-time prediction node over the constant test features.
# NOTE(review): this goes through the same network() call, so the dropout step
# inside network() is also active for test predictions -- confirm this is intended.
y_test_prediction= network(X_test_node)

In [None]:
# Cross-entropy loss between the (class-weighted) training labels and the
# network's predictions.
# network() already returns softmax probabilities, whereas
# tf.losses.softmax_cross_entropy expects unnormalised logits and applies
# softmax itself. Feeding the probabilities directly would push them through a
# second softmax and flatten the loss. Taking log(p) first makes the internal
# softmax recover p (softmax(log p) == p when p sums to 1), so the loss is the
# true cross-entropy. The clip keeps log() away from zero.
train_logits= tf.math.log(tf.clip_by_value(y_train_prediction, 1e-10, 1.0))
cross_entropy= tf.losses.softmax_cross_entropy(y_train_node, train_logits)

# Adam minimises the loss by updating the three layers' variables,
# with a learning rate of 0.005.
optimizer= tf.train.AdamOptimizer(0.005).minimize(cross_entropy)

In [None]:
# Accuracy helper shared by the training loop and the final evaluation.
def calculate_accuracy(actual,predicted):
    """Return the percentage of rows whose predicted class matches the actual one.

    Both arguments are 2-D, one row per sample and one column per class; the
    class of a row is the index of its largest entry.
    """
    true_classes = np.asarray(actual).argmax(axis=1)
    guessed_classes = np.asarray(predicted).argmax(axis=1)
    n_correct = np.equal(guessed_classes, true_classes).sum()
    return 100 * n_correct / guessed_classes.shape[0]



In [None]:
# Number of optimisation steps; the training loop feeds the full training set
# once per epoch.
num_epochs= 100

import time 

In [None]:
with tf.Session() as session:
    # Give every tf.Variable its initial value before the first training step.
    session.run(tf.global_variables_initializer())

    # The whole training set is fed on every step (full-batch training).
    training_feed = {X_train_node: raw_X_train, y_train_node: raw_y_tarin}

    for epoch in range(num_epochs):
        start_time = time.time()
        _, cross_entropy_score = session.run([optimizer, cross_entropy],
                                             feed_dict=training_feed)

        # Only report on every tenth epoch.
        if epoch % 10 != 0:
            continue

        # Elapsed time covers the single optimisation step above.
        timer = time.time() - start_time
        print(f'Epoch: {epoch}',
              f'Cureent loss: {cross_entropy_score:.5f}',
              f'Elapsed time: {timer:.2f}')

        # Progress check on the held-out test split.
        final_y_test = session.run(y_test_node)
        final_y_test_prediction = session.run(y_test_prediction)
        final_accuracy = calculate_accuracy(final_y_test, final_y_test_prediction)
        print(f'current accuracy {final_accuracy:.2f}%')

    # Final evaluation after the last epoch; these arrays are reused by the
    # fraud-specific accuracy cell.
    final_y_test = session.run(y_test_node)
    final_y_test_prediction = session.run(y_test_prediction)
    final_accuracy = calculate_accuracy(final_y_test, final_y_test_prediction)
    print(f'final accuracy {final_accuracy:.2f}%')
            

In [None]:
# Accuracy restricted to the test rows whose true label is fraud, i.e. whose
# second one-hot column equals 1. The same row mask selects the matching
# predictions.
fraud_rows = final_y_test[:, 1] == 1
final_fraud_y_test = final_y_test[fraud_rows]
final_fraud_y_test_prediction = final_y_test_prediction[fraud_rows]
final_fraud_accuracy = calculate_accuracy(final_fraud_y_test, final_fraud_y_test_prediction)
print(f'Final fraud specific accuracy: {final_fraud_accuracy:.2f}%')