In [1]:
import pandas as pd
import numpy as np
train_data = pd.read_csv("fraudTrain.csv")
test_data = pd.read_csv('fraudTest.csv')

In [2]:
data = pd.concat([train_data, test_data]).reset_index()

In [3]:
#we don't need the first two columns for now so we delete those columns
data.drop(data.columns[:2], axis=1, inplace=True)
data.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [4]:
data.nunique()

trans_date_trans_time    1819551
cc_num                       999
merchant                     693
category                      14
amt                        60616
first                        355
last                         486
gender                         2
street                       999
city                         906
state                         51
zip                          985
lat                          983
long                         983
city_pop                     891
job                          497
dob                          984
trans_num                1852394
unix_time                1819583
merch_lat                1754157
merch_long               1809753
is_fraud                       2
dtype: int64

In [5]:
data.isnull().sum()

trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [6]:
data['gender'] = data['gender'].map({'M' : 1,'F':0})
data['gender']

0          0
1          0
2          1
3          1
4          1
          ..
1852389    1
1852390    1
1852391    0
1852392    1
1852393    1
Name: gender, Length: 1852394, dtype: int64

In [7]:
data.describe()

Unnamed: 0,cc_num,amt,gender,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0,1852394.0
mean,4.17386e+17,70.06357,0.4521959,48813.26,38.53931,-90.22783,88643.67,1358674000.0,38.53898,-90.22794,0.005210015
std,1.309115e+18,159.254,0.4977097,26881.85,5.07147,13.74789,301487.6,18195080.0,5.105604,13.75969,0.07199217
min,60416210000.0,1.0,0.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.02742,-166.6716,0.0
25%,180042900000000.0,9.64,0.0,26237.0,34.6689,-96.798,741.0,1343017000.0,34.74012,-96.89944,0.0
50%,3521417000000000.0,47.45,0.0,48174.0,39.3543,-87.4769,2443.0,1357089000.0,39.3689,-87.44069,0.0
75%,4642255000000000.0,83.1,1.0,72042.0,41.9404,-80.158,20328.0,1374581000.0,41.95626,-80.24511,0.0
max,4.992346e+18,28948.9,1.0,99921.0,66.6933,-67.9503,2906700.0,1388534000.0,67.51027,-66.9509,1.0


<h2>taking insights from the dataset and eliminating some columns<br></h2>

In [8]:
# Feature 1: Transaction Amount Decimal Part
data['amt_decimal'] = data['amt'] % 1

# Ensure 'trans_date_trans_time' is treated as a string
data['trans_date_trans_time'] = data['trans_date_trans_time'].astype(str)

# Feature 2: Age of Cardholder at the Time of Transaction
data['transaction_date'] = pd.to_datetime(data['trans_date_trans_time'].str.split(' ').str[0])
data['cardholder_age'] = (data['transaction_date'] - pd.to_datetime(data['dob'])).dt.days // 365

# Feature 3: Transaction Amount to City Population Ratio
data['amt_to_city_pop_ratio'] = data['amt'] / data['city_pop']

# Display the updated dataset with new features
print(data[['amt_decimal', 'cardholder_age', 'amt_to_city_pop_ratio']].head())

   amt_decimal  cardholder_age  amt_to_city_pop_ratio
0         0.97              30               0.001422
1         0.23              40               0.719664
2         0.11              56               0.052987
3         0.00              52               0.023208
4         0.96              32               0.423838


In [9]:
columns_to_drop = [
    'cc_num',            # Masked credit card numbers
    'trans_date_trans_time',  #We have unix_time
    'transaction_date',  # Same as unix_time
    'first',             # First name
    'last',              # Last name
    'street',            # Street address
    'city',              # City (state information is more relevant)
    'state',             # State (zip code and lat/long provide location info)
    'zip',               # Zip code (redundant with lat/long)
    'dob',               # Date of birth (we've calculated cardholder_age)
    'trans_num',         # Transaction number or identifier
]
data = data.drop(columns=columns_to_drop)

In [10]:
data.head()

Unnamed: 0,merchant,category,amt,gender,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud,amt_decimal,cardholder_age,amt_to_city_pop_ratio
0,"fraud_Rippin, Kub and Mann",misc_net,4.97,0,36.0788,-81.1781,3495,"Psychologist, counselling",1325376018,36.011293,-82.048315,0,0.97,30,0.001422
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,0,48.8878,-118.2105,149,Special educational needs teacher,1325376044,49.159047,-118.186462,0,0.23,40,0.719664
2,fraud_Lind-Buckridge,entertainment,220.11,1,42.1808,-112.262,4154,Nature conservation officer,1325376051,43.150704,-112.154481,0,0.11,56,0.052987
3,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,1,46.2306,-112.1138,1939,Patent attorney,1325376076,47.034331,-112.561071,0,0.0,52,0.023208
4,fraud_Keeling-Crist,misc_pos,41.96,1,38.4207,-79.4629,99,Dance movement psychotherapist,1325376186,38.674999,-78.632459,0,0.96,32,0.423838


In [11]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

categorical_columns = ['merchant', 'category', 'job']
for col in categorical_columns:
    data[col] = label_encoder.fit_transform(data[col])

In [12]:
data.head()

Unnamed: 0,merchant,category,amt,gender,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud,amt_decimal,cardholder_age,amt_to_city_pop_ratio
0,514,8,4.97,0,36.0788,-81.1781,3495,372,1325376018,36.011293,-82.048315,0,0.97,30,0.001422
1,241,4,107.23,0,48.8878,-118.2105,149,431,1325376044,49.159047,-118.186462,0,0.23,40,0.719664
2,390,0,220.11,1,42.1808,-112.262,4154,308,1325376051,43.150704,-112.154481,0,0.11,56,0.052987
3,360,2,45.0,1,46.2306,-112.1138,1939,330,1325376076,47.034331,-112.561071,0,0.0,52,0.023208
4,297,9,41.96,1,38.4207,-79.4629,99,116,1325376186,38.674999,-78.632459,0,0.96,32,0.423838


<h2>all are numerical data so we can go with our model implementations<br></h2>

In [13]:
train_data.shape

(1296675, 23)

In [14]:
k = data.columns.get_loc('is_fraud')

In [15]:
y_train = np.array(data.iloc[:train_data.shape[0],k])
y_test = np.array(data.iloc[train_data.shape[0]:,k])
print(y_train)
y_test

[0 0 0 ... 0 0 0]


array([0, 0, 0, ..., 0, 0, 0])

In [16]:
#splitting data
data = data.drop('is_fraud',axis = 1)
x_train = np.array(data.iloc[:train_data.shape[0],:])
x_test = np.array(data.iloc[train_data.shape[0]:,:])
x_test

array([[3.19000000e+02, 1.00000000e+01, 2.86000000e+00, ...,
        8.60000000e-01, 5.20000000e+01, 8.57578929e-06],
       [5.91000000e+02, 1.00000000e+01, 2.98400000e+01, ...,
        8.40000000e-01, 3.00000000e+01, 9.88079470e-02],
       [6.11000000e+02, 5.00000000e+00, 4.12800000e+01, ...,
        2.80000000e-01, 4.90000000e+01, 1.19666048e-03],
       ...,
       [4.96000000e+02, 7.00000000e+00, 8.68800000e+01, ...,
        8.80000000e-01, 3.90000000e+01, 2.35830619e-02],
       [7.50000000e+01, 1.30000000e+01, 7.99000000e+00, ...,
        9.90000000e-01, 5.50000000e+01, 6.19379845e-02],
       [1.25000000e+02, 0.00000000e+00, 3.81300000e+01, ...,
        1.30000000e-01, 2.70000000e+01, 3.28704063e-04]])

In [17]:
#implementing logistic regression for the above dataset
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
def logistic_regression(X, y, lr=0.01, epochs=1000):
    m, n = X.shape
    theta = np.zeros(n) 

    for epoch in range(epochs):
        z = np.dot(X, theta)
        h = sigmoid(z)
        error = h - y

        gradient = np.dot(X.T, error) / m
        theta -= lr * gradient

    return theta

theta = logistic_regression(x_train, y_train)
print(theta)

  return 1 / (1 + np.exp(-z))


[-2.52067984e+00  1.56847013e-02  2.61834525e+01 -3.33761351e-04
 -2.66843755e-01  6.59785303e-01 -1.42575584e+02 -1.80551122e+00
 -9.64728602e+06 -2.67387282e-01  6.59807210e-01 -3.19748086e-03
 -1.62164883e-01  4.04810796e-02]


In [18]:
def acc(x_train,y_train):
    count = 0
    predicted = []
    for i in range(x_train.shape[0]):
        predicted.append(np.dot(x_train[i],theta))
    for i in range(x_train.shape[0]):
        if predicted[i] > 0.6:
            predicted[i] = 1
        else:
            predicted[i] = 0
        if y_train[i] - predicted[i] == 0 :
            count = count + 1
    accuracy = (count/x_train.shape[0])*100
    return accuracy
print("train accuracy : ", acc(x_train,y_train))
print("test accuracy : ", acc(x_test,y_test))

train accuracy :  99.42113482561166
test accuracy :  99.61401355721146


<h3>Attained an accuracy of 99.614 when the model is Logistic Regression<br></h3>

In [19]:
#decision tree implementation


from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
decision_tree = DecisionTreeClassifier(max_depth=3, random_state=42)


decision_tree.fit(x_train, y_train)


test_predictions = decision_tree.predict(x_test)
train_predictions = decision_tree.predict(x_train)


test_accuracy = accuracy_score(y_test, test_predictions)
train_accuracy = accuracy_score(y_train,train_predictions)


print('train accuracy :',train_accuracy*100)
print("test accuracy:",test_accuracy*100)

train accuracy : 99.46131451597354
test accuracy: 99.61347371603274


<h3>Attained an accuracy of 99.613 with decision tree<br></h3>

In [20]:
#random forest implementation
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators = 20)

model.fit(x_train, y_train)

y_pred = model.predict(x_test)

# calculating the training and testing accuracies
print("Training accuracy :", model.score(x_train, y_train)*100)
print("Testing accuracy :", model.score(x_test, y_test)*100)

Training accuracy : 99.9874293867006
Testing accuracy : 99.7343981400672


<h2>attained an accuracy of 99.743 and highest among all the three models<br></h2>