In [1]:
import pandas as pd

# Loading the datasets
train = pd.read_csv('Dataset/fraudTrain.csv')
test = pd.read_csv('Dataset/fraudTest.csv')
train.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [2]:
def preprocess(DF):
    #Removing redundant columns
    columns = ['Unnamed: 0', 'cc_num', 'merchant', 'first', 'last', 'street', 'city','state', 'zip', 'job', 'trans_num', 'unix_time']
    DF.drop(columns, axis='columns', inplace=True)

    # Calculating age from transaction date and DOB
    DF['trans_date_trans_time']=pd.to_datetime(DF['trans_date_trans_time'])
    DF['trans_date']=DF['trans_date_trans_time'].dt.strftime('%Y-%m-%d')
    DF['trans_date']=pd.to_datetime(DF['trans_date'])
    DF['dob']=pd.to_datetime(DF['dob'])
    DF['age'] = (DF['trans_date']-DF['dob']).dt.days / 365
    DF['trans_month'] = DF['trans_date'].dt.month
    DF['trans_year'] = DF['trans_date'].dt.year

    # Calculating distance between credit card holder and merchant
    DF['lat_distance'] = abs(DF['merch_lat']-DF['lat'])
    DF['long_distance'] = abs(DF['merch_long']-DF['long'])

    # Encoding gender values
    DF['gender'] = DF['gender'].map(lambda x : 1 if x == 'M' else 0)
    DF['gender']=DF['gender'].astype(int)

    # Encoding category values
    DF = pd.get_dummies(DF, columns=['category'])

    columns = ['trans_date_trans_time', 'lat', 'long', 'dob', 'merch_lat', 'merch_long', 'trans_date']
    DF.drop(columns, axis='columns', inplace=True)
    
    return DF

In [3]:
# Preprocessing the train dataset
train_pre = preprocess(train.copy())
train_pre.head()

Unnamed: 0,amt,gender,city_pop,is_fraud,age,trans_month,trans_year,lat_distance,long_distance,category_entertainment,...,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
0,4.97,0,3495,0,30.835616,1,2019,0.067507,0.870215,False,...,False,False,False,False,True,False,False,False,False,False
1,107.23,0,149,0,40.558904,1,2019,0.271247,0.024038,False,...,True,False,False,False,False,False,False,False,False,False
2,220.11,1,4154,0,56.989041,1,2019,0.969904,0.107519,True,...,False,False,False,False,False,False,False,False,False,False
3,45.0,1,1939,0,52.005479,1,2019,0.803731,0.447271,False,...,False,False,False,False,False,False,False,False,False,False
4,41.96,1,99,0,32.786301,1,2019,0.254299,0.830441,False,...,False,False,False,False,False,True,False,False,False,False


In [4]:
# Preprocessing the test dataset
test_pre = preprocess(test.copy())
test_pre.head()

Unnamed: 0,amt,gender,city_pop,is_fraud,age,trans_month,trans_year,lat_distance,long_distance,category_entertainment,...,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
0,2.86,1,333497,0,52.293151,6,2020,0.020491,0.265214,False,...,False,False,False,False,False,False,True,False,False,False
1,29.84,0,302,0,30.446575,6,2020,0.870202,0.475569,False,...,False,False,False,False,False,False,True,False,False,False
2,41.28,0,34496,0,49.70137,6,2020,0.17709,0.659611,False,...,False,True,False,False,False,False,False,False,False,False
3,60.05,1,54767,0,32.931507,6,2020,0.242698,0.063961,False,...,False,False,False,False,False,True,False,False,False,False
4,3.19,1,1126,0,65.005479,6,2020,0.706248,0.867734,False,...,False,False,False,False,False,False,False,False,False,True


In [5]:
from sklearn.model_selection import train_test_split

# Splitting the train dataset into training and testing dataset in the ratio 3:1
X_train, X_test, Y_train, Y_test = train_test_split(train_pre.drop(['is_fraud'], axis='columns'), train_pre['is_fraud'], test_size = 0.25, random_state = 45)

In [6]:
from sklearn.linear_model import LogisticRegression

# Using Logistic Regression classifier
LR_model = LogisticRegression()
LR_model.fit(X_train, Y_train)

In [7]:
from sklearn.metrics import accuracy_score, classification_report

Y_predict = LR_model.predict(X_test)
accuracy = accuracy_score(Y_test, Y_predict)
print(accuracy)
report = classification_report(Y_test, Y_predict, zero_division=1)
print(report)

0.9935373215822612
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    322251
           1       0.00      0.00      1.00      1918

    accuracy                           0.99    324169
   macro avg       0.50      0.50      1.00    324169
weighted avg       0.99      0.99      1.00    324169



In [8]:
# Testing Logistic Regression model on test data set
test_predict = LR_model.predict(test_pre.drop(['is_fraud'], axis='columns'))

# Observing performance of model on the given dataset
accuracy = accuracy_score(test_pre['is_fraud'], test_predict)
print("Accuracy of model: ", accuracy)
report = classification_report(test_pre['is_fraud'], test_predict, zero_division=1)
print("Classification Report:\n", report)

Accuracy of model:  0.9955157192753892
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.00      0.00      1.00      2145

    accuracy                           1.00    555719
   macro avg       0.50      0.50      1.00    555719
weighted avg       0.99      1.00      1.00    555719



In [9]:
from sklearn.tree import DecisionTreeClassifier

# Using Decision Tree classifier
DT_model = DecisionTreeClassifier()
DT_model.fit(X_train, Y_train)

In [10]:
Y_predict = DT_model.predict(X_test)
accuracy = accuracy_score(Y_test, Y_predict)
print(accuracy)
report = classification_report(Y_test, Y_predict, zero_division=1)
print(report)

0.9962056828382726
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    322251
           1       0.67      0.70      0.68      1918

    accuracy                           1.00    324169
   macro avg       0.84      0.85      0.84    324169
weighted avg       1.00      1.00      1.00    324169



In [11]:
# Testing Decision Tree model on test data set
test_predict = DT_model.predict(test_pre.drop(['is_fraud'], axis='columns'))

# Observing performance of model on the given dataset
accuracy = accuracy_score(test_pre['is_fraud'], test_predict)
print("Accuracy of model: ", accuracy)
report = classification_report(test_pre['is_fraud'], test_predict, zero_division=1)
print("Classification Report:\n", report)

Accuracy of model:  0.9965432169855628
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.55      0.62      0.58      2145

    accuracy                           1.00    555719
   macro avg       0.77      0.81      0.79    555719
weighted avg       1.00      1.00      1.00    555719



In [12]:
from sklearn.ensemble import RandomForestClassifier

# Using Random Forest classifier
RF_model = RandomForestClassifier(n_estimators = 50)
RF_model.fit(X_train, Y_train)

In [13]:
Y_predict = RF_model.predict(X_test)
accuracy = accuracy_score(Y_test, Y_predict)
print(accuracy)
report = classification_report(Y_test, Y_predict, zero_division=1)
print(report)

0.9974951337111199
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    322251
           1       0.88      0.67      0.76      1918

    accuracy                           1.00    324169
   macro avg       0.94      0.83      0.88    324169
weighted avg       1.00      1.00      1.00    324169



In [14]:
# Testing Random Forest model on test data set
test_predict = RF_model.predict(test_pre.drop(['is_fraud'], axis='columns'))

# Observing performance of model on the given dataset
accuracy = accuracy_score(test_pre['is_fraud'], test_predict)
print("Accuracy of model: ", accuracy)
report = classification_report(test_pre['is_fraud'], test_predict, zero_division=1)
print("Classification Report:\n", report)

Accuracy of model:  0.9979629992856102
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.82      0.61      0.70      2145

    accuracy                           1.00    555719
   macro avg       0.91      0.80      0.85    555719
weighted avg       1.00      1.00      1.00    555719

