Credit Card Fraud Detection

   

In [None]:
##import libraries

import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt


from imblearn.under_sampling import RandomUnderSampler

In [None]:
## Read the train and test CSV files from the Kaggle input directory into DataFrames

train_df = pd.read_csv("/kaggle/input/fraud-detection/fraudTrain.csv")
test_df = pd.read_csv("/kaggle/input/fraud-detection/fraudTest.csv")

In [None]:
## Preview the first five rows of the training data
train_df.head(5)

In [None]:

## Print a concise summary (columns, dtypes, non-null counts) of the training data
train_df.info()

In [None]:

## Descriptive statistics for every column; include="all" also covers the non-numeric ones
train_df.describe( include = "all")


In [None]:
## Count the missing values in each column of the training data
train_df.isna().sum()

In [None]:
## Count the missing values in each column of the test data
test_df.isna().sum()

In [None]:
## Count the rows of the training data that are exact duplicates of an earlier row

train_df.duplicated().sum()

In [None]:
## Count the rows of the test data that are exact duplicates of an earlier row
test_df.duplicated().sum()

In [None]:
## Inspect a single row (the first one) with all of its columns
train_df.iloc[0,:]

In [None]:
## Number of distinct values in the trans_num column of the training data
train_df["trans_num"].nunique()

In [None]:
## Print the number of distinct values found in every column of the training data
for column_name, unique_values in train_df.nunique().items():
    print(f'Unique values in column {column_name}: {unique_values}')

In [1]:
## Drop the columns that are not used as model features (e.g. "Unnamed: 0")
## from both the training and the test data, in place.
dropColumn = [
    "Unnamed: 0",
    "job",
    "cc_num",
    "trans_num",
    "zip",
    "unix_time",
    "merch_lat",
    "merch_long",
    "dob",
    "first",
    "last",
    "street",
    "city",
]
train_df.drop(columns=dropColumn, inplace=True)
test_df.drop(columns=dropColumn, inplace=True)

NameError: name 'train_df' is not defined

In [None]:
## Preview the first two rows of the training data after dropping the unused columns
train_df.head(2)

In [None]:
## Helper that converts a transaction timestamp string into the name of its weekday
def datetoDay(time):
    """Return the weekday name for a ``"%Y-%m-%d %H:%M:%S"`` timestamp string.

    ``time`` is parsed with ``datetime.strptime`` and formatted with ``"%A"``,
    so a string that does not match the format raises ``ValueError``.
    """
    return datetime.strptime(time, "%Y-%m-%d %H:%M:%S").strftime("%A")

In [None]:
## Replace every transaction timestamp in train and test with the name of its weekday.
## The whole column is parsed in one vectorised call instead of running a
## Python-level strptime for each row; the timestamp format is unchanged.
train_df["trans_date_trans_time"] = pd.to_datetime(
    train_df["trans_date_trans_time"], format="%Y-%m-%d %H:%M:%S"
).dt.day_name()
test_df["trans_date_trans_time"] = pd.to_datetime(
    test_df["trans_date_trans_time"], format="%Y-%m-%d %H:%M:%S"
).dt.day_name()

In [None]:
## Preview the training data after the timestamp column was converted to weekday names
train_df.head()

In [None]:
## Count the rows per is_fraud value to see how imbalanced the target is
train_df["is_fraud"].value_counts()

In [None]:
## Bar chart of the number of rows per is_fraud value
plt.style.use("ggplot")
sns.countplot(data=train_df, x="is_fraud")

In [None]:
## Count plot of gender, split by the is_fraud value
sns.countplot(data=train_df, x="gender", hue="is_fraud")

In [None]:
## Count plot of the transaction weekday, split by the is_fraud value
sns.countplot(data=train_df, x="trans_date_trans_time", hue="is_fraud")

## Data Preprocessing

In [None]:
## Import necessary libraries for preprocessing

from sklearn.preprocessing import OneHotEncoder , StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer

## Separate the features from the is_fraud target for both data sets
x_train = train_df.drop(columns="is_fraud")
y_train = train_df["is_fraud"]

x_test = test_df.drop(columns="is_fraud")
y_test = test_df["is_fraud"]

## Columns that are one-hot encoded and columns that are standardised
categ = ["trans_date_trans_time", "merchant", "category", "gender", "state"]
numeric = ["amt", "lat", "long", "city_pop"]

## handle_unknown="ignore": a category that only occurs in the test data is
## encoded as all zeros instead of making transform() raise.
encoder = OneHotEncoder(handle_unknown="ignore")
scaler = StandardScaler()

## Fit the transformer on the training features only and reuse the fitted
## transformer for the test features.
transf = ColumnTransformer(
    transformers=[("categorical", encoder, categ), ("num", scaler, numeric)],
    remainder="passthrough",
)
x_train = transf.fit_transform(x_train)
x_test = transf.transform(x_test)

In [None]:
## Shapes of the transformed train/test feature matrices and of the train/test labels

x_train.shape,x_test.shape ,y_train.shape ,y_test.shape

In [None]:
## Create a testing function for model
def model_testing(estimator):
    """Print test-set metrics for a fitted estimator.

    The estimator predicts on the notebook-level ``x_test`` and the result is
    compared with the notebook-level ``y_test``. Both names are looked up when
    the function is called, so the values assigned by the most recently run
    preprocessing cell are used.

    Parameters
    ----------
    estimator
        Fitted object exposing ``predict``.

    Returns
    -------
    None
        The accuracy score, the confusion matrix and the classification
        report are printed, not returned.
    """
    ## Prediction on data
    y_preds = estimator.predict(x_test)

    ## Print accuracy score
    print("Accuracy Score : ", accuracy_score(y_test, y_preds))

    ## Print confusion matrix
    print("\nConfusion Matrix :\n ", confusion_matrix(y_test, y_preds), "\n")

    ## Print classification report
    print("classification_report: \n\n", classification_report(y_test, y_preds))

## Model selection

In [None]:
## Import all the models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

### Logistic Regression


In [None]:
## Logistic regression fitted on the original (unbalanced) training data
model = LogisticRegression().fit(x_train, y_train)
model_testing(model)

### DecisionTreeClassifier


In [None]:
## Decision tree fitted on the original (unbalanced) training data
dt = DecisionTreeClassifier().fit(x_train, y_train)
model_testing(dt)
dt.score(x_test, y_test)


### RandomForestClassifier 

In [None]:
## Random forest fitted on the original (unbalanced) training data
rf = RandomForestClassifier().fit(x_train, y_train)
model_testing(rf)
rf.score(x_test, y_test)

# Model prediction by Balancing Samples

## UnderSampling

In [None]:
## Balance the imbalanced training data by random under-sampling
from imblearn.under_sampling import RandomUnderSampler

## Fixed seed so that the sampled subset, and every score computed from it,
## is the same on each run.
sampler = RandomUnderSampler(random_state=42)

## Separate the features from the is_fraud target for both data sets
x_train = train_df.drop(columns="is_fraud")
y_train = train_df["is_fraud"]

x_test = test_df.drop(columns="is_fraud")
y_test = test_df["is_fraud"]

## Only the training data is resampled; the test data keeps its original rows
x_sampled, y_sampled = sampler.fit_resample(x_train, y_train)

## Columns that are one-hot encoded and columns that are standardised
categ = ["trans_date_trans_time", "merchant", "category", "gender", "state"]
numeric = ["amt", "lat", "long", "city_pop"]

## handle_unknown="ignore": the encoder is fitted on the reduced sample, so a
## category that is missing from it but present in the test data is encoded
## as all zeros instead of making transform() raise.
encoder = OneHotEncoder(handle_unknown="ignore")
scaler = StandardScaler()

transf = ColumnTransformer(
    transformers=[("categorical", encoder, categ), ("num", scaler, numeric)],
    remainder="passthrough",
)
x_train = transf.fit_transform(x_sampled)
x_test = transf.transform(x_test)

In [None]:
## Shapes of the transformed train/test feature matrices, the resampled train labels and the test labels

x_train.shape,x_test.shape ,y_sampled.shape ,y_test.shape

### Logistic Regression 


In [None]:
## Logistic regression fitted on the under-sampled training data
model = LogisticRegression().fit(x_train, y_sampled)
model_testing(model)

### DecisionTreeClassifier


In [None]:
## Decision tree fitted on the under-sampled training data
dt = DecisionTreeClassifier().fit(x_train, y_sampled)
model_testing(dt)
dt.score(x_test, y_test)

### RandomForestClassifier 


In [None]:
## Random forest fitted on the under-sampled training data
rf = RandomForestClassifier().fit(x_train, y_sampled)
model_testing(rf)
rf.score(x_test, y_test)

## OverSampling

In [None]:
## Balance the imbalanced training data by random over-sampling
from imblearn.over_sampling import RandomOverSampler

## Fixed seed so that the resampled rows, and every score computed from them,
## are the same on each run.
Over_sampler = RandomOverSampler(random_state=42)

## Separate the features from the is_fraud target for both data sets
x_train = train_df.drop(columns="is_fraud")
y_train = train_df["is_fraud"]

x_test = test_df.drop(columns="is_fraud")
y_test = test_df["is_fraud"]

## Only the training data is resampled; the test data keeps its original rows
x_sampled_O, y_sampled_O = Over_sampler.fit_resample(x_train, y_train)

## Columns that are one-hot encoded and columns that are standardised
categ = ["trans_date_trans_time", "merchant", "category", "gender", "state"]
numeric = ["amt", "lat", "long", "city_pop"]

## handle_unknown="ignore": a category that only occurs in the test data is
## encoded as all zeros instead of making transform() raise.
encoder = OneHotEncoder(handle_unknown="ignore")
scaler = StandardScaler()

transf = ColumnTransformer(
    transformers=[("categorical", encoder, categ), ("num", scaler, numeric)],
    remainder="passthrough",
)
x_train = transf.fit_transform(x_sampled_O)
x_test = transf.transform(x_test)

### LogisticRegression


In [None]:
## Logistic regression fitted on the over-sampled training data
model = LogisticRegression().fit(x_train, y_sampled_O)
model_testing(model)

### DecisionTreeClassifier


In [None]:
## Decision tree fitted on the over-sampled training data
dt = DecisionTreeClassifier().fit(x_train, y_sampled_O)
model_testing(dt)
dt.score(x_test, y_test)

### RandomForestClassifier 


In [None]:
## Random forest fitted on the over-sampled training data
rf = RandomForestClassifier().fit(x_train, y_sampled_O)
model_testing(rf)
rf.score(x_test, y_test)