# Financial Fraud Data Analysis Using Decision Trees

In [None]:
#Importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import plotly as px

In [None]:
# Load the transaction records from the project-relative CSV into `data`.
data = pd.read_csv(filepath_or_buffer="datasets/financialFraud.csv")

In [None]:
# Summary statistics of the dataset
data.describe()

In [None]:
# Print pandas' concise summary of the DataFrame (`DataFrame.info`).
data.info()

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# Distribution of the fraud label: number of rows per `isFraud` value
data['isFraud'].value_counts()

In [None]:
# Bar chart with one bar per `isFraud` value; the title is set on the axes
# that seaborn drew on.
ax = sns.countplot(x='isFraud', data=data)
ax.set_title('Distributions of Fraud and Non Fraud transactions')

In [None]:
# Count transactions per category of the `type` column.
# The counts are bound to `type_counts` rather than `type` so the built-in
# `type()` stays usable in every later cell.
type_counts = data["type"].value_counts()
quantity = type_counts.values      # number of rows for each transaction type
transactions = type_counts.index   # the matching transaction-type labels

In [None]:
# Rebind `px` to plotly.express; the import cell at the top of the notebook
# bound the alias to the top-level `plotly` package instead.
import plotly.express as px

# Donut chart (hole=0.5) of how many transactions fall into each type.
# `quantity` and `transactions` already hold the aggregated counts and their
# labels, so they are passed on their own. The raw `data` frame is not
# aggregated and none of its columns are referenced by the chart.
figure = px.pie(
    values=quantity,
    names=transactions,
    hole=0.5,
    title="Distribution of Transaction Type",
)
figure.show()

In [None]:
# Export the transaction-type pie chart to an HTML file.
figure.write_html("pieex1.html")

In [None]:
# Add an hour-of-day column: `step` modulo 24.
# NOTE(review): presumably `step` counts elapsed hours; confirm against the
# dataset description.
data['hourOfDay'] = data['step'].mod(24)

In [None]:
# Keep only the rows labelled as fraud so they can be analysed by hour of day.
fraud_df = data.loc[data['isFraud'] == 1]

In [None]:
# Number of fraud rows observed for each hour-of-day value.
hourly_fraud_count = fraud_df.groupby('hourOfDay').size()

# Plot straight from the aggregated series. The raw `data` frame is not
# passed because the chart uses none of its columns; the `labels` keys refer
# to the `x` and `y` arguments themselves.
fig = px.line(
    x=hourly_fraud_count.index,
    y=hourly_fraud_count.values,
    title='Number of Fraud Transactions by Hour of the day',
    labels={'x': "hours", 'y': "fraud_count"},
)
fig.show()

In [None]:
# Export the hourly fraud line chart to an HTML file.
fig.write_html("DtreeLinegraph.html")

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
# Non-numeric columns are filtered out first: since pandas 2.0
# `DataFrame.corr()` defaults to `numeric_only=False` and raises when the
# frame contains columns it cannot convert to numbers.
# NOTE(review): `type`, `nameOrig` and `nameDest` are presumably such columns,
# since later cells label-encode or drop them.
hartmap = sns.heatmap(data.select_dtypes(include="number").corr(), annot=True)

In [None]:
# Show the dtype of every column.
data.dtypes

In [None]:
# Partition the rows by label: `isFraud == 0` versus `isFraud == 1`.
nonfraud = data.loc[data['isFraud'] == 0]
fraud = data.loc[data['isFraud'] == 1]

In [None]:
# Down-sample the non-fraud rows to at most 600,000.
# - `min(...)` avoids the ValueError pandas raises when more rows are
#   requested than exist (sampling is without replacement).
# - `random_state` fixes the draw so reruns select the same rows.
nonfraud_s = nonfraud.sample(n=min(600000, len(nonfraud)), random_state=42)

In [None]:
# Stack the sampled non-fraud rows on top of all fraud rows (row-wise concat).
data2 = pd.concat([nonfraud_s, fraud])

In [None]:
# Number of rows in the non-fraud sample.
nonfraud_s.shape[0]

In [None]:
# Count the distinct values in each column of the combined frame.
data2.nunique()

In [None]:
# `df` is a second name for the same DataFrame object as `data` (no copy is made).
# NOTE(review): the down-sampled `data2` built in an earlier cell is not used
# from here on; confirm whether it was meant to feed the model instead.
df=data

In [None]:
# Re-check the number of rows per `isFraud` value in the full dataset.
data['isFraud'].value_counts()

In [None]:
# Remove the `nameOrig` and `nameDest` columns; `drop` returns a new frame,
# so `data` keeps them.
df = df.drop(columns=['nameOrig', 'nameDest'])

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Draw the modelling subset: at most 600,000 rows of `df`.
# - `min(...)` keeps the cell working on frames with fewer rows, where
#   `sample(600000)` would raise a ValueError.
# - `random_state` makes the subset, and everything trained on it, repeatable.
df1 = df.sample(n=min(600000, len(df)), random_state=42)

In [None]:
# Separate the label column (`isFraud`) from the feature columns.
inputs = df1.drop(columns='isFraud')
target = df1['isFraud']

In [None]:
# Preview the first rows of the label series.
target.head()

In [None]:
# Preview the first rows of the feature frame.
inputs.head()

In [None]:
# Encoder used to turn the categorical `type` column into integer codes.
encode = LabelEncoder()

In [None]:
# Append `type_en`: the `type` column mapped to integer codes by the encoder.
inputs = inputs.assign(type_en=encode.fit_transform(inputs['type']))

In [None]:
# Display the feature frame to check the newly added `type_en` column.
inputs

In [None]:
# Drop the raw `type` column (its encoded copy `type_en` stays) along with
# `isFlaggedFraud` and `step`.
inputs = inputs.drop(columns=['type', 'isFlaggedFraud', 'step'])

In [None]:
# Display the final feature frame that is passed to the train/test split.
inputs

In [None]:
# Split the features and labels into 70% train / 30% test.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    inputs,
    target,
    test_size=0.3,
    # Keep the fraud/non-fraud ratio the same in both partitions, so a rare
    # class cannot end up under-represented in one of them by chance.
    # Requires at least two rows of each class in `target`.
    stratify=target,
    # Fixed seed so the split, and every score derived from it, is repeatable.
    random_state=42,
)

In [None]:
from sklearn import tree

In [None]:
# Shallow decision tree (depth <= 3) split on information gain (entropy).
# scikit-learn permutes the candidate features at every split, so
# `random_state` is fixed to make repeated fits produce the same tree.
model = tree.DecisionTreeClassifier(
    criterion="entropy",
    max_depth=3,
    random_state=42,
)
model.fit(x_train, y_train)

In [None]:
# Predict labels for the training and the held-out partition with the fitted tree.
xtrain_pred, xtest_pred = [model.predict(part) for part in (x_train, x_test)]

In [None]:
 from sklearn.metrics import f1_score
f1_score(xtrain_pred,y_train,average="macro")

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy on the training partition; arguments follow scikit-learn's
# (y_true, y_pred) order.
train_acc = accuracy_score(y_train, xtrain_pred)
# Accuracy on the held-out partition. The test predictions were computed but
# never scored, so there was no estimate of performance on unseen rows.
test_acc = accuracy_score(y_test, xtest_pred)

In [None]:
# Display the accuracy measured on the training partition.
train_acc

In [None]:
#Joblib file 
