In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file available under the Kaggle input directory.
# The unused middle element of each os.walk() tuple is bound to `_subdirs`
# rather than `_`: assigning `_` at notebook top level shadows IPython's
# "last output" variable for the rest of the session.
for dirname, _subdirs, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score

from sklearn import metrics
from collections import Counter

In [None]:
# Load the training transactions; index_col=0 uses the first CSV column as the row index.
train_data = pd.read_csv("/kaggle/input/fraud-detection/fraudTrain.csv", index_col=0)

In [None]:
# Show the column names available in the raw training data.
train_data.columns

In [None]:
# Preview the first rows of the raw training data.
train_data.head()

In [None]:
# Work on a copy so that train_data itself is never modified by later cells.
subset_data = train_data.copy()

In [None]:
import datetime

# Keep only the raw fields used to derive the model features, plus the label.
# The transaction date itself is derived from the timestamp in a later cell.
subset_data = subset_data.loc[
    :, ["trans_date_trans_time", "dob", "amt", "city_pop", "is_fraud"]
]
subset_data.head()

In [None]:
# Inspect the column dtypes before the date columns are parsed.
subset_data.dtypes

In [None]:
# Parse the transaction timestamp and truncate it to midnight so only the
# calendar date remains. dt.normalize() stays vectorised, whereas going
# through .dt.date would materialise one Python `date` object per row and
# then parse them all back into timestamps.
subset_data["trans_date"] = pd.to_datetime(
    subset_data["trans_date_trans_time"], format="%Y-%m-%d %H:%M:%S"
).dt.normalize()
# Date of birth is already a plain date string.
subset_data["dob_date"] = pd.to_datetime(subset_data["dob"], format="%Y-%m-%d")
subset_data.head()

In [None]:
# Inspect the column dtypes again, now that trans_date and dob_date have been added.
subset_data.dtypes

In [None]:
# Age (in years) of the card holder at transaction time. The elapsed time is
# divided by a fixed 365-day year, so leap days are not accounted for.
subset_data["age"] = (subset_data["trans_date"]-subset_data["dob_date"]) / (np.timedelta64(1, 'D')*365)
subset_data.head()

In [None]:
import numpy as np
import pandas as pd
import math

def f1_score(data, y, mask, total_frd, min_recall):
    '''
    Amount-weighted F1 score of the rule that selects the rows ``data[mask]``.

    Precision ("hit rate") is the fraud amount captured by the selected rows
    divided by their total transaction amount. Recall ("capture rate") is the
    captured fraud amount divided by the fraud amount present in ``data``.

    data: DataFrame with 'fraud_amount' and 'transaction_amount' columns.
    y: target variable (not used by the computation; kept for interface compatibility).
    mask: boolean selector applied as ``data[mask]``.
    total_frd: Total Fraud that can be captured in the starting population
        (not used here: recall is measured against the fraud in ``data``).
    min_recall: Minimum recall set to be achieved; ``None`` disables the check.

    Returns the F1 score, or 0 when the score is undefined (nothing selected,
    no fraud available or captured) or when recall is below ``min_recall``.
    '''
    hit_data = data[mask]

    capture = hit_data['fraud_amount'].sum()
    hit_amount = hit_data['transaction_amount'].sum()
    available_fraud = data['fraud_amount'].sum()

    # A zero denominator would produce NaN/inf, which silently breaks every
    # later comparison and max() over scores; treat it as "no score".
    if hit_amount == 0 or available_fraud == 0:
        return 0

    hit_rate = capture / hit_amount
    capture_rate = capture / available_fraud

    # Nothing captured: the harmonic mean is undefined (0/0).
    if hit_rate + capture_rate == 0:
        return 0

    if min_recall is not None and capture_rate < min_recall:
        return 0

    return (2 * hit_rate * capture_rate) / (hit_rate + capture_rate)

def max_f1_score_split(data, x, y, total_frd, subset_x, min_recall):
    '''
    Find the threshold on one predictor that maximises the F1 score.

    For every candidate threshold both rules ``x < threshold`` and
    ``x > threshold`` are scored with ``f1_score`` and the better one is kept.

    data: DataFrame holding the rows to score (passed through to ``f1_score``).
    x: predictor variable as Pandas Series (assumed to share ``data``'s index).
    y: target variable as Pandas Series.
    total_frd: Total Fraud that can be captured in the starting population
    subset_x: Series of predictor values from which candidate thresholds are
        drawn. Negative values and the smallest remaining value are discarded.
        When more than 100 distinct candidates remain they are reduced to
        their 0th..99th percentiles, so thresholds may be interpolated values.
    min_recall: Minimum recall set to be achieved

    Returns ``(best_f1, best_split, best_ineq, True)`` where ``best_ineq`` is
    1 when the ``<`` rule won and 0 when the ``>`` rule won, or
    ``(None, None, None, False)`` when there is no candidate threshold.
    '''
    split_value = []
    f1 = []
    le_gr = []
    print("Checking threshold for {}".format(x.name))
    options = subset_x.sort_values().unique()[1:]
    print("Old size was {}".format(len(options)))
    subset_x = subset_x[subset_x >= 0]

    options = subset_x.sort_values().unique()[1:]

    if len(options) > 100:
        # Cap the search at 100 percentile thresholds; np.unique drops the
        # duplicates that appear when many candidates share a value.
        final_options = np.unique(np.percentile(options, np.arange(0, 100)))
    else:
        final_options = options
    print("Current size is {}".format(len(final_options)))

    # Loop-invariant: rows with x <= -9998 are excluded from scoring
    # (presumably a missing-value sentinel -- TODO confirm). Filtering x the
    # same way keeps every mask aligned with the filtered frame.
    valid_rows = x > -9998
    nonull_data = data[valid_rows]
    nonull_x = x[valid_rows]

    # Score both inequality directions for every candidate threshold.
    for val in final_options:
        val_f1 = f1_score(nonull_data, y, nonull_x < val, total_frd, min_recall)
        val_new_f1 = f1_score(nonull_data, y, nonull_x > val, total_frd, min_recall)

        # An undefined score (None or NaN) must not win a comparison or
        # poison max() below; NaN is the only value that differs from itself.
        if val_f1 is None or val_f1 != val_f1:
            val_f1 = 0
        if val_new_f1 is None or val_new_f1 != val_new_f1:
            val_new_f1 = 0

        if val_new_f1 < val_f1:
            le_gr.append(1)
            f1.append(val_f1)
        else:
            le_gr.append(0)
            f1.append(val_new_f1)
        split_value.append(val)

    # No candidate threshold at all: signal failure to the caller.
    if len(f1) == 0:
        return (None, None, None, False)

    # Keep the first threshold that reaches the highest score.
    best_f1 = max(f1)
    best_f1_index = f1.index(best_f1)
    best_split = split_value[best_f1_index]
    best_ineq = le_gr[best_f1_index]
    return (best_f1, best_split, best_ineq, True)

def get_best_split(y, data, x_vars, total_frd, min_recall):
    '''
    Evaluate every candidate variable and return the best-scoring split.

    y: name of the target variable
    data: dataframe where to find the best split.
    x_vars: The variables used for decision making
    total_frd: Total fraud that can be captured in the starting population
    min_recall: Minimum recall set to be achieved

    Returns ``(best_var, best_split, best_f1, best_ineq)``; the first
    variable reaching the highest score wins ties.
    '''
    scores = []
    thresholds = []
    directions = []

    for candidate in x_vars:
        # Rows with a missing value for this candidate cannot be thresholded.
        complete_rows = data.dropna(axis=0, subset=[candidate])
        # Thresholds are searched among the values seen on positive rows.
        positive_rows = complete_rows[complete_rows[y] == 1]

        score, threshold, direction, _ = max_f1_score_split(
            complete_rows,
            complete_rows[candidate],
            complete_rows[y],
            total_frd,
            positive_rows[candidate],
            min_recall,
        )
        if score is None:
            print("Found None")
            score = 0

        directions.append(direction)
        scores.append(score)
        thresholds.append(threshold)

    top_score = max(scores)
    winner = scores.index(top_score)
    return (x_vars[winner], thresholds[winner], top_score, directions[winner])

def make_split(variable, value, data, ineq):
    '''
    Filter ``data`` down to the rows on the chosen side of a threshold.

    variable: column on which the split is made.
    value: threshold for that column.
    data: DataFrame to be filtered.
    ineq: 1 keeps rows strictly below ``value``; anything else keeps rows
        strictly above it. Rows equal to ``value`` are dropped either way.
    '''
    print(variable)
    print(value)

    column = data[variable]
    keep = column < value if ineq == 1 else column > value
    return data[keep]

def calc_metrics(data, total_frd):
    '''
    Amount-weighted precision and recall of the rows in ``data``.

    data: DataFrame with 'fraud_amount' and 'transaction_amount' columns.
    total_frd: Total Fraud that can be captured in the starting population

    Returns ``(hit_rate, capture_rate)``: the fraud amount in ``data`` divided
    by its total transaction amount, and the same fraud amount divided by
    ``total_frd``.
    '''
    fraud_captured = data['fraud_amount'].sum()
    amount_flagged = data['transaction_amount'].sum()

    return fraud_captured / amount_flagged, fraud_captured / total_frd

def train_tree(data,y, total_frd, x_vars, max_depth = None, min_samples_split = None, min_recall = None, min_recall_overall = None, min_precision = 0.1, counter = 0):
    '''
    Recursively narrow ``data`` with single-variable threshold rules.

    Each level picks the best split, keeps only the rows on the winning side
    and recurses on them, so the result is a chain of conditions.

    data: Data to be used to train the Decision Tree
    y: target variable column name
    total_frd: Total fraud that can be captured in the starting population
    x_vars: The variables used for decision making
    max_depth: maximum depth to stop splitting (None = no limit).
    min_samples_split: minimum number of observations to make a split (None = no limit).
    min_recall: minimum recall for each threshold selection
    min_recall_overall: minimum recall for the overall rule; splitting stops
        once the recall of ``data`` is no longer above it (None = no limit).
    min_precision: accepted for interface compatibility; the stopping logic
        below does not currently use it.
    counter: current depth; callers normally leave it at 0.

    Returns a nested list ``[question, sub_path]`` where ``question`` describes
    the split with the precision/recall of the retained rows and ``sub_path``
    is the same structure for the next level, or None when splitting stopped.
    '''
    print(counter)
    _, recall = calc_metrics(data, total_frd)

    # Each stopping criterion is disabled when its parameter is None.
    depth_cond = max_depth is None or counter < max_depth
    sample_cond = min_samples_split is None or data.shape[0] > min_samples_split
    recall_cond = min_recall_overall is None or min_recall_overall < recall

    if depth_cond and sample_cond and recall_cond:

        var, val, f1, ineq = get_best_split(y, data, x_vars, total_frd, min_recall)

        # Split only when a usable one was found. Without a threshold
        # make_split would fail, and a score of 0 (or NaN) means no candidate
        # met the recall constraint, so any split would be arbitrary.
        if val is not None and f1 is not None and f1 > 0:
            counter += 1

            new_data = make_split(var, val, data, ineq)

            # Describe the rule and the quality of the rows it keeps.
            split_type = "<" if ineq == 1 else ">"
            question = "{} {} {}".format(var, split_type, val)
            new_precision, new_recall = calc_metrics(new_data, total_frd)
            question = question + " Precision:{}, Recall:{}".format(new_precision, new_recall)
            path = [question]
            print(question)

            # Refine the retained rows further (recursion).
            next_path = train_tree(new_data, y, total_frd, x_vars, max_depth, min_samples_split, min_recall, min_recall_overall, min_precision, counter)

            path.append(next_path)

            return path

    print("Run Complete")
    return None

In [None]:
# Build the modelling frame: the transaction amount, the amount that was
# fraudulent (0 for legitimate rows), the two other predictors and the label.
algo_data = subset_data.assign(
    transaction_amount=subset_data["amt"],
    fraud_amount=np.where(subset_data["is_fraud"]==1, subset_data["amt"], 0),
)[["transaction_amount", "city_pop", "age", "fraud_amount", "is_fraud"]]
algo_data.head()

In [None]:
# Total amount of fraud in the starting population: the sum of the
# transaction amounts of the rows labelled as fraud.
is_fraud_row = algo_data["is_fraud"] == 1
total_frd = algo_data.loc[is_fraud_row, "transaction_amount"].sum()
print(total_frd)

In [None]:
# Sanity check: the fraud_amount column total should match total_frd printed above.
print(algo_data["fraud_amount"].sum())

In [None]:
# Number of rows and columns in the modelling frame.
algo_data.shape

In [None]:
# Predictors the rule search is allowed to threshold on.
rf_vars = ["transaction_amount", "city_pop", "age"]

# Search for a chain of threshold rules on the full modelling frame.
decisions = train_tree(
    algo_data,
    "is_fraud",
    total_frd,
    rf_vars,
    max_depth=50,
    min_samples_split=None,
    min_recall=0.9,
    min_recall_overall=0.3,
    min_precision=None,
    counter=0,
)

In [None]:
# Display the nested list of split descriptions returned by train_tree.
decisions