## Import libraries and data

In [None]:
import pandas as pd
import numpy as np

from datetime import datetime, date, time, timedelta


import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
%matplotlib inline
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [None]:
# Load the training data, indexed by TransactionId.
# CurrencyCode and CountryCode are dropped: identical value across all entries.
df = (
    pd.read_csv("../data/training.csv")
    .drop(columns=["CurrencyCode", "CountryCode"])
    .set_index("TransactionId")
)
df.head()

## Code from feature engineering notebook
**TODO:** Read in the CSV that already contains the newly engineered features instead, then delete the cell below.

In [None]:
# From feature engineering notebook. Delete once data imported here is csv w/new features
#
# For every column listed in `features`, add a column "NTransactions_<column>" holding,
# for each row, how many rows with a non-null FraudResult share that row's value in
# the column.
features = [
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
    'PricingStrategy',
]

new_feature_names2 = ["NTransactions_" + feat for feat in features]

for feat, n_feat in zip(features, new_feature_names2):
    # transform("count") broadcasts each group's count back onto the group's rows.
    # Unlike merging an aggregated frame back in, this keeps the existing index and
    # every row of df, and re-running the cell just overwrites the same columns.
    df[n_feat] = df.groupby(feat)["FraudResult"].transform("count")

## Figures
### N (%) of transactions per outcome class

In [None]:
# Number of transactions per outcome class, plus a "<count> (<share>%)" label per class.
df2 = (
    df[["FraudResult", "BatchId"]]
    .groupby("FraudResult")
    .count()
    .reset_index()
    .rename(columns={"BatchId": "Count"})
)
df2["Percentage"] = (
    (df2["Count"] / len(df))
    .round(3)
    .map(lambda val: " ({0:.1f}%)".format(val * 100))
)
df2["Label"] = df2["Count"].astype(str) + df2["Percentage"]

plt.figure(figsize=(4, 4), dpi=120)
g = sns.barplot(data=df2, x="FraudResult", y="Count")
# The first bar's label is drawn in white at half the bar's height; the second
# bar's label is drawn in black at a fixed y of 250.
# (A log-scaled y axis with ticks at powers of ten was tried here and left disabled.)
g.text(0, df2.loc[0, "Count"] / 2, df2.loc[0, "Label"], color="white", ha="center")
g.text(1, 250, df2.loc[1, "Label"], color="black", ha="center")
g.set_xticks([0, 1])
g.set_xticklabels(["No fraud", "fraud"])
g.set_xlabel("")
g.set_ylabel("Number of transactions")
g.set_title("Total number of transactions")
plt.savefig("../figures/n_cases.png", dpi=300, bbox_inches="tight")

### Transaction value distributions

In [None]:
# Step histograms of "Value" on a log-scaled x axis, one outline per FraudResult level.
# stat="probability" with common_norm=False normalises each level separately.
plt.figure(figsize=(10, 4), dpi=120)
g = sns.histplot(
    data=df,
    x="Value",
    hue="FraudResult",
    element="step",
    stat="probability",
    common_norm=False,
    log_scale=True,
)

# Ticks at 1, 10, ..., 10000000, labelled with plain integers.
g.set_xticks([10 ** exponent for exponent in range(8)])
g.set_xticklabels([str(10 ** exponent) for exponent in range(8)])
g.set_xlabel("Transaction value [UGX]")
g.set_ylabel("Proportion of transactions")
g.set_xlim((1, 11000000))
g.set_title("Proportions of (non)fraudulent transactions per transaction value")
plt.xticks(rotation=-90)

# NOTE(review): these labels are assigned positionally to the legend handles already
# on the axes - confirm the order still matches the hue levels after library upgrades.
plt.legend(title="Transaction", loc="upper left", labels=["Fraudulent", "Nonfraudulent"])

plt.savefig("../figures/p_trans_per_value.png", dpi=300, bbox_inches="tight")

### Mean number of transactions per feature value, by outcome class

In [None]:
def ntrans_barplot(feat):
    """Bar-plot the mean of ``NTransactions_<feat>`` per outcome class and save it.

    Reads the notebook-level ``df``, averages the ``NTransactions_<feat>`` column
    within each ``FraudResult`` level, draws one bar per level and writes the
    figure to ``../figures/ntrans_<feat>.png``.

    Parameters
    ----------
    feat : str
        Feature name; the column ``"NTransactions_" + feat`` must exist in ``df``.

    Raises
    ------
    KeyError
        If ``df`` has no ``NTransactions_<feat>`` column.
    """
    n_feat = "NTransactions_" + feat
    if n_feat not in df.columns:
        raise KeyError(
            f"Column '{n_feat}' not found in df; create the NTransactions_* features first."
        )

    # Mean of the per-row transaction count within each outcome class.
    class_means = df[[n_feat, "FraudResult"]].groupby("FraudResult").mean().reset_index()

    plt.figure(figsize=(4, 4), dpi=120)
    g = sns.barplot(data=class_means, x="FraudResult", y=n_feat)
    g.set_xticks([0, 1])
    g.set_xticklabels(["No fraud", "fraud"])
    g.set(xlabel='', ylabel='Number of transactions',
          title=f"Mean number of transactions per {feat}")
    plt.savefig(f'../figures/ntrans_{feat}.png', dpi=300, bbox_inches="tight")



In [None]:
# Mean NTransactions_SubscriptionId per outcome class; the figure is saved to
# ../figures/ntrans_SubscriptionId.png.
ntrans_barplot("SubscriptionId")

## Bivariate percentages of frauds plot
Not used in the project presentation, but kept for future reference.

In [None]:
def scatterplot(x,y):
    """Scatter-plot the mean of ``FraudResult`` (times 100) for each (x, y) combination.

    Reads the notebook-level ``df``, groups it by the columns ``x`` and ``y``,
    and plots every combination whose mean ``FraudResult`` is above zero. Marker
    colour and size both encode that value (a percentage of fraudulent
    transactions, assuming ``FraudResult`` is a 0/1 flag). The figure size is
    derived from the number of distinct ``x`` and ``y`` values that remain.

    Parameters
    ----------
    x : str
        Column of ``df`` shown on the x axis.
    y : str
        Column of ``df`` shown on the y axis.

    Raises
    ------
    ValueError
        If no (x, y) combination has a mean ``FraudResult`` above zero, because
        the figure size cannot be derived from zero categories.
    """
    df2 = df[[x, y, "FraudResult"]].groupby([x, y]).mean().reset_index()
    df2["FraudResult"] = df2["FraudResult"]*100
    # Sort first, then filter, so larger values are drawn last (on top).
    df2 = df2.sort_values(by="FraudResult")
    df2 = df2[df2["FraudResult"]>0]

    if df2.empty:
        raise ValueError(
            f"No combination of {x} and {y} has a mean FraudResult above zero; nothing to plot."
        )

    n_x = df2[x].nunique()
    n_y = df2[y].nunique()

    plt.figure(figsize=((n_x/n_y)*40,(n_y/n_x)*0.5), dpi=120)
    g = sns.scatterplot(data=df2, x=x, y=y, hue="FraudResult", size="FraudResult", sizes=(20, 200))
    g.set(title=f"Proportions of fraudulent transactions per {x} and {y}")
    # Shrink the y tick labels without replacing their text: set_yticklabels(get_yticks())
    # would overwrite the labels with the raw tick positions.
    g.tick_params(axis="y", labelsize=5)
    plt.xticks(rotation=90)
    plt.legend(title='% frauds', bbox_to_anchor=(1.01, 1), borderaxespad=0)



In [None]:
# ProviderId on the x axis, AccountId on the y axis.
scatterplot("ProviderId","AccountId")