# Import modules

In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn import preprocessing

# Read Data

In [51]:
# Load the raw workbook; the first spreadsheet column is used as the row index.
RAW_DATA_PATH = "../data/raw/PSP_Jan_Feb_2019.xlsx"
df = pd.read_excel(RAW_DATA_PATH, index_col=0)

# Create date and time feature columns (Day of Week, Day of Month, Month, Time of Day)

In [52]:
# Derive the calendar features from the "tmsp" timestamp through one shared
# datetime accessor.
tmsp_parts = df["tmsp"].dt
df["Day of Week"] = tmsp_parts.day_name()
df["Day of Month"] = tmsp_parts.day
df["Month"] = tmsp_parts.month
df["Time of Day"] = tmsp_parts.hour  # hour component of the timestamp

# "tmsp" itself stays in the frame; the drop below is intentionally disabled.
# df.drop('tmsp', axis=1, inplace=True)

df.head(5)

Unnamed: 0,tmsp,country,amount,success,PSP,3D_secured,card,Day of Week,Day of Month,Month,Time of Day
0,2019-01-01 00:01:11,Germany,89,0,UK_Card,0,Visa,Tuesday,1,1,0
1,2019-01-01 00:01:17,Germany,89,1,UK_Card,0,Visa,Tuesday,1,1,0
2,2019-01-01 00:02:49,Germany,238,0,UK_Card,1,Diners,Tuesday,1,1,0
3,2019-01-01 00:03:13,Germany,238,1,UK_Card,1,Diners,Tuesday,1,1,0
4,2019-01-01 00:04:33,Austria,124,0,Simplecard,0,Diners,Tuesday,1,1,0


# Correlation (chi-square test of independence against the label)

In [53]:
label = "success"
variables = [
    "country",
    "PSP",
    "3D_secured",
    "card",
    "Day of Week",
    "Day of Month",
    "Time of Day",
    "Month",
]
alpha = 0.05  # significance level (p-value threshold)

# Hypotheses of the chi-square test of independence run for every variable.
print(f"H0: The variables are independent (if p_value > {alpha})")
print(
    f"H1 (alternate hypothesis): There is dependency between the two variables (if p_value < {alpha})"
)

p_values = []

for variable in variables:
    print(f"\nComparing if {variable} has dependency on {label}")
    # Contingency table: label outcomes as rows, categories of the variable as columns.
    crosstab = pd.crosstab(df[label], df[variable])
    stat, p_value, dof, expected = chi2_contingency(crosstab)

    # Interpret the p-value against the configured significance level; the
    # confidence wording is derived from alpha instead of being hard-coded.
    print("p value is " + str(p_value))
    dependend = bool(p_value <= alpha)
    if dependend:
        print(f"With {1 - alpha:.0%} confidence there is dependency. (reject H0)")
    else:
        # A large p-value does not prove independence; it only means H0 cannot be rejected.
        print(
            f"At the {1 - alpha:.0%} confidence level H0 cannot be rejected: "
            "no evidence of dependency."
        )

    p_values.append([variable, p_value, dependend])

p_values_df = pd.DataFrame(
    p_values, columns=["column", "p_value", "dependend"]
).set_index("column")
# Export the full table to LaTeX. A bare .head() here would cut it to the
# first five variables and drop the remaining rows from the report.
print(p_values_df.style.format(escape="latex-math").to_latex())
p_values_df

H0: The variables are independent (if p_value > 0.05)
H1 (alternate hypothesis): There is dependency between the two variables (if p_value < 0.05))

Comparing if country has depndency on success
p value is 0.5683037749425706
With 95% confidence the variables are independent. (H0 holds true)

Comparing if PSP has depndency on success
p value is 3.312365253382584e-216
With 95% confidence there is dependency. (reject H0)

Comparing if 3D_secured has depndency on success
p value is 2.392241166548454e-40
With 95% confidence there is dependency. (reject H0)

Comparing if card has depndency on success
p value is 1.144669780335135e-09
With 95% confidence there is dependency. (reject H0)

Comparing if Day of Week has depndency on success
p value is 1.954627276467827e-09
With 95% confidence there is dependency. (reject H0)

Comparing if Day of Month has depndency on success
p value is 2.5257850674978493e-13
With 95% confidence there is dependency. (reject H0)

Comparing if Time of Day has depnde

Unnamed: 0_level_0,p_value,dependend
column,Unnamed: 1_level_1,Unnamed: 2_level_1
country,0.5683038,False
PSP,3.312365e-216,True
3D_secured,2.392241e-40,True
card,1.14467e-09,True
Day of Week,1.954627e-09,True
Day of Month,2.525785e-13,True
Time of Day,2.096508e-10,True
Month,0.7481301,False


In [46]:
# Overall figures across every transaction in the frame.
transactions_overall = len(df)
sucess_rate_overall = df["success"].mean().round(4)

print(f"sucess_rate_overall = {sucess_rate_overall}")
print(f"transactions_overall = {transactions_overall}")

# One [PSP, mean success, row count] record per provider, in order of first appearance.
success_rates = []
for psp in df["PSP"].unique():
    psp_outcomes = df.loc[df["PSP"] == psp, "success"]
    success_rates.append([psp, psp_outcomes.mean(), psp_outcomes.shape[0]])

rates_table = pd.DataFrame(
    success_rates, columns=["PSP", "successrate", "number_transactions"]
)
psp_success_rates = rates_table.set_index("PSP").round(2)
print(psp_success_rates)
# print(psp_success_rates.style.format(escape="latex-math").to_latex())

sucess_rate_overall = 0.2029
transactions_overall = 50410
            successrate  number_transactions
PSP                                         
UK_Card            0.19                26459
Simplecard         0.16                12446
Moneycard          0.22                 8297
Goldcard           0.41                 3208


In [54]:
# Remove 'country' and 'Month': the chi-square results above flag neither of
# them as dependent on the label.
df.drop(columns=["country", "Month"], inplace=True)

In [55]:
# Show the first five rows of the current frame.
df.head(5)

Unnamed: 0,tmsp,amount,success,PSP,3D_secured,card,Day of Week,Day of Month,Time of Day
0,2019-01-01 00:01:11,89,0,UK_Card,0,Visa,Tuesday,1,0
1,2019-01-01 00:01:17,89,1,UK_Card,0,Visa,Tuesday,1,0
2,2019-01-01 00:02:49,238,0,UK_Card,1,Diners,Tuesday,1,0
3,2019-01-01 00:03:13,238,1,UK_Card,1,Diners,Tuesday,1,0
4,2019-01-01 00:04:33,124,0,Simplecard,0,Diners,Tuesday,1,0


# Create previous attempts column (`previous_attemps`)

In [56]:
# Working frame with the timestamp and the three grouping columns, plus a
# retry counter initialised to zero for every row.
df_time = df[["tmsp", "amount", "card", "3D_secured"]].assign(previous_attemps=0)


def count_previous_attempts(
    group: pd.DataFrame, window: pd.Timedelta = pd.Timedelta(minutes=1)
) -> pd.DataFrame:
    """Fill ``previous_attemps`` with the position of each row in its retry chain.

    Rows are processed in their existing order (expected: ascending ``tmsp``).
    A row continues a chain when at least one earlier row of ``group`` has a
    timestamp inside ``[tmsp - window, tmsp)``; its counter then becomes the
    counter of the row directly before it plus one. Any other row keeps its
    current ``previous_attemps`` value (0 when the column does not exist yet).

    Args:
        group: Frame with a datetime-like ``tmsp`` column, typically one group
            handed over by ``DataFrameGroupBy.apply``.
        window: Maximum gap to an earlier row for it to count as a previous
            attempt. Defaults to one minute.

    Returns:
        A copy of ``group`` with the ``previous_attemps`` column filled in.
        The input frame is never modified.
    """
    # Work on a copy: mutating the object passed in by groupby.apply is
    # discouraged by pandas and can produce unexpected results.
    result = group.copy()
    if "previous_attemps" not in result.columns:
        result["previous_attemps"] = 0

    stamps = result["tmsp"]
    counts = result["previous_attemps"].tolist()

    # Positional access keeps this independent of the index labels, so
    # non-contiguous or non-integer indexes work as well.
    for pos in range(1, len(counts)):
        now = stamps.iloc[pos]
        earlier = stamps.iloc[:pos]
        if ((earlier < now) & (earlier >= now - window)).any():
            counts[pos] = counts[pos - 1] + 1

    if counts:
        result["previous_attemps"] = counts
    return result


# Count retries separately for every (amount, card, 3D_secured) combination.
# With include_groups=False the grouping columns are not passed to the
# function; they come back as index levels in front of the original row index.
group_keys = ["amount", "card", "3D_secured"]
df_time = df_time.groupby(group_keys).apply(
    count_previous_attempts, include_groups=False
)

# Attach the counter through the original row index. Merging on
# tmsp/amount/card/3D_secured instead is many-to-many whenever two rows share
# all four values, which would silently duplicate transactions.
assert df.index.is_unique, "row index must be unique to align previous_attemps"
df = df.join(df_time["previous_attemps"].droplevel(group_keys))
df.head(5)

Unnamed: 0,tmsp,amount,success,PSP,3D_secured,card,Day of Week,Day of Month,Time of Day,previous_attemps
0,2019-01-01 00:01:11,89,0,UK_Card,0,Visa,Tuesday,1,0,0
1,2019-01-01 00:01:17,89,1,UK_Card,0,Visa,Tuesday,1,0,1
2,2019-01-01 00:02:49,238,0,UK_Card,1,Diners,Tuesday,1,0,0
3,2019-01-01 00:03:13,238,1,UK_Card,1,Diners,Tuesday,1,0,1
4,2019-01-01 00:04:33,124,0,Simplecard,0,Diners,Tuesday,1,0,0


# Build One Hot Encoding for categorical values

### PSP

In [57]:
# One-hot encode "PSP" into integer PSP_* indicator columns appended after the
# remaining columns; the original "PSP" column is removed.
# (A leading pd.get_dummies(df["PSP"]) call whose result was discarded is gone.)
dummies = pd.get_dummies(df["PSP"], prefix="PSP", drop_first=False, dtype="int")
one_hot_df = pd.concat([df.drop(columns="PSP"), dummies], axis=1)
one_hot_df.head()

Unnamed: 0,tmsp,amount,success,3D_secured,card,Day of Week,Day of Month,Time of Day,previous_attemps,PSP_Goldcard,PSP_Moneycard,PSP_Simplecard,PSP_UK_Card
0,2019-01-01 00:01:11,89,0,0,Visa,Tuesday,1,0,0,0,0,0,1
1,2019-01-01 00:01:17,89,1,0,Visa,Tuesday,1,0,1,0,0,0,1
2,2019-01-01 00:02:49,238,0,1,Diners,Tuesday,1,0,0,0,0,0,1
3,2019-01-01 00:03:13,238,1,1,Diners,Tuesday,1,0,1,0,0,0,1
4,2019-01-01 00:04:33,124,0,0,Diners,Tuesday,1,0,0,0,0,1,0


### Card

In [58]:
# One-hot encode "card" into integer CARD_* indicator columns appended after
# the remaining columns; the original "card" column is removed.
# (A leading pd.get_dummies(df["card"]) call whose result was discarded is gone.)
dummies = pd.get_dummies(
    one_hot_df["card"], prefix="CARD", drop_first=False, dtype="int"
)
one_hot_df = pd.concat([one_hot_df.drop(columns="card"), dummies], axis=1)
one_hot_df.head()

Unnamed: 0,tmsp,amount,success,3D_secured,Day of Week,Day of Month,Time of Day,previous_attemps,PSP_Goldcard,PSP_Moneycard,PSP_Simplecard,PSP_UK_Card,CARD_Diners,CARD_Master,CARD_Visa
0,2019-01-01 00:01:11,89,0,0,Tuesday,1,0,0,0,0,0,1,0,0,1
1,2019-01-01 00:01:17,89,1,0,Tuesday,1,0,1,0,0,0,1,0,0,1
2,2019-01-01 00:02:49,238,0,1,Tuesday,1,0,0,0,0,0,1,1,0,0
3,2019-01-01 00:03:13,238,1,1,Tuesday,1,0,1,0,0,0,1,1,0,0
4,2019-01-01 00:04:33,124,0,0,Tuesday,1,0,0,0,0,1,0,1,0,0


### Day of Week

In [59]:
# Replace the weekday names with integer codes.
# NOTE(review): the codes follow the sorted order of the names (Tuesday -> 5 in
# the output), not calendar order - confirm downstream models treat this
# column as categorical rather than ordinal.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(one_hot_df["Day of Week"])
one_hot_df["Day of Week"] = label_encoder.transform(one_hot_df["Day of Week"])
print(one_hot_df.head())

# LaTeX rendering of the first row, transposed so every column becomes a line.
first_row_transposed = one_hot_df.head(1).T
print(first_row_transposed.style.format(escape="latex").to_latex())

                 tmsp  amount  success  3D_secured  Day of Week  Day of Month  \
0 2019-01-01 00:01:11      89        0           0            5             1   
1 2019-01-01 00:01:17      89        1           0            5             1   
2 2019-01-01 00:02:49     238        0           1            5             1   
3 2019-01-01 00:03:13     238        1           1            5             1   
4 2019-01-01 00:04:33     124        0           0            5             1   

   Time of Day  previous_attemps  PSP_Goldcard  PSP_Moneycard  PSP_Simplecard  \
0            0                 0             0              0               0   
1            0                 1             0              0               0   
2            0                 0             0              0               0   
3            0                 1             0              0               0   
4            0                 0             0              0               1   

   PSP_UK_Card  CARD_Diner

# Save Transformed data to csv file

In [60]:
# Persist the transformed frame; the "tmsp" column is left out of the export.
PROCESSED_DATA_PATH = "../data/processed/transformed_data.csv"
one_hot_df.drop(columns="tmsp").to_csv(PROCESSED_DATA_PATH)

In [61]:
# Make sure "tmsp" is held as datetime values, then preview the frame.
tmsp_as_datetime = pd.to_datetime(one_hot_df["tmsp"])
one_hot_df["tmsp"] = tmsp_as_datetime
one_hot_df.head()

Unnamed: 0,tmsp,amount,success,3D_secured,Day of Week,Day of Month,Time of Day,previous_attemps,PSP_Goldcard,PSP_Moneycard,PSP_Simplecard,PSP_UK_Card,CARD_Diners,CARD_Master,CARD_Visa
0,2019-01-01 00:01:11,89,0,0,5,1,0,0,0,0,0,1,0,0,1
1,2019-01-01 00:01:17,89,1,0,5,1,0,1,0,0,0,1,0,0,1
2,2019-01-01 00:02:49,238,0,1,5,1,0,0,0,0,0,1,1,0,0
3,2019-01-01 00:03:13,238,1,1,5,1,0,1,0,0,0,1,1,0,0
4,2019-01-01 00:04:33,124,0,0,5,1,0,0,0,0,1,0,1,0,0
