In [None]:
# Data wrangling
import json as js
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
import os
from datetime import datetime

# Data Viz
import cufflinks as cf
from sklearn import set_config

# Preprocesamiento
from varclushi import VarClusHi
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, MDS
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from functools import reduce
from scipy.stats import ks_2samp
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import MDS,TSNE
from sklearn.cluster import AgglomerativeClustering,KMeans
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture

import matplotlib.pyplot as plt

from scipy.stats import chisquare
from scipy.stats import kruskal
from statsmodels.stats.multicomp import MultiComparison
import seaborn as sns

# Clustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import davies_bouldin_score, silhouette_score
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering


# Turn off the jedi-based tab completer for this kernel.
%config Completer.use_jedi = False
# Environment setup
# Put cufflinks in offline mode (used by the .iplot() calls further down).
cf.go_offline()
# Render scikit-learn estimators as diagrams in rich output.
set_config(display='diagram')
# Show up to 100 columns and print floats with four decimals.
pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [None]:
city= pd.read_csv("./Datos/city.csv")
fraud= pd.read_csv("./Datos/fraud.csv")
holder= pd.read_csv("./Datos/holder.csv")
trans= pd.read_csv("./Datos/trans.csv")

In [None]:
city.shape, fraud.shape, holder.shape, trans.shape

In [None]:
trans.head()

In [None]:
fraud.head()

In [None]:
holder.head()

In [None]:
city.head()

In [None]:
# Left-join each holder with its city, then attach every transaction of that holder.
holder_city = holder.merge(city, how="left", on="id_city")
df = holder_city.merge(trans, how="left", on="id_holder")

In [None]:
df.shape,trans.shape

In [None]:
df["aux"]=1

In [None]:
df[["id_holder","aux"]].groupby("id_holder").sum().describe()

In [None]:
df.columns

In [None]:
df.head()

## Etiquetado de Variables

In [None]:
# Raw column names grouped by the type prefix they will receive.
c_feats = ['city_pop', 'amt', 'lat', 'long', 'merch_lat', 'merch_long']  # continuous -> "c_"
v_feats = ["gender", 'job', 'city', 'state', 'zip', 'cc_num', 'category']  # discrete -> "v_"
# NOTE(review): 'merchant' receives the date prefix "d_" here — confirm this is intended.
d_feats = ['dob', 'unix_time', 'trans_date_trans_time', 'merchant']  # dates -> "d_"
t_feats = ['first', 'last', 'street']  # text -> "t_"

# Prefixed names; any whitespace inside a raw name is collapsed to "_".
prefixed = {
    prefix: [prefix + "_".join(name.split()) for name in names]
    for prefix, names in (("c_", c_feats), ("v_", v_feats), ("d_", d_feats), ("t_", t_feats))
}
c_feats_new = prefixed["c_"]
v_feats_new = prefixed["v_"]
d_feats_new = prefixed["d_"]
t_feats_new = prefixed["t_"]

# The four groups are disjoint, so a single rename covers all of them.
rename_map = {}
for old_names, new_names in ((d_feats, d_feats_new), (v_feats, v_feats_new),
                             (t_feats, t_feats_new), (c_feats, c_feats_new)):
    rename_map.update(zip(old_names, new_names))
df.rename(columns=rename_map, inplace=True)


In [None]:
df.head()

In [None]:
df.filter(like="v_").head()

## Calidad de los Datos

### Duplicados

In [None]:
#Verificamos el total de duplicados
df.duplicated().sum()

## Completitud 

In [None]:
# Percentage of non-null values per column, as a two-column table (columna, completitud).
pct_non_null = (1 - df.isnull().sum() / df.shape[0]) * 100
completitud = pct_non_null.rename_axis("columna").reset_index(name="completitud")

In [None]:
completitud

## Análisis Exploratorio

In [None]:
df.v_gender.iplot("hist")

In [None]:
df.v_gender.value_counts(1)

In [None]:
df.v_category.value_counts(1)

In [None]:
df.v_job.value_counts(1)

In [None]:
len(list(set(holder.job)))

In [None]:
holder.job.value_counts(0)

### Fechas

In [None]:
df["d_trans_date_trans_time"]=pd.to_datetime(df["d_trans_date_trans_time"])

In [None]:
df["d_dob"]=pd.to_datetime(df["d_dob"])

In [None]:
# Age at transaction time in whole days: transaction timestamp minus date of birth.
df["edad"] = (df["d_trans_date_trans_time"] - df["d_dob"]).dt.days

In [None]:
# Age in years. Computed from the two timestamps (not by dividing "edad" in place) so the
# cell gives the same result when re-run, and with 365.25 days per year instead of 360,
# which overstated every age by about 1.5%.
df["edad"] = (df["d_trans_date_trans_time"] - df["d_dob"]).dt.days / 365.25

In [None]:
df["edad"].describe()

In [None]:
df.filter(like="d_").head()

In [None]:
df["d_trans_date_trans_time"].describe()

In [None]:
df["d_trans_date_trans_time"].diff(1).describe(percentiles=[0.25,0.50,0.70,0.90,0.95,0.98,0.99])

In [None]:
df[["d_trans_date_trans_time","aux"]].groupby("d_trans_date_trans_time").sum()

In [None]:
aux_df=df[["id_holder","d_trans_date_trans_time"]].copy()

In [None]:
aux_df=aux_df.sort_values(by=["id_holder","d_trans_date_trans_time"]).reset_index(drop=True)

In [None]:
aux_df.groupby("id_holder").diff(1).reset_index()

In [None]:
df[["id_holder","d_trans_date_trans_time"]].groupby(["id_holder"]).diff()

In [None]:
# Histogram of the gaps between consecutive transactions of holder 200001.
# dropna() removes the undefined first difference; the previous .drop(0) removed the row
# labelled 0, which is only the first row of this holder by coincidence (KeyError otherwise).
df.loc[df["id_holder"]==200001, "d_trans_date_trans_time"].diff(1).dropna().iplot("hist")

In [None]:
df[df["id_holder"]==200001]["d_trans_date_trans_time"].diff(1)

In [None]:
# Summary of the gaps between consecutive transactions of holder 200001.
# dropna() removes the undefined first difference regardless of the row's index label.
df.loc[df["id_holder"]==200001, "d_trans_date_trans_time"].diff(1).dropna().describe()


In [None]:
df["d_date_trans"]=df.d_trans_date_trans_time.map(lambda x:x.date())

In [None]:
np.unique(df[df["id_holder"]==200001]["d_date_trans"])

In [None]:
np.unique(df.d_trans_date_trans_time.map(lambda x:x.isocalendar()[1]))

In [None]:
len(np.unique(df[df["id_holder"]==200001]["d_date_trans"]))

In [None]:
# ISO week label "<ISO year>_<ISO week>". isocalendar() returns (ISO year, ISO week, weekday);
# the ISO year must accompany the ISO week because days around New Year can belong to a week
# of the adjacent year (pairing the calendar year with the ISO week mislabels them).
df["week"]=df.d_date_trans.map(lambda x:"{}_{}".format(*x.isocalendar()[:2]))
# Calendar month label "YYYY-MM".
df["mes"]=df.d_date_trans.map(lambda x:x.strftime('%Y-%m'))

In [None]:
cat=df[['mes']].drop_duplicates().sort_values(by='mes').reset_index(drop=True)
cat['id_mes']=cat.index +1

In [None]:
cat

In [None]:
df = df.merge(cat,on='mes',how='inner')

In [None]:
df.sort_values(by=['id_holder', 'd_trans_date_trans_time'], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# Days elapsed since the previous transaction of the same holder (NaN for each holder's
# first transaction). df must already be sorted by id_holder and transaction timestamp.
# A grouped diff replaces the nested Python loop, which rescanned the whole frame for every
# holder and depended on contiguous integer index labels.
list_time_tran = (
    df.groupby('id_holder')['d_trans_date_trans_time'].diff() / np.timedelta64(1, 'D')
).tolist()


In [None]:
df['time_bet_tran'] = np.array(list_time_tran)

In [None]:
completitud = pd.DataFrame((1-df.isnull().sum()/df.shape[0])*100).reset_index().rename(columns={"index":"columna",0:"completitud"})
completitud

In [None]:
df.shape, df.dropna().shape

In [None]:
df.dropna(inplace=True)

In [None]:
df.head()

In [None]:
agrupar=["id_holder","v_category","id_mes"]

In [None]:
df.rename(columns={'c_amt':'amt'}, inplace=True)

In [None]:
# One row per holder / category / month: total amount, number of transactions,
# maximum age and mean gap between transactions.
# Aggregations are given by name; passing the numpy callables (np.sum, np.max, np.mean)
# is deprecated in recent pandas and emits a FutureWarning.
df_aux=pd.pivot_table(df,index=agrupar,values=['aux', 'edad','amt', 'time_bet_tran'],
                aggfunc={'amt': 'sum',
                         'edad': 'max',
                         'aux': 'sum',
                         'time_bet_tran': 'mean'}
              )

In [None]:
df_aux.reset_index(drop=False,inplace=True)

In [None]:
df_aux


In [None]:
vobs=6

In [None]:
anclai,anclaf = cat['id_mes'].min(),cat['id_mes'].max()
anclai = anclai+vobs-1
anclai,anclaf

In [None]:
def ing(df,ancla,k):
    """Build k-month window features per (id_holder, v_category), anchored at month `ancla`.

    Parameters
    ----------
    df : DataFrame with columns id_holder, v_category, id_mes, amt, aux, edad, time_bet_tran.
    ancla : last month id (id_mes) included in the window.
    k : window length in months; the window covers id_mes in [ancla - k + 1, ancla].

    Returns
    -------
    DataFrame with one row per (id_holder, v_category) present in the window, the column
    'ancla', and for every variable:
      * x_sum_<var>_<id_mes>_<k>: value summed per month (0 where there are no rows),
      * x_{prom,max,min,std,median,total}_<var>_mes_<k>: statistics across those monthly columns.

    NOTE(review): values are summed within each month, which only makes sense for 'edad' and
    'time_bet_tran' if `df` already has one row per holder/category/month — confirm with caller.
    """
    keys = ['id_holder', 'v_category']
    in_window = (df['id_mes'] >= (ancla - k + 1)) & (df['id_mes'] <= ancla)
    window = df.loc[in_window].copy()

    final = None
    for var in ['amt', 'aux', 'edad', 'time_bet_tran']:
        aux = window.pivot_table(index=keys,
                                 columns=['id_mes'],
                                 aggfunc=['sum'],
                                 values=var,
                                 fill_value=0)
        # Flatten the (aggfunc, id_mes) column index into a single name.
        aux.columns = [f'x_{f}_{var}_{v}_{k}' for f, v in aux.columns]
        month_cols = list(aux.columns)
        aux.reset_index(inplace=True)
        aux.insert(1, 'ancla', ancla)

        # Every statistic is computed over the monthly columns only, never over another statistic.
        monthly = aux[month_cols]
        aux[f'x_prom_{var}_mes_{k}'] = monthly.mean(axis=1, skipna=True)
        aux[f'x_max_{var}_mes_{k}'] = monthly.max(axis=1, skipna=True)
        aux[f'x_min_{var}_mes_{k}'] = monthly.min(axis=1, skipna=True)
        # Name fixed to follow the same '_mes_<k>' pattern as the other statistics
        # (it was 'x_std_<var>_mes<k>', without the underscore).
        aux[f'x_std_{var}_mes_{k}'] = monthly.std(axis=1, skipna=True)
        aux[f'x_median_{var}_mes_{k}'] = monthly.median(axis=1, skipna=True)
        aux[f'x_total_{var}_mes_{k}'] = monthly.sum(axis=1, skipna=True)

        # The first variable seeds the result; this no longer depends on 'amt' being first.
        if final is None:
            final = aux
        else:
            final = pd.merge(final, aux, how="left", on=['id_holder', 'v_category', 'ancla'])

    return final



In [None]:
varc = ['amt','aux','edad', 'time_bet_tran']
um = ['id_holder','v_category','ancla']

In [None]:
cruzar = lambda x,y:pd.merge(x,y,on=um,how='outer')

In [None]:
from functools import reduce

In [None]:
# Training matrix: for every anchor month, join the window features of all window lengths
# 1..vobs on (id_holder, v_category, ancla), then stack the anchors.
# The input is df_aux (one row per holder/category/month, with age as max and gap as mean)
# rather than the raw transactions: ing() sums within each month, and summing age or gap
# over individual transactions mixes those variables with the transaction count.
X = pd.concat(
    (reduce(cruzar, (ing(df_aux, ancla, k) for k in range(1, vobs + 1)))
     for ancla in range(anclai, anclaf + 1)),
    ignore_index=True)

In [None]:
X

In [None]:
var = sorted(X.filter(like='x_').columns)
len(var)

In [None]:
completitud = pd.DataFrame((1-X.isnull().sum()/X.shape[0])*100).reset_index().rename(columns={"index":"columna",0:"completitud"})

In [None]:
completitud

In [None]:
# Share of missing values per feature; features with more than 30% missing are dropped.
# count() gives the non-null count directly, so the full describe() (quantiles, std, ...)
# is no longer computed only to read its 'count' row.
miss = (1 - X[var].count() / len(X)).to_frame('count')
fuera = sorted(miss.loc[miss['count']>0.3].index)
X.drop(fuera,axis=1,inplace=True)
var = [v for v in var if v not in fuera]

In [None]:
len(var)

In [None]:
im = SimpleImputer(strategy='median')

In [None]:
im.fit(X[var])

In [None]:
X.columns

In [None]:
# Imputed feature matrix. It reuses X's index so the key columns copied from X are aligned
# row by row even if X does not have a default 0..n-1 index.
Xi = pd.DataFrame(im.transform(X[var]),columns=var,index=X.index)
# um already contains id_holder, so a separate id_holder assignment is unnecessary.
Xi[um] = X[um]

In [None]:
Xi

In [None]:
Xi.dropna().shape,Xi.shape

In [None]:
ks = pd.DataFrame(map(lambda v:(v,ks_2samp(Xi[v],X[v].dropna()).statistic),var),columns=['var','ks'])

In [None]:
ks

In [None]:
rotas = sorted(ks.loc[ks['ks']>0.1]['var'])

In [None]:
rotas

In [None]:
Xi["id_holder"]

In [None]:
# Flag, per feature, the values strictly below its 1st or strictly above its 99th percentile.
limites = Xi[var].describe(percentiles=[0.01,0.99]).loc[['1%','99%']]
for v in var:
    li, ls = limites[v]
    fuera_de_rango = Xi[v].lt(li) | Xi[v].gt(ls)
    Xi[f'ex_{v}'] = fuera_de_rango.astype(int)
# 'ex_' is 1 when at least one feature of the row is flagged.
Xi['ex_'] = Xi.filter(like='ex_').max(axis=1)

In [None]:
Xi['ex_'].value_counts(True)

In [None]:
Xi.head()

In [None]:
def get_bounds(x):
    """Return the IQR outlier fence of `x` as an interval closed on both ends.

    The fence runs from Q1 - 1.5*IQR to Q3 + 1.5*IQR, where Q1 and Q3 are the 0.25 and
    0.75 quantiles of `x` and IQR = Q3 - Q1.
    """
    lower_q = x.quantile(0.25)
    upper_q = x.quantile(0.75)
    margin = 1.5 * (upper_q - lower_q)
    return pd.Interval(lower_q - margin, upper_q + margin, closed="both")

In [None]:
dc_out = {feat: get_bounds(Xi[feat]) for feat in var}

In [None]:
# Flag values outside the IQR fence of each feature (1 = outside).
# between() is inclusive on both ends, like the closed="both" intervals stored in dc_out,
# and is vectorised instead of testing `x not in interval` once per element.
for col in var:
    fence = dc_out[col]
    Xi[f"ol_{col}"] = (~Xi[col].between(fence.left, fence.right)).astype(int)

In [None]:
# Share of features flagged as outliers in each row. Only the per-feature "ol_" flags are
# averaged; matching on "ol" alone would also pick up the "ol" column itself on a re-run.
Xi["ol"] = Xi[[x for x in Xi.columns if x.startswith("ol_")]].mean(axis=1)

In [None]:
Xi["ol"].describe([0.8, 0.85, 0.9, 0.95, 0.96, 0.97, 0.98, 0.99])

In [None]:
Xi[Xi["ol"]<0.2].drop(columns=[x for x in Xi.columns if x.startswith("ol")]).shape[0]/Xi.shape[0]

In [None]:
Xi = Xi[Xi["ol"]<0.2].drop(columns=[x for x in Xi.columns if x.startswith("ol")])

In [None]:
Xi.shape

In [None]:
Xi[var].sample(n=2000).hist()

In [None]:
Xi.loc[Xi['ex_']==0][var].sample(n=2000).hist()

In [None]:
Xi = Xi.loc[Xi['ex_']==0].reset_index(drop=True).drop(Xi.filter(like='ex_').columns,axis=1)

In [None]:
Xi

In [None]:
len(Xi.columns)

In [None]:
# Variable clustering on a sample of at most 5000 rows. The sample size is capped at the
# number of available rows (sample(5000) raises on a smaller frame) and seeded so the
# selected variables are the same on every run.
vc = VarClusHi(df=Xi.sample(min(5000, len(Xi)), random_state=42),feat_list=var)
vc.varclus()
rs = vc.rsquare
# Rank the variables inside each cluster by RS_Ratio; id == 1 is the lowest ratio.
rs = rs.sort_values(by=['Cluster','RS_Ratio']).reset_index(drop=True)
rs['id'] = rs.groupby('Cluster').cumcount()+1
rs

In [None]:
var = sorted(rs.loc[rs['id']==1]['Variable'])

In [None]:
Xi[var].corr()

In [None]:
var

In [None]:
Xi.shape

In [None]:
# 10% sample used for the dimensionality reduction and clustering below; seeded so that
# the clusters and their profiles are reproducible.
Xi_aux=Xi.sample(frac=0.1, random_state=42)

In [None]:
Xi_aux.shape

## PCA

In [None]:
sc = StandardScaler()
pca = PCA(n_components=3)
Xp = pd.DataFrame(pca.fit_transform(sc.fit_transform(Xi_aux[var])))
print(pca.explained_variance_ratio_.cumsum())
Xp

## Visualización

In [None]:
Xp.sample(frac=0.07).iplot(kind='scatter3d',x=0,y=1,z=2,mode='markers',color='purple')

In [None]:
Xp.sample(frac=0.07).iplot(kind='scatter',x=0,y=1,mode='markers',color='purple')

## Aglomerativo

In [None]:
sc = MinMaxScaler()
Xs = pd.DataFrame(sc.fit_transform(Xi_aux[var]),columns=var)

In [None]:
#Con la nueva variable time_bet_tran
sil = pd.DataFrame(map(lambda k:(k,silhouette_score(Xs,
                                              AgglomerativeClustering(n_clusters=k).fit_predict(Xs))),
                 range(2,10)),columns=['k','sil'])
plt.plot(sil['k'],sil['sil'],marker='o')

In [None]:
plt.plot(sil['k'],sil['sil'],marker='o')

In [None]:
k = 4
tipo = 'agg'
agg = AgglomerativeClustering(n_clusters=k)
Xi_aux[f'cl_{tipo}']=Xp[f'cl_{tipo}']=agg.fit_predict(Xs[var])

## K-Medias

In [None]:
# With the new time_bet_tran variable.
# Silhouette score of KMeans for k = 2..9; random_state fixes the centroid initialisation
# so the curve (and the k chosen from it) is the same on every run.
sil = pd.DataFrame(map(lambda k:(k,silhouette_score(Xs,
                                              KMeans(n_clusters=k,max_iter=1000,random_state=42).fit_predict(Xs))),
                 range(2,10)),columns=['k','sil'])
plt.plot(sil['k'],sil['sil'],marker='o')

In [None]:
# Silhouette score of KMeans for k = 2..9; random_state fixes the centroid initialisation
# so the curve is the same on every run.
sil = pd.DataFrame(map(lambda k:(k,silhouette_score(Xs,
                                              KMeans(n_clusters=k,max_iter=1000,random_state=42).fit_predict(Xs))),
                 range(2,10)),columns=['k','sil'])
plt.plot(sil['k'],sil['sil'],marker='o')

In [None]:
# Final KMeans model with k = 3; seeded so the cluster labels are stable across runs.
k = 3
tipo = 'kme'
kme = KMeans(n_clusters=k,max_iter=1000,random_state=42)
# fit_predict returns an array, so both frames receive the labels by position.
Xi_aux[f'cl_{tipo}']=Xp[f'cl_{tipo}']=kme.fit_predict(Xs[var])

## Gaussianos Mixtos

In [None]:
# With the new time_bet_tran variable.
# Silhouette score of Gaussian mixtures with 2..9 components; random_state fixes the
# initialisation so the curve is the same on every run.
sil = pd.DataFrame(map(lambda k:(k,silhouette_score(Xs,
                                              GaussianMixture(n_components=k,max_iter=1000,random_state=42).fit_predict(Xs))),
                 range(2,10)),columns=['k','sil'])
plt.plot(sil['k'],sil['sil'],marker='o')

In [None]:
# Silhouette score of Gaussian mixtures with 2..9 components; random_state fixes the
# initialisation so the curve is the same on every run.
sil = pd.DataFrame(map(lambda k:(k,silhouette_score(Xs,
                                              GaussianMixture(n_components=k,max_iter=1000,random_state=42).fit_predict(Xs))),
                 range(2,10)),columns=['k','sil'])
plt.plot(sil['k'],sil['sil'],marker='o')

In [None]:
# Final Gaussian mixture with 5 components; seeded so the cluster labels are stable across runs.
k = 5
tipo = 'gmm'
gmm = GaussianMixture(n_components=k,max_iter=1000,random_state=42)
# fit_predict returns an array, so both frames receive the labels by position.
Xi_aux[f'cl_{tipo}']=Xp[f'cl_{tipo}']=gmm.fit_predict(Xs[var])

In [None]:
Xi_aux

In [None]:
Xp

In [None]:
# Cast every cluster-label column to string so the labels are treated as categories.
varcl = sorted(Xi_aux.filter(like='cl_'))
for v in varcl:
    Xp[v] = Xp[v].astype(str)
    # The labels live in Xi_aux; Xi has no cl_* columns, so reading Xi[v] raised KeyError.
    Xi_aux[v] = Xi_aux[v].astype(str)

# Silhouette score of each clustering on the scaled features, as a bar chart.
pd.DataFrame(map(lambda cl:(cl,silhouette_score(Xs,Xi_aux[cl])),varcl),columns=['cluster','sil']).iplot(kind='bar',categories='cluster')

In [None]:
    
pd.DataFrame(map(lambda cl:(cl,silhouette_score(Xs,Xi_aux[cl])),varcl),columns=['cluster','sil']).iplot(kind='bar',categories='cluster')

In [None]:
Xs

## Perfilamiento

In [None]:
var

In [None]:
pd.DataFrame(map(lambda v:(v,
              kruskal(*[d[v].reset_index(drop=True) for cl,d in Xi_aux[['cl_kme',v]].groupby('cl_kme')]).pvalue),var),
             columns=['variable','p-value']).round(2)

In [None]:
for v in var:
    print(v)
    display(MultiComparison(Xi_aux[v],Xi_aux['cl_kme']).tukeyhsd().summary())
    plt.figure()
    sns.boxplot(data=Xi_aux,y=v,x='cl_kme')

In [None]:
display(Xi_aux[var+['cl_kme']].groupby('cl_kme').mean())
display(Xi_aux[var].mean().to_frame().T)

### Comparando con fraudes

In [None]:
Xp1 = Xp.copy()

In [None]:
Xp1["id_holder"] = Xi_aux["id_holder"].reset_index(drop=True)

In [None]:
Xp1=pd.merge(Xp1,trans,how="left",on="id_holder")

In [None]:
fraud["is_fraud"] = 1

In [None]:
Xp1 = pd.merge(Xp1,fraud,how="left",on="trans_num")

In [None]:
Xp1["is_fraud"]=Xp1["is_fraud"].fillna(0)

In [None]:
# Fraudulent transactions per holder and cluster assignment.
# Xp1 was joined to trans on id_holder only, so each transaction appears once for every
# sampled row of its holder. Keep one copy per (holder, cluster labels, transaction)
# before summing, otherwise the fraud counts are multiplied by the number of sampled rows.
Xp_t = Xp1.drop_duplicates(subset=['id_holder','cl_agg','cl_kme','cl_gmm','trans_num'])\
        .groupby(['id_holder','cl_agg','cl_kme','cl_gmm'])\
        .aggregate({'is_fraud':'sum'})\
        .reset_index()

In [None]:
Xp_t["is_fraud"].value_counts(1)

In [None]:
for cluster in ["cl_agg",'cl_kme','cl_gmm']:
    Xp_t.boxplot(column="is_fraud",by=cluster)