In [1]:
import pandas as pd
# Raise pandas' display limits to 500 columns and 500 rows so that wide or
# long frames are rendered in full instead of being truncated.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, 500)

## Analizamos cómo agrupar interacciones por CIF_ID

In [2]:
# Load the interactions table from the Feather file.
df = pd.read_feather("interacciones.feather")

In [3]:
# Preview the first five rows of the loaded data.
df.head(5)

Unnamed: 0,CIF_ID,IN_OUT,FECHA
0,19385529.0,O,1/11/2018
1,13890033.0,O,1/11/2018
2,13890033.0,O,1/11/2018
3,2156057.0,O,1/11/2018
4,18472299.0,O,1/11/2018


In [4]:
# Frequency of each IN_OUT value; dropna=False also counts missing values.
df["IN_OUT"].value_counts(dropna=False)

O    24374873
I    11751905
A     1199592
Name: IN_OUT, dtype: int64

In [5]:
# Keep only the rows whose IN_OUT value is one of the three listed codes.
df = df.loc[df["IN_OUT"].isin(["O", "I", "A"])]

In [6]:
# Keep at most the first 10 characters of every FECHA string (the width of a
# dd/mm/yyyy date); presumably this drops a trailing time part -- TODO confirm.
df["FECHA"] = df["FECHA"].str[:10]

In [7]:
# Rows whose FECHA contains a space followed by a digit are cut to their first
# 8 characters. Presumably these are short d/m/yyyy dates that still carry the
# beginning of a time component -- TODO confirm against the raw data.
# The mask is computed once and reused for both the selection and the
# assignment, instead of running the regex over the whole column twice.
fecha_con_resto = df["FECHA"].str.contains(" [0-9]", na=False)
df.loc[fecha_con_resto, "FECHA"] = df.loc[fecha_con_resto, "FECHA"].str.slice(stop=8)

In [8]:
# Remove every remaining space from FECHA. regex=False makes the literal
# replacement explicit, so the result does not depend on the pandas version's
# default for `regex`.
df["FECHA"] = df["FECHA"].str.replace(" ", "", regex=False)

In [9]:
def to_date(s):
    """Convert a Series of day/month/year strings to timestamps.

    Each distinct value in ``s`` is parsed once with the ``%d/%m/%Y`` format
    and the results are mapped back onto ``s``, so repeated dates are not
    parsed again.

    Parameters
    ----------
    s : pandas.Series
        Date strings such as ``"1/11/2018"``.

    Returns
    -------
    pandas.Series
        The parsed value for every element of ``s``, on the same index.
    """
    parsed_by_text = {}
    for text in s.unique():
        parsed_by_text[text] = pd.to_datetime(text, format="%d/%m/%Y")
    return s.map(parsed_by_text)

# Parse the cleaned FECHA strings into timestamps; the result is stored in the
# "periodo" column.
df["periodo"] = to_date(df["FECHA"])

Transformamos la fecha en año y mes

In [10]:
def to_yearmonth(s):
    """Format a Series of timestamps as ``'YYYY-MM'`` strings.

    Each distinct value in ``s`` is formatted once and the results are mapped
    back onto ``s``, so repeated dates are not formatted again.

    Missing values are skipped when the lookup is built, because ``NaT``
    cannot be formatted with ``strftime``; they come out as missing values in
    the result instead of raising.

    Parameters
    ----------
    s : pandas.Series
        Timestamps (or values accepted by ``pandas.Timestamp``).

    Returns
    -------
    pandas.Series
        The ``'YYYY-MM'`` label for every element of ``s``, on the same index.
    """
    labels = {date: pd.Timestamp(date).strftime('%Y-%m') for date in s.dropna().unique()}
    return s.map(labels)

# Replace the parsed timestamps in "periodo" with their 'YYYY-MM' label. Rows
# with a missing timestamp are dropped before formatting; the column
# assignment aligns on the index, so those rows are left without a value.
df["periodo"] = to_yearmonth(df["periodo"].dropna())

In [11]:
# Number of interactions per year-month label, drawn as a bar chart with the
# labels in sorted order.
conteo_por_periodo = df["periodo"].value_counts()
conteo_por_periodo.sort_index().plot.bar()

<matplotlib.axes._subplots.AxesSubplot at 0x1e147669630>

In [12]:
# Map each year-month label to a 1-based period index.
# The labels are sorted first ('YYYY-MM' strings sort chronologically), so the
# numbering no longer depends on the order in which periods happen to appear
# in the frame. dropna() keeps a missing period from receiving an index.
data = {fecha: i for i, fecha in enumerate(sorted(df["periodo"].dropna().unique()), start=1)}
data

{'2018-11': 1,
 '2018-12': 2,
 '2019-01': 3,
 '2019-02': 4,
 '2019-03': 5,
 '2019-04': 6,
 '2019-05': 7,
 '2019-06': 8,
 '2019-07': 9,
 '2019-08': 10,
 '2019-09': 11,
 '2019-10': 12,
 '2019-11': 13}

In [13]:
# Translate each year-month label into its integer period index using `data`.
df["periodo_int"] = df["periodo"].map(data)

In [14]:
# Keep rows whose period index is below 19. Comparisons against NaN are False,
# so rows without a period index are dropped as well.
# NOTE(review): the mapping displayed above only goes up to 13, so on this data
# the threshold itself does not exclude any period -- confirm 19 is intended.
df = df[df["periodo_int"] < 19]

In [15]:
# Preview the first rows, now including the derived "periodo" and
# "periodo_int" columns.
df.head()

Unnamed: 0,CIF_ID,IN_OUT,FECHA,periodo,periodo_int
0,19385529.0,O,1/11/2018,2018-11,1
1,13890033.0,O,1/11/2018,2018-11,1
2,13890033.0,O,1/11/2018,2018-11,1
3,2156057.0,O,1/11/2018,2018-11,1
4,18472299.0,O,1/11/2018,2018-11,1


In [16]:
# Narrow the frame to the three columns used by the pivot.
to_pivot = df.loc[:, ["CIF_ID", "IN_OUT", "periodo_int"]]

In [17]:
# Count interactions per CIF_ID (rows) for every (periodo_int, IN_OUT) pair
# (two-level columns). aggfunc="size" counts rows; combinations that never
# occur show up as NaN rather than 0 in the displayed result.
pivoted = to_pivot.pivot_table(index=["CIF_ID"], columns=["periodo_int","IN_OUT"], aggfunc="size")
pivoted

periodo_int,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10,10,11,11,11,12,12,12,13,13,13
IN_OUT,A,I,O,A,I,O,A,I,O,A,I,O,A,I,O,A,I,O,A,I,O,A,I,O,A,I,O,A,I,O,A,I,O,A,I,O,A,I,O
CIF_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2
0.0,,,2029.0,,,1206.0,,,1394.0,,,994.0,,,1191.0,,,1195.0,,,1216.0,,,356.0,,,1143.0,,,1100.0,,,880.0,,,1432.0,,,2778.0
137.0,,,,,,,,,,,,1.0,,,,,,1.0,,,,,,,,,,,,,,,,,,2.0,,1.0,
162.0,,,1.0,,,1.0,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
240.0,,,,,,,,,,,,,,,,,,,,3.0,1.0,,,,,,,,,,,,,,,,,,
269.0,,,,,,,,,,,,,,,1.0,,2.0,2.0,,,,,,,,,,,,,,,,,,,,,
571.0,,2.0,2.0,,,2.0,,,1.0,,2.0,2.0,,,2.0,,1.0,2.0,,,1.0,,,1.0,,,2.0,,,2.0,,,1.0,,,2.0,,,4.0
573.0,,,1.0,,,,,,1.0,,,,,,,,,1.0,,,1.0,,,,,,,,,1.0,,,1.0,,,1.0,,,1.0
1009.0,,,,,,1.0,,,1.0,,,,,,,,,,,,1.0,,,,,,,,,1.0,,,1.0,,,,,,1.0
1075.0,,,,,,,,,,,,,,1.0,11.0,,,1.0,,1.0,2.0,,,,,,,1.0,16.0,14.0,,5.0,4.0,,2.0,4.0,,2.0,2.0
1103.0,,,1.0,,,1.0,,,,,,1.0,,1.0,,,,1.0,,,,,,1.0,1.0,,,,1.0,1.0,,,3.0,,,1.0,,,2.0


In [18]:
# Flatten the pivot into a plain DataFrame: to_records() turns the index into
# a regular field and gives each two-level column key a single field name.
# NOTE(review): the names apparently come out as the string form of the tuple
# key (the rename applied afterwards strips "(", ")", ", " and "'"); that
# depends on how the keys are rendered as strings -- confirm when upgrading.
df_interacciones = pd.DataFrame(pivoted.to_records())

In [19]:
def _flatten_column_name(name):
    """Strip parentheses and single quotes and turn ', ' into '_TIPOINT_'."""
    # Replacements are applied in this exact order.
    for old, new in (("(", ""), (")", ""), (", ", "_TIPOINT_"), ("'", "")):
        name = name.replace(old, new)
    return name

df_interacciones = df_interacciones.rename(columns=_flatten_column_name)

In [20]:
# Persist the flattened table to a Feather file. The index is reset to a
# default one before writing.
df_interacciones.reset_index(drop=True).to_feather("interacciones_tipo_periodo_x_cif_id.feather")

In [21]:
# Dimensions of the exported table as (rows, columns).
df_interacciones.shape

(1636467, 40)