In [28]:
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import asc, col, isnan, when, count, median, udf, concat, month, year, substring, lit
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import os
import duckdb
import pyarrow
import pandas as pd

In [None]:
conn = duckdb.connect('data_ana.db')

conf = SparkConf() \
    .setAppName("data_ana") \
    .set("spark.driver.memory", "12g")\
    .set("spark.executor.cores","8") \
    .set("spark.sql.execution.arrow.pyspark.enabled","true")
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [4]:
csv_file_path ="/workspace/data.csv"
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)
df.orderBy(asc("fecha_dato"))

                                                                                

DataFrame[fecha_dato: date, ncodpers: double, ind_empleado: string, pais_residencia: string, sexo: string, age: string, fecha_alta: date, ind_nuevo: string, antiguedad: string, indrel: string, ult_fec_cli_1t: date, indrel_1mes: string, tiprel_1mes: string, indresi: string, indext: string, conyuemp: string, canal_entrada: string, indfall: string, tipodom: string, cod_prov: string, nomprov: string, ind_actividad_cliente: string, renta: double, segmento: string, ind_ahor_fin_ult1: int, ind_aval_fin_ult1: int, ind_cco_fin_ult1: int, ind_cder_fin_ult1: int, ind_cno_fin_ult1: int, ind_ctju_fin_ult1: int, ind_ctma_fin_ult1: int, ind_ctop_fin_ult1: int, ind_ctpp_fin_ult1: int, ind_deco_fin_ult1: int, ind_deme_fin_ult1: int, ind_dela_fin_ult1: int, ind_ecue_fin_ult1: int, ind_fond_fin_ult1: int, ind_hip_fin_ult1: int, ind_plan_fin_ult1: int, ind_pres_fin_ult1: int, ind_reca_fin_ult1: int, ind_tjcr_fin_ult1: int, ind_valo_fin_ult1: int, ind_viv_fin_ult1: int, ind_nomina_ult1: string, ind_nom_pen

In [7]:
df = df.withColumn("ncodpers", df["ncodpers"].cast(IntegerType()))
df = df.withColumn("ind_nomina_ult1", df["ind_nomina_ult1"].cast(IntegerType()))
df = df.withColumn("ind_nom_pens_ult1", df["ind_nom_pens_ult1"].cast(IntegerType()))

In [None]:
feature_list = [ "ind_ahor_fin_ult1",
    "ind_aval_fin_ult1",
    "ind_cco_fin_ult1",
    "ind_cder_fin_ult1",
    "ind_cno_fin_ult1",
    "ind_ctju_fin_ult1",
    "ind_ctma_fin_ult1",
    "ind_ctop_fin_ult1",
    "ind_ctpp_fin_ult1",
    "ind_deco_fin_ult1",
    "ind_deme_fin_ult1",
    "ind_dela_fin_ult1",
    "ind_ecue_fin_ult1",
    "ind_fond_fin_ult1",
    "ind_hip_fin_ult1",
    "ind_plan_fin_ult1",
    "ind_pres_fin_ult1",
    "ind_reca_fin_ult1",
    "ind_tjcr_fin_ult1",
    "ind_valo_fin_ult1",
    "ind_viv_fin_ult1",
    "ind_nomina_ult1",
    "ind_nom_pens_ult1",
    "ind_recibo_ult1"]

feature_cols = [F.col(col) for col in feature_list]
features_array = F.array(*feature_cols).alias("features")

grouped_df = df.select("ncodpers", *feature_list).withColumn("features", features_array) \
    .groupBy("ncodpers") \
    .agg(F.collect_list("features").alias("features_list"))

customer_feature = {row["ncodpers"]: row["features_list"] for row in grouped_df.collect()}

                                                                                

In [27]:
counts = {feature: 0 for feature in feature_list}

for features in customer_feature.values():
    for feat_array in features:
        for idx, val in enumerate(feat_array):
            counts[feature_list[idx]] += 1 if val == 1 else 0

In [30]:
feature_occur_times = spark.createDataFrame(list(counts.items()), ["feature", "count"])

In [31]:
feature_occur_times = feature_occur_times.toPandas()

In [None]:
conn.sql("CREATE TABLE feature_occur_times AS SELECT * FROM feature_occur_times")

ParserException: Parser Error: syntax error at or near "DATABASE"