# Generación de la base de datos de clientes y de las transacciones

In [7]:
# Libraries to import 
import random
import pandas as pd
import datetime
import time
from azure.eventhub import EventHubProducerClient, EventData
import json

StatementMeta(, 353e6ef4-3d09-4a45-aefb-fb91ec5e8b02, 14, Finished, Available, Finished)

In [25]:
# Configure the Event Hub producer client used to publish transactions.
# NOTE(review): `connection_str` and `entity_name` are not defined in the visible cells —
# presumably they are set elsewhere in the notebook; confirm before running this cell.
producer = EventHubProducerClient.from_connection_string(conn_str=connection_str, eventhub_name=entity_name)

StatementMeta(, 1b1f9f77-5627-4bf2-b45b-e08f9306d116, 42, Finished, Available, Finished)

In [4]:
class BankPopulation():
    """Build a synthetic customer population with randomly assigned KYC/AML attributes.

    ``create_customers`` produces the base DataFrame of customer ids. Every
    other public method adds one or two columns to the DataFrame it receives
    (in place) and returns that same DataFrame, so calls can be chained.

    Each attribute method generates exactly ``num_customers`` values, so the
    DataFrame passed in must have ``num_customers`` rows.
    """

    # Sampling weights. For the two-element lists the weights apply to the
    # values [1, 0] in that order (e.g. PROB_KYC -> 85% of customers get 1).
    PROB_KYC = [0.85, 0.15]
    PROB_PEP = [0.03, 0.97]
    # Weights per region tier, in order: prohibited, high risk restricted,
    # high risk, medium risk, low risk.
    PROB_REGION = [0.02, 0.05, 0.15, 0.30, 0.48]
    PROB_EMPLOYED = [0.94, 0.06]
    # Weights per NAICS tier, in order: high risk, low risk.
    PROB_RISK_NAIC = [0.05, 0.95]
    PROB_NAME_SCREANING_HIT = [0.05, 0.95]
    PROB_ADVERSE_MEDIA_SCREANING_HIT = [0.03, 0.97]

    def __init__(self, num_customers: int):
        """Store the size of the population to generate.

        params: num_customers: number of customers (rows) to generate
        """
        self.num_customers = num_customers

    def __assign_value(self, values: list, prob: list, n_cust: int) -> list:
        """Draw ``n_cust`` items from ``values`` (with replacement) using ``prob`` as weights."""
        return random.choices(values, weights=prob, k=n_cust)

    def create_customers(self) -> pd.DataFrame:
        """
        Method to create the customer ids for a population of the configured size.

        Ids are the numbers 1..num_customers zero-padded to 9 digits and
        returned in random order.

        params: self
        returns: Dataframe with a single "customer_id" column
        """
        possible_ids = [f"{i:09d}" for i in range(1, self.num_customers + 1)]
        # Sampling the whole list without replacement shuffles the ids and
        # guarantees there are no duplicates.
        shuffled_ids = random.sample(possible_ids, self.num_customers)
        return pd.DataFrame(shuffled_ids, columns=["customer_id"])

    def kyc_identification(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Method to randomly create the variable is_kyc_identified (1/0), which indicates
        whether the customer has sent official documents to identify themselves.

        params: df: Dataframe with customers ids
        returns: df: the same Dataframe with the "is_kyc_identified" column added
        """
        df["is_kyc_identified"] = self.__assign_value([1, 0], self.PROB_KYC, self.num_customers)
        return df

    def is_customer_pep(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Method to randomly create the variable is_customer_pep (1/0), which indicates
        whether the customer is a Politically Exposed Person.

        params: df: Dataframe with customers ids
        returns: df: the same Dataframe with the "is_customer_pep" column added
        """
        df["is_customer_pep"] = self.__assign_value([1, 0], self.PROB_PEP, self.num_customers)
        return df

    def customer_region(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Method to randomly assign the customer region (a country code). Countries belong
        to five tiers: prohibited, high risk restricted, high risk, medium risk, low risk.

        Each country is weighted with its tier's entry in PROB_REGION. The weight
        is applied per country, so a tier with more countries receives
        proportionally more of the total probability.

        params: df: Dataframe with customers ids
        returns: df: the same Dataframe with the "region" column added
        """
        prohibited_countries = ["IRN", "PRK", "SYR"]
        high_restricted_risk_country = ["AFG", "BLR", "CUB", "SSD", "VEN"]
        high_risk_country = ["ARG", "AZE", "BRB", "ECU"]
        medium_risk_countries = ["BTN", "ISR", "IMN", "KWT", "LUX"]
        low_risk_countries = ["ESP", "NOR", "USA", "PRT", "FRA", "DEU"]

        # Tiers listed in the same order as PROB_REGION.
        tiers = [
            prohibited_countries,
            high_restricted_risk_country,
            high_risk_country,
            medium_risk_countries,
            low_risk_countries,
        ]

        countries = []
        weights = []
        for tier, tier_weight in zip(tiers, self.PROB_REGION):
            countries.extend(tier)
            weights.extend([tier_weight] * len(tier))

        df["region"] = self.__assign_value(countries, weights, self.num_customers)
        return df

    def is_employed(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Method to randomly determine whether the customer is employed (1) or unemployed (0).

        params: df: Dataframe with customers ids
        returns: df: the same Dataframe with the "is_employed" column added
        """
        df["is_employed"] = self.__assign_value([1, 0], self.PROB_EMPLOYED, self.num_customers)
        return df

    def industry_naic_code(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Method to randomly determine the customer's NAICS code.

        Each code is weighted with its tier's entry in PROB_RISK_NAIC
        (high risk, low risk); the weight is applied per code.

        params: df: Dataframe with customers ids
        returns: df: the same Dataframe with the "naic_code" column added
        """
        high_risk_naics = {
            "441110": "New car dealers",
            "441210": "Recreational Vehicle Dealers",
            "522390": "Other Activities Related to Credit Intermediation",
            "713210": "Casinos (except Casino Hotels) ",
        }
        low_risk_naics = {
            "111140": "Wheat Farming",
            "111920": "Cotton Farming",
            "541120": "Offices of Notaries",
            "541410": "Interior Design Services",
            "922160": "Fire Protection",
        }

        high_weight, low_weight = self.PROB_RISK_NAIC[0], self.PROB_RISK_NAIC[1]
        naic_codes = list(high_risk_naics) + list(low_risk_naics)
        weights = [high_weight] * len(high_risk_naics) + [low_weight] * len(low_risk_naics)

        df["naic_code"] = self.__assign_value(naic_codes, weights, self.num_customers)
        return df

    def name_screaning_hit(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Method to randomly create the variable name_screaning_hit (1/0), sampled
        with the PROB_NAME_SCREANING_HIT weights.

        params: df: Dataframe with customers ids
        returns: df: the same Dataframe with the "name_screaning_hit" column added
        """
        df["name_screaning_hit"] = self.__assign_value(
            [1, 0], self.PROB_NAME_SCREANING_HIT, self.num_customers
        )
        return df

    def adverse_media_screaning_hit(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Method to randomly create the variable adverse_media_screaning_hit (1/0), sampled
        with the PROB_ADVERSE_MEDIA_SCREANING_HIT weights.

        params: df: Dataframe with customers ids
        returns: df: the same Dataframe with the "adverse_media_screaning_hit" column added
        """
        # Use the adverse-media weights, not the name-screening ones.
        df["adverse_media_screaning_hit"] = self.__assign_value(
            [1, 0], self.PROB_ADVERSE_MEDIA_SCREANING_HIT, self.num_customers
        )
        return df

    def date_of_birth_minor(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Method to randomly assign a date of birth and the derived is_minor flag.

        Dates are drawn from three groups, relative to the current date:
          - 15%: between 14*365 and 10*365 days ago (youngest group)
          - 15%: between 88*365 and 76*365 days ago (oldest group)
          - 70%: anywhere between 14*365 and 88*365 days ago
        is_minor is 1 when the number of complete years (days // 365.25) is below 18.

        params: df: Dataframe with customers ids
        returns: df: the same Dataframe with "date_of_birth" (dd/mm/YYYY string)
                 and "is_minor" (1/0) columns added
        """
        now = datetime.datetime.now()
        youngest_limit = now - datetime.timedelta(days=14 * 365)
        oldest_limit = now - datetime.timedelta(days=88 * 365)
        # Loop-invariant: number of days between the two limits.
        span_days = (youngest_limit - oldest_limit).days

        # Thresholds on a uniform draw: below -> youngest group, above -> oldest group.
        young_threshold = 0.15
        old_threshold = 0.85

        date_of_birth = []
        is_minor = []

        # One date per customer so the new columns match the DataFrame length.
        for _ in range(self.num_customers):
            draw = random.random()

            if draw < young_threshold:
                birth = youngest_limit + datetime.timedelta(days=random.randint(0, 4 * 365))
            elif draw > old_threshold:
                birth = oldest_limit + datetime.timedelta(days=random.randint(1, 12 * 365))
            else:
                birth = youngest_limit - datetime.timedelta(days=random.randint(0, span_days))

            complete_years = (now - birth).days // 365.25
            date_of_birth.append(birth.strftime("%d/%m/%Y"))
            is_minor.append(int(complete_years < 18))

        df["date_of_birth"] = date_of_birth
        df["is_minor"] = is_minor

        return df

StatementMeta(, 6e36ff66-d28e-4275-8230-727f002245e4, 6, Finished, Available, Finished)

In [27]:
# Generador de transacciones
from random import choices, lognormvariate, choice
from itertools import chain
import time
import pandas as pd


class TransactionGeneration():
    """Generate random transactions, persist them in chunks and stream flagged ones.

    Relies on two notebook-level globals that are not created here:
    ``spark`` (used to read/write the transaction tables) and ``producer``
    (the Event Hub client used to publish events).
    """

    def __init__(self, max_transacciones: int, chunk_size: int):
        """Store the generation limits.

        params: max_transacciones: generation stops once exactly this many
                    transactions have been produced
                chunk_size: number of transactions buffered before each write
        """
        self.max_transacciones = max_transacciones
        self.chunk_size = chunk_size

    def fetch_customers(self, df):
        """
        Collect the customer ids each transaction can be assigned to.

        params: df: DataFrame exposing ``select(...).collect()`` with a "customer_id" column
        returns: list with the value of "customer_id" for every collected row
        """
        return [row.customer_id for row in df.select("customer_id").collect()]

    def _persist_chunk(self, rows: list, columns: list) -> None:
        """Append the buffered rows to the bronze Delta path and to the transactions_raw table."""
        chunk_df = spark.createDataFrame(pd.DataFrame(rows, columns=columns))
        chunk_df.write.format("delta").mode("append").save("Files/bronze/transactions")
        chunk_df.write.format("delta").mode("append").saveAsTable("transactions_raw")

    def produce_transaction(self, customers: list[str]):
        """
        Generate random transactions until max_transacciones is reached or the
        run is interrupted with KeyboardInterrupt.

        Each transaction has:
          transaction_id     continues from MAX(transaction_id) in transactions_raw
          date               timestamp at generation time
          customer_id        random element of ``customers``
          quantity           log-normal amount rounded to 2 decimals
          transfer_type      cash withdraw, cash top up, transfer sent, transfer recieved
          country_of_origin / country_of_destiny
                             weighted random countries; equal for cash operations
          inbound_outbound   "outbound" for withdrawals and sent transfers, else "inbound"

        Transactions whose origin or destination is a prohibited country are
        also sent to the Event Hub as JSON. Every ``chunk_size`` transactions
        the buffer is written out; any remaining partial chunk is written when
        generation stops. The global ``producer`` is closed on exit.

        params: customers: list of customer ids to draw from
        """
        # Countries for transactionality
        prohibited_countries = ["IRN", "PRK", "SYR"]
        high_restricted_risk_country = ["AFG", "BLR", "CUB", "SSD", "VEN"]
        high_risk_country = ["ARG", "AZE", "BRB", "ECU"]
        medium_risk_countries = ["BTN", "ISR", "IMN", "KWT", "LUX"]
        low_risk_countries = ["NOR", "USA", "PRT", "FRA", "DEU"]
        home = ["ESP"]
        # One weight per tier, in the order of the lists above.
        prob_region = [0.01, 0.01, 0.01, 0.03, 0.05, 0.89]
        prob_transfer = [0.50, 0.05, 0.225, 0.225]

        tiers = [
            prohibited_countries,
            high_restricted_risk_country,
            high_risk_country,
            medium_risk_countries,
            low_risk_countries,
            home,
        ]

        # Flattened country list with its per-country weight, built once
        # instead of on every iteration.
        countries = []
        country_weights = []
        for tier, tier_weight in zip(tiers, prob_region):
            countries.extend(tier)
            country_weights.extend([tier_weight] * len(tier))

        # Transaction type.
        transaction_type = ["cash withdraw", "cash top up", "transfer sent", "transfer recieved"]
        cash_types = ("cash withdraw", "cash top up")
        outbound_types = ("cash withdraw", "transfer sent")

        # Column order of the persisted rows.
        columns = [
            "transaction_id",
            "date",
            "customer_id",
            "quantity",
            "transfer_type",
            "country_of_origin",
            "country_of_destiny",
            "inbound_outbound",
        ]

        pending_rows = []  # rows generated since the last write
        total_transacciones = 0

        try:
            # MAX() yields NULL/None on an empty table; start from 0 in that case.
            transaction_id = spark.sql("SELECT MAX(transaction_id) FROM transactions_raw").collect()[0][0] or 0
        except Exception:
            # Best effort: e.g. the table does not exist yet on the first run.
            transaction_id = 0

        try:
            try:
                while True:
                    customer_id = choice(customers)
                    quantity = round(lognormvariate(mu=5, sigma=3), 2)
                    transfer_type = choices(transaction_type, prob_transfer, k=1)[0]
                    country_of_origin = choices(countries, country_weights, k=1)[0]
                    country_of_destiny = choices(countries, country_weights, k=1)[0]
                    date = pd.Timestamp.now()
                    transaction_id += 1

                    # Cash operations happen in a single country.
                    if transfer_type in cash_types:
                        country_of_destiny = country_of_origin

                    inbound_outbound = "outbound" if transfer_type in outbound_types else "inbound"

                    # Send the transaction to the event stream when a prohibited country is involved.
                    if (country_of_origin in prohibited_countries) or (country_of_destiny in prohibited_countries):
                        transaction = {
                            "transaction_id": transaction_id,
                            "date": str(date),
                            "customer_id": customer_id,
                            "quantity": quantity,
                            "transfer_type": transfer_type,
                            "country_of_origin": country_of_origin,
                            "country_of_destiny": country_of_destiny,
                            "inbound_outbound": inbound_outbound,
                        }
                        event_data_batch = producer.create_batch()
                        event_data_batch.add(EventData(body=json.dumps(transaction)))
                        producer.send_batch(event_data_batch)

                    pending_rows.append([
                        transaction_id,
                        date,
                        customer_id,
                        quantity,
                        transfer_type,
                        country_of_origin,
                        country_of_destiny,
                        inbound_outbound,
                    ])

                    if len(pending_rows) == self.chunk_size:
                        self._persist_chunk(pending_rows, columns)
                        pending_rows = []
                        print(f"Carga de la transaccion numero {total_transacciones + 1} finalizada")

                    # Stop criterion so that not too many transactions are generated.
                    total_transacciones += 1
                    if total_transacciones == self.max_transacciones:
                        print("Transaciones maximas cargadas. Saliendo del programa.")
                        break

                    # time.sleep(0.5)
            except KeyboardInterrupt:
                print("Generacion de transacciones finalizada")

            # Persist the trailing partial chunk so generated transactions
            # (and their ids) are not lost when generation stops.
            if pending_rows:
                self._persist_chunk(pending_rows, columns)
        finally:
            # Close the client.
            producer.close()



StatementMeta(, 1b1f9f77-5627-4bf2-b45b-e08f9306d116, 44, Finished, Available, Finished)

In [6]:
# Read up to 1000 rows from the raw transactions table and print the resulting schema.
df = spark.sql("SELECT * FROM aml_tm.transactions_raw LIMIT 1000")

df.printSchema()

StatementMeta(, b1c192f0-af34-4ae2-b922-72d89e4ba06d, 10, Finished, Available, Finished)

root
 |-- transaction_id: long (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- quantity: double (nullable = true)
 |-- transfer_type: string (nullable = true)
 |-- country_of_origin: string (nullable = true)
 |-- country_of_destiny: string (nullable = true)
 |-- inbound_outbound: string (nullable = true)

