#### Upload Data from S3 to PostgreSQL

In [10]:
import boto3
import pandas as pd
import psycopg2
from io import StringIO
from sqlalchemy import create_engine, text
import glob
import os
from dotenv import load_dotenv


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Notebook-wide setup: silence every warning category so cell output stays
# readable (this hides deprecation notices as well).
warnings.simplefilter('ignore')

# Show every column when a DataFrame is displayed instead of eliding the middle.
pd.options.display.max_columns = None

In [13]:
# Connection setup for PostgreSQL and the S3 source files.
#
# Produces: `db_params` (dict of raw connection settings), `engine`
# (SQLAlchemy engine), `files` (dataset name -> S3 URI) and one DataFrame per
# dataset (`applications`, `bureau`, `profiles`, `predictions`, `transactions`).
from urllib.parse import quote_plus

# Load the .env file FIRST: os.getenv() only sees variables defined in .env
# after load_dotenv() has copied them into the process environment.
load_dotenv()

# PostgreSQL Configuration
db_params = {
    "host": os.getenv("DB_HOST"),
    "port": os.getenv("DB_PORT"),
    "dbname": os.getenv("DB_NAME"),
    "user": os.getenv("DB_USER"),
    "password": os.getenv("DB_PASSWORD"),
    "sslmode": os.getenv("DB_SSLMODE")
}

# Fail early with a readable message instead of building a URL that contains
# the literal text "None".
missing_settings = [
    key for key in ("host", "port", "dbname", "user") if not db_params[key]
]
if missing_settings:
    raise RuntimeError(
        "Missing PostgreSQL settings (check the environment / .env file): "
        + ", ".join(missing_settings)
    )

# User and password are percent-encoded so characters such as '@', ':' or '/'
# cannot corrupt the URL structure.
db_url = (
    f"postgresql://{quote_plus(db_params['user'])}:"
    f"{quote_plus(db_params['password'] or '')}@"
    f"{db_params['host']}:{db_params['port']}/{db_params['dbname']}"
)
engine = create_engine(db_url)


# --- S3 CONFIG (pandas + s3fs handles this internally) ---
bucket = "finriskai"
prefix = "datasets/"
files = {
    "applications": f"s3://{bucket}/{prefix}credit_applications.csv",
    "bureau": f"s3://{bucket}/{prefix}credit_bureau_data.csv",
    "profiles": f"s3://{bucket}/{prefix}customer_profiles.csv",
    "predictions": f"s3://{bucket}/{prefix}model_predictions.csv",
    "transactions": f"s3://{bucket}/{prefix}transaction_data.csv",
}

# anon=False is passed through to the S3 filesystem layer so the reads are
# made with credentials rather than anonymously.
s3_options = {"anon": False}

# READ DATA FROM S3 DIRECTLY INTO DATAFRAMES
applications = pd.read_csv(files["applications"], storage_options=s3_options)
bureau = pd.read_csv(files["bureau"], storage_options=s3_options)
profiles = pd.read_csv(files["profiles"], storage_options=s3_options)
predictions = pd.read_csv(files["predictions"], storage_options=s3_options)
transactions = pd.read_csv(files["transactions"], storage_options=s3_options)

All datasets loaded from S3 and inserted into PostgreSQL successfully.


In [None]:
# Create the star-schema tables (if absent) and bulk-load the DataFrames.
#
# Everything runs on the single connection `conn` inside one transaction:
# engine.begin() commits when the block exits normally and rolls back if any
# statement raises, so a failed load does not leave the tables half-filled.
with engine.begin() as conn:
    # Dimension: Customer Profiles
    conn.execute(text("""
    CREATE TABLE IF NOT EXISTS dim_customer_profiles (
        customer_id VARCHAR PRIMARY KEY,
        customer_age INT,
        annual_income NUMERIC(15,2),
        employment_status VARCHAR,
        account_tenure INT,
        product_holdings INT,
        relationship_value NUMERIC(15,2),
        risk_segment VARCHAR,
        behavioral_score NUMERIC(10,2),
        credit_score INT,
        city VARCHAR,
        last_activity_date DATE
    );
    """))

    # Dimension: Bureau Data
    conn.execute(text("""
    CREATE TABLE IF NOT EXISTS dim_bureau (
        customer_id VARCHAR PRIMARY KEY,
        credit_score INT,
        credit_history_length INT,
        number_of_accounts INT,
        total_credit_limit NUMERIC(15,2),
        credit_utilization NUMERIC(6,3),
        payment_history NUMERIC(6,3),
        public_records INT
    );
    """))

    # Fact: Applications
    conn.execute(text("""
    CREATE TABLE IF NOT EXISTS fact_applications (
        application_id VARCHAR PRIMARY KEY,
        customer_id VARCHAR REFERENCES dim_customer_profiles(customer_id),
        application_date DATE,
        loan_amount NUMERIC(15,2),
        loan_purpose VARCHAR,
        employment_status VARCHAR,
        annual_income NUMERIC(15,2),
        debt_to_income_ratio NUMERIC(6,3),
        credit_score INT,
        application_status VARCHAR,
        default_flag INT
    );
    """))

    # Fact: Predictions
    conn.execute(text("""
    CREATE TABLE IF NOT EXISTS fact_predictions (
        prediction_id VARCHAR PRIMARY KEY,
        model_version VARCHAR,
        customer_id VARCHAR REFERENCES dim_customer_profiles(customer_id),
        prediction_date DATE,
        prediction_type VARCHAR,
        risk_score NUMERIC(10,2),
        fraud_probability NUMERIC(6,3),
        model_features JSONB,
        prediction_explanation TEXT,
        business_decision VARCHAR,
        actual_outcome VARCHAR
    );
    """))

    # Fact: Transactions
    conn.execute(text("""
    CREATE TABLE IF NOT EXISTS fact_transactions (
        transaction_id VARCHAR PRIMARY KEY,
        customer_id VARCHAR REFERENCES dim_customer_profiles(customer_id),
        transaction_date TIMESTAMP,
        amount NUMERIC(15,2),
        merchant_category VARCHAR,
        transaction_type VARCHAR,
        location VARCHAR,
        device_info VARCHAR,
        fraud_flag INT,
        investigation_status VARCHAR
    );
    """))


    # --- UPLOAD DATA TO POSTGRES ---

    # con=conn (not engine): passing the engine would open a second connection
    # that cannot see tables created in this still-uncommitted transaction.
    #
    # Dimensions are loaded first because every fact table declares
    # customer_id REFERENCES dim_customer_profiles(customer_id); inserting
    # facts before their customers would violate that foreign key.
    # NOTE(review): rows are appended, so re-running this cell against tables
    # that already hold the same keys will hit the PRIMARY KEY constraints.
    profiles.to_sql("dim_customer_profiles", con=conn, if_exists="append", index=False)
    bureau.to_sql("dim_bureau", con=conn, if_exists="append", index=False)
    applications.to_sql("fact_applications", con=conn, if_exists="append", index=False)
    predictions.to_sql("fact_predictions", con=conn, if_exists="append", index=False)
    transactions.to_sql("fact_transactions", con=conn, if_exists="append", index=False)

# Reported only after the with-block has exited, i.e. after the commit.
print("All datasets loaded from S3 and inserted into PostgreSQL successfully.")