In [18]:
!pip install snowflake-snowpark-python



In [25]:
import os
from getpass import getpass

import numpy as np
import pandas as pd
import statsmodels.api as sm
from snowflake.snowpark import Session
from snowflake.snowpark import functions as F
from snowflake.snowpark.exceptions import SnowparkSQLException
from snowflake.snowpark.functions import year, month, to_date, lit
from snowflake.snowpark.functions import col, avg, stddev_pop, corr, sum as sp_sum


In [20]:
# Snowflake connection settings.
# Credentials are read from environment variables so that no secret is stored in
# the notebook; when SNOWFLAKE_PASSWORD is unset the password is prompted for.
# The non-secret settings fall back to the values this notebook has always used.
# NOTE(review): the password that used to be hard-coded in this cell should be
# treated as compromised and rotated.
parameters = {
    "account": os.environ.get("SNOWFLAKE_ACCOUNT", "YEZEPEO-DATASCIENCE"),
    "user": os.environ.get("SNOWFLAKE_USER", "VNSHARED"),
    "password": os.environ.get("SNOWFLAKE_PASSWORD") or getpass("Snowflake password: "),
    "role": os.environ.get("SNOWFLAKE_ROLE", "SYSADMIN"),  # optional
    "warehouse": os.environ.get("SNOWFLAKE_WAREHOUSE", "ANALYSIS"),  # optional
    "database": os.environ.get("SNOWFLAKE_DATABASE", "MEIXI_DATA"),  # optional
    "schema": os.environ.get("SNOWFLAKE_SCHEMA", "PUBLIC"),  # optional
}

# Open the Snowpark session used by every later cell.
session = Session.builder.configs(parameters).create()

In [21]:
def _strip_quotes(frame):
    """Re-select every column of `frame`, aliased to its name with double quotes removed."""
    renamed = [col(name).alias(name.replace('"', '')) for name in frame.columns]
    return frame.select(renamed)


# First table: CONSOLIDATED_COPY
df_v = _strip_quotes(session.sql('SELECT * FROM CONSOLIDATED_COPY'))

# Second table: EQUIFAX_COPY
df_eq = _strip_quotes(session.sql('SELECT * FROM EQUIFAX_COPY'))

In [22]:
# Join the two frames on rows whose "RUT" values match,
# then preview the result.
same_rut = df_v["RUT"] == df_eq["RUT"]
df_joined = df_v.join(df_eq, on=same_rut)
df_joined.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Logistic Regression: Default vs. Equifax

In [23]:
# Flag defaults: 1 when STATUSCOMPENSACION is "Impaga" or "Castigado",
# 0 for any other non-null status.
# The whole OR expression is parenthesised before casting: method calls bind
# tighter than `|`, so without the outer parentheses .cast("int") would apply
# only to the "Castigado" comparison instead of to the combined flag.
is_default = (
    (col("STATUSCOMPENSACION") == "Impaga")
    | (col("STATUSCOMPENSACION") == "Castigado")
).cast("int")
df_joined = df_joined.withColumn('Is_Default', is_default)


In [27]:
# Group by 'Score Equifax' and compute, per score, the number of rows with a
# non-null flag ("total") and the number of defaults ("defaults").
# `count` is reached through the `F` alias because it is not imported by name
# in this notebook; calling a bare count(...) raises NameError on a fresh kernel.
grouped = df_joined.groupBy("Score Equifax").agg(
    F.count(col("Is_Default")).alias("total"),
    # The cast is kept so the sum also works when Is_Default is a boolean column.
    sp_sum(col("Is_Default").cast("int")).alias("defaults"),
)

# Default rate per score; the group size is carried along as the regression weight.
grouped = grouped.withColumn("default_rate", col("defaults") / col("total"))
grouped = grouped.withColumn("weight", col("total"))

# Snowpark DataFrames are lazy: converting to pandas is the action that
# actually runs the query.
grouped_df = grouped.toPandas()

# Check the DataFrame
print(grouped_df.head())

   Score Equifax   TOTAL  DEFAULTS  DEFAULT_RATE  WEIGHT
0          984.0  364664      6154      0.016876  364664
1          819.0  917954    120319      0.131073  917954
2          428.0  395410     64095      0.162098  395410
3          549.0  567914     85609      0.150743  567914
4          774.0  416580     25311      0.060759  416580


Weighted least-squares (linear) regression: Equifax score on default rate (group level)

In [31]:
# Report, per column, how many entries are missing and how many are infinite.
missing_per_column = grouped_df.isnull().sum()
infinite_per_column = np.isinf(grouped_df).sum()
print(missing_per_column)
print(infinite_per_column)

# Turn +/-inf into NaN, then drop every row that lacks a value in any of the
# columns the regression needs.
required_columns = ['Score Equifax', 'DEFAULT_RATE', 'WEIGHT']
without_inf = grouped_df.replace([np.inf, -np.inf], np.nan)
grouped_df = without_inf.dropna(subset=required_columns)


Score Equifax    1
TOTAL            0
DEFAULTS         0
DEFAULT_RATE     0
WEIGHT           0
dtype: int64
Score Equifax    0
TOTAL            0
DEFAULTS         0
DEFAULT_RATE     0
WEIGHT           0
dtype: int64


In [32]:
# Prepare data for regression
X = grouped_df[['Score Equifax']]  # Predictor
X = sm.add_constant(X)  # Adds an intercept column to the predictor matrix
y = grouped_df['DEFAULT_RATE']  # Response variable: default rate per score group
weights = grouped_df['WEIGHT']  # Observation weights: number of rows in each group

# Fit a weighted least-squares model. This is a linear regression of the group
# default rate on the score, not a logistic regression.
# Errors are deliberately not caught: a failed fit should stop the notebook
# instead of printing a message and leaving `result` undefined or stale.
model = sm.WLS(y, X, weights=weights)
result = model.fit()

# Print the summary of the regression model
print(result.summary())


                            WLS Regression Results                            
Dep. Variable:           DEFAULT_RATE   R-squared:                       0.748
Model:                            WLS   Adj. R-squared:                  0.747
Method:                 Least Squares   F-statistic:                     2671.
Date:                Fri, 03 May 2024   Prob (F-statistic):          9.80e-272
Time:                        10:06:02   Log-Likelihood:                 1457.6
No. Observations:                 903   AIC:                            -2911.
Df Residuals:                     901   BIC:                            -2902.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const             0.2543      0.003     96.031

In [None]:
# 1-point increase in Equifax score --> 0.02 percentage-point decrease in the default rate
# 50-point increase in Equifax score --> 1 percentage-point decrease in the default rate

# Problem: residuals are not normally distributed: high Jarque-Bera (JB) and Omnibus statistics with p-values near 0, plus high skew and kurtosis.


In [33]:
# Close the Snowpark session. The parentheses are required: a bare
# `session.close` only references the method and leaves the session open.
session.close()