# Initiate Connection

In [None]:
from snowflake.snowpark import Session

# Obtain the Snowpark session through the builder's getOrCreate()
# (inside a Snowflake notebook this needs no explicit credentials).
session = Session.builder.getOrCreate()

# Switch the session to the working database first, then to the schema.
for use_statement in (
    "USE DATABASE churn_modeling_db",
    "USE SCHEMA churn_modeling_schema",
):
    session.sql(use_statement).collect()

# Handle on the customer table that was created earlier.
df = session.table("customer_data")

# Print the first 10 rows as a quick sanity check.
df.show(10)


# Exploratory data analysis

In [None]:
from snowflake.snowpark import Session
import snowflake.snowpark.functions as F
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" theme to all plots drawn afterwards.
# set_theme() is the preferred interface; sns.set() is only an alias for it
# that the seaborn documentation says may be removed in a future release.
sns.set_theme(style="whitegrid")

# NOTE: `df` must already be bound to the Snowpark table handle
# (session.table("customer_data")) created in the connection step.

# --- Basic EDA ---

# Column names and types of the table
print("=== Schema ===")
df.print_schema()

# Number of rows in the table
print("\n=== Total rows ===")
total_rows = df.count()
print(total_rows)

# Preview of the data
print("\n=== First 10 rows ===")
df.show(10)

# Summary statistics, restricted to the numeric columns
print("\n=== Summary statistics (numeric) ===")
numeric_cols = [
    "CreditScore",
    "Age",
    "Grade",
    "AccountBalance",
    "ProductCount",
    "SalaryEstimated",
]
numeric_summary = df.select(numeric_cols).describe()
numeric_summary.show()

# Missing values: one aggregate per column. when() produces a value only for
# rows where the column is NULL, so count() tallies exactly those rows.
print("\n=== Missing values per column ===")
null_count_exprs = [
    F.count(F.when(F.col(name).is_null(), name)).alias(name)
    for name in df.columns
]
null_counts = df.select(null_count_exprs)
null_counts.show()

# Frequency table for each categorical column, most frequent value first
categorical_cols = ["Geography", "Gender", "OwnsCreditCard", "IsActive", "Churned"]
for cat_col in categorical_cols:
    print(f"\n=== Value counts for {cat_col} ===")
    value_counts = df.group_by(cat_col).count()
    value_counts.order_by(F.desc("count")).show()

# --- Convert to Pandas for plotting ---
# Pull the Snowpark DataFrame into a local pandas DataFrame; the plots below
# address its columns by their uppercase Snowflake names.
df_pd = df.to_pandas()


def _draw_figure(plot_fn, title, *plot_args, figsize=(8, 5), **plot_kwargs):
    """Open a new figure, draw one seaborn plot on it, title it and show it.

    Args:
        plot_fn: Seaborn plotting function to call (e.g. ``sns.histplot``).
        title: Text passed to ``plt.title``.
        *plot_args: Positional arguments forwarded unchanged to ``plot_fn``.
        figsize: ``(width, height)`` forwarded to ``plt.figure``.
        **plot_kwargs: Keyword arguments forwarded unchanged to ``plot_fn``.
    """
    plt.figure(figsize=figsize)
    plot_fn(*plot_args, **plot_kwargs)
    plt.title(title)
    plt.show()


# --- Plots (drawn in this order) ---

# Distributions of single numeric columns
_draw_figure(sns.histplot, "Age Distribution", df_pd["AGE"], bins=30, kde=True)
_draw_figure(
    sns.histplot,
    "Account Balance Distribution",
    df_pd["ACCOUNTBALANCE"],
    bins=30,
    kde=True,
)

# Churn split by categorical attributes
_draw_figure(
    sns.countplot,
    "Churn by Geography",
    x="GEOGRAPHY",
    hue="CHURNED",
    data=df_pd,
)
_draw_figure(
    sns.countplot,
    "Churn by Gender",
    figsize=(6, 4),
    x="GENDER",
    hue="CHURNED",
    data=df_pd,
)

# Numeric columns compared across churn status
_draw_figure(
    sns.boxplot,
    "Credit Score by Churn Status",
    x="CHURNED",
    y="CREDITSCORE",
    data=df_pd,
)
_draw_figure(
    sns.boxplot,
    "Account Balance by Churn Status",
    x="CHURNED",
    y="ACCOUNTBALANCE",
    data=df_pd,
)
_draw_figure(
    sns.boxplot,
    "Grade by Churn Status",
    x="CHURNED",
    y="GRADE",
    data=df_pd,
)

# Product count split by churn status
_draw_figure(
    sns.countplot,
    "Number of Products by Churn",
    x="PRODUCTCOUNT",
    hue="CHURNED",
    data=df_pd,
)

# OwnsCreditCard
