In [None]:
-- Pin the session context (warehouse, database, schema) so the unqualified
-- table names used in later cells resolve against CHURN_DB.PUBLIC.
USE WAREHOUSE CHURN_PRED;
USE DATABASE CHURN_DB;
USE SCHEMA PUBLIC;


In [None]:
-- Return every row and column of the raw CHURN_DATA table.
-- NOTE(review): presumably this is the cell whose result a later Python cell
-- reads as `cell5` -- confirm the cell name in the notebook UI.
SELECT * FROM CHURN_DATA;

In [None]:
-- Show which Snowflake account this session is connected to.
SELECT CURRENT_ACCOUNT();

In [None]:
import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import LabelEncoder, StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [None]:
# `cell5` is not defined in any visible Python cell -- presumably it is the
# notebook's handle for a SQL cell's result (likely the CHURN_DATA query);
# TODO confirm. `to_df()` turns that result into a DataFrame object.
df = cell5.to_df()
# Print the first 5 rows as a quick sanity check.
df.show(5)

In [None]:
# Summary statistics for the loaded data.
df.describe()

In [None]:
# Pull the data into a local pandas DataFrame; all cleaning below runs on `df2`.
df2 = df.to_pandas()

In [None]:
# Column dtypes and non-null counts straight after the pandas conversion.
df2.info()

In [None]:
# Number of missing values per column.
df2.isnull().sum()

In [None]:
# Remove the CUSTOMERID column from the working frame.
# Re-running this cell on its own raises KeyError because the column is
# already gone after the first run.
df2 = df2.drop(columns='CUSTOMERID')

In [None]:
# Show the rows whose TOTALCHARGES is missing.
# `Series.isna()` is used instead of `np.isnan` because `np.isnan` raises
# TypeError when the column has object dtype, while `isna()` handles any dtype.
df2[df2['TOTALCHARGES'].isna()]

In [None]:
# Impute missing TOTALCHARGES with the column mean.
# The previous form called `df2.fillna(...)` without assigning the result, so
# `df2` was left unchanged; it also targeted every column rather than only
# TOTALCHARGES. The fill is now limited to that column and stored back.
totalcharges_mean = df2["TOTALCHARGES"].mean()
df2["TOTALCHARGES"] = df2["TOTALCHARGES"].fillna(totalcharges_mean)

# Cell output: remaining missing values in TOTALCHARGES (expected to be 0).
df2["TOTALCHARGES"].isna().sum()

In [None]:
# Index labels of the rows where TENURE equals 0.
df2[df2['TENURE'] == 0].index

In [None]:
# Drop every row whose TENURE is 0, selecting the rows by their index labels.
zero_tenure_labels = df2.index[df2['TENURE'] == 0]
df2 = df2.drop(index=zero_tenure_labels)

# Cell output: labels still matching TENURE == 0 (empty once the drop worked).
df2.index[df2['TENURE'] == 0]

In [None]:
# Re-check missing values per column after the zero-tenure rows were removed.
df2.isnull().sum()

In [None]:
# Dtypes and non-null counts after the row drop.
df2.info()

In [None]:
# First rows of the cleaned frame, before any encoding.
df2.head()

In [None]:
# Same dtype / non-null check as the previous info() cell; nothing has changed
# in between, so this output repeats it.
df2.info()

In [None]:
# Label-encode the categorical columns and the CHURN target in place on df2.
# Each column gets its own freshly fitted LabelEncoder; the encoders are not
# kept, so the integer codes cannot be mapped back to labels later.
CATEGORICAL_COLS = [
    'GENDER',
    'PARTNER',
    'DEPENDENTS',
    'PHONESERVICE',
    'MULTIPLELINES',
    'INTERNETSERVICE',
    'ONLINESECURITY',
    'ONLINEBACKUP',
    'DEVICEPROTECTION',
    'PAYMENTMETHOD',
    'TECHSUPPORT',
    'STREAMINGTV',
    'STREAMINGMOVIES',
    'CONTRACT',
    'PAPERLESSBILLING',
    'CHURN',
]

for column_name in CATEGORICAL_COLS:
    df2[column_name] = LabelEncoder().fit_transform(df2[column_name])

In [None]:
# Confirm the encoded columns now hold integer codes.
df2.head()

In [None]:
# Columns that get standardised below.
NUMERIC_COLS = ['TENURE', 'MONTHLYCHARGES', 'TOTALCHARGES']

# Coerce TOTALCHARGES to numeric (unparseable values become NaN), then drop
# every row that still has a missing value in any column.
df2 = df2.assign(
    TOTALCHARGES=pd.to_numeric(df2['TOTALCHARGES'], errors='coerce')
).dropna()

# Standardise the numeric columns.
# NOTE(review): the scaler is fit on the whole dataset here, before the
# train/test split performed later in the notebook.
scaler = StandardScaler()
df2[NUMERIC_COLS] = scaler.fit_transform(df2[NUMERIC_COLS])


In [None]:
# First rows after scaling TENURE, MONTHLYCHARGES and TOTALCHARGES.
df2.head()

In [None]:
# Display the final feature frame (rendered by the notebook as the cell output).
df2

In [None]:
-- Target table for the engineered features.
-- Column names follow the pandas frame that is written here later
-- (MONTHLYCHARGES / TOTALCHARGES, no underscores); the previous DDL used
-- MONTHLY_CHARGES / TOTAL_CHARGES, which did not match the DataFrame columns.
-- NOTE(review): SENIORCITIZEN is inferred from the same naming pattern and
-- should be confirmed against CHURN_DATA. The later Snowpark write uses
-- mode("overwrite"), which replaces this table definition in any case.
CREATE OR REPLACE TABLE CHURN_FEATURES (
    GENDER INT,
    SENIORCITIZEN INT,
    PARTNER INT,
    DEPENDENTS INT,
    TENURE FLOAT,
    PHONESERVICE INT,
    MULTIPLELINES INT,
    INTERNETSERVICE INT,
    ONLINESECURITY INT,
    ONLINEBACKUP INT,
    DEVICEPROTECTION INT,
    TECHSUPPORT INT,
    STREAMINGTV INT,
    STREAMINGMOVIES INT,
    CONTRACT INT,
    PAPERLESSBILLING INT,
    PAYMENTMETHOD INT,
    MONTHLYCHARGES FLOAT,
    TOTALCHARGES FLOAT,
    CHURN INT
);

In [None]:
# NOTE(review): `Session` is imported but not referenced in the visible cells.
from snowflake.snowpark import Session
from snowflake.snowpark.context import get_active_session

# Obtain the active Snowpark session; `session` is reused by later cells.
session = get_active_session()

# Build a Snowpark DataFrame from the cleaned pandas frame.
df_snowpark = session.create_dataframe(df2)

In [None]:
# Persist the features as table CHURN_FEATURES; "overwrite" mode replaces any
# existing table of that name.
df_snowpark.write.mode("overwrite").save_as_table("CHURN_FEATURES")

In [None]:
-- Spot-check the first 10 rows of the freshly written feature table.
SELECT * FROM CHURN_FEATURES LIMIT 10;


In [None]:
# Reference the CHURN_FEATURES table through the session; modelling cells
# below read from `features_df`.
features_df = session.table("CHURN_FEATURES")
features_df

In [None]:
# Inspect the column names and types of the stored table.
features_df.schema

In [None]:
from snowflake.ml.modeling.xgboost import XGBClassifier
from snowflake.ml.modeling.metrics import roc_auc_score

In [None]:

# Label column, and every other column of the feature table as model input.
TARGET_COL = "CHURN"
FEATURE_COLS = [name for name in features_df.columns if name != TARGET_COL]


In [None]:
# Split with weights 0.8 / 0.2 into train and test sets, using a fixed seed.
train_df, test_df = features_df.random_split([0.8, 0.2], seed=42)

### XGBoost

In [None]:
# Booster hyper-parameters, kept together so they are easy to tune.
xgb_params = {
    "n_estimators": 100,
    "max_depth": 3,
    "learning_rate": 0.1,
}

# Classifier reading FEATURE_COLS, learning TARGET_COL, and writing its
# predicted label to a column named PREDICTION.
model = XGBClassifier(
    input_cols=FEATURE_COLS,
    label_cols=TARGET_COL,
    output_cols="PREDICTION",
    **xgb_params,
)

# Train on the training split.
model.fit(train_df)


In [None]:
# Score the held-out split; the model was configured with
# output_cols="PREDICTION", which a later cell reads from this result.
predictions = model.predict(test_df)

In [None]:
# ROC AUC must be computed from a continuous score. The previous version passed
# the hard 0/1 labels in PREDICTION, which reduces the ROC curve to a single
# operating point. Class probabilities are used instead.
proba_df = model.predict_proba(test_df)

# Find the positive-class (label 1) probability column. Quoting and case are
# normalised for matching only; the original column identifier is what gets
# passed on to the metric.
positive_proba_cols = [
    name
    for name in proba_df.columns
    if name.strip('"').upper() in {"PREDICT_PROBA_1", "PREDICT_PROBA_1.0"}
]
if not positive_proba_cols:
    raise ValueError(
        f"No positive-class probability column found in {proba_df.columns}"
    )

auc_score = roc_auc_score(
    df=proba_df,
    y_true_col_names=TARGET_COL,
    y_score_col_names=positive_proba_cols[0],
)

In [None]:
# Report the ROC AUC on the test split, rounded to four decimal places.
print("\nModel Evaluation:")
print(f"ROC AUC Score: {auc_score:.4f}")

### RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Fixed seed so the split and the forest give the same result on every run
# (same value as the seed used for the Snowpark random_split).
RF_SEED = 42

# Pull the feature table into pandas for local scikit-learn training.
local_df = features_df.to_pandas()
X = local_df[FEATURE_COLS]
y = local_df[TARGET_COL]

# 80/20 hold-out split, stratified on the label so both parts keep the same
# class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RF_SEED, stratify=y
)
rf_model = RandomForestClassifier(random_state=RF_SEED).fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out rows.
y_pred_rf = rf_model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
# Print the text classification report comparing y_test with the random-forest
# predictions.
print(classification_report(y_test, y_pred_rf))