## Notebook Index
1. [Feature Store](https://app.snowflake.com/sfpscogs/rpegu_aiml/#/notebooks/ML_MODELS.DS.%2201_FeatureStore_Creation%22)  
2. [Feature Reduction](https://app.snowflake.com/sfpscogs/rpegu_aiml/#/notebooks/ML_MODELS.DS.%2202_Feature_Reduction%22)  
3. [Model Training](https://app.snowflake.com/sfpscogs/rpegu_aiml/#/notebooks/ML_MODELS.DS.%2203_Model_Training%22)  
4. [Model Inference & Scheduling](https://app.snowflake.com/sfpscogs/rpegu_aiml/#/notebooks/ML_MODELS.DS.%2204_Batch_Inferencing%22)



### What this notebook does
This notebook generates synthetic data for a demo use case. The dataset includes a member_id, a mix of numerical and categorical features, and a binary target variable.

Starting from Scikit-Learn’s `make_classification`, the notebook builds:
* 150 base features generated by `make_classification`
* 200 low-variance features
* 150 highly correlated features (derived from the 150 base features)
* 5 categorical columns
* Missing values in selected columns
* A binary target column

In [None]:
# Import python packages
import pandas as pd

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
# Grab the Snowpark session that is already active for this notebook;
# the table writes at the end of the notebook go through it.
session = get_active_session()
# Show which database, schema and warehouse the session currently points at,
# i.e. where the demo tables will be created.
print(f' Database Name :{session.get_current_database()}')
print(f' Schema Name :{session.get_current_schema()}')
print(f' warehouse Name :{session.get_current_warehouse()}')


In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

# Parameters
n_samples = 100000
base_features = 150
low_variance_features = 200
correlated_features = 150
total_numeric_features = base_features + low_variance_features + correlated_features

# A single seeded generator drives every random draw in this block, so the
# synthetic dataset is identical on every run (make_classification is already
# pinned with random_state=42; the extra columns previously were not).
rng = np.random.default_rng(42)

# Step 1: Generate base numerical features and target
X, y = make_classification(
    n_samples=n_samples,
    n_features=base_features,
    n_informative=60,
    n_redundant=60,
    n_repeated=0,
    n_classes=2,
    random_state=42,
    shuffle=False
)

# Base features followed by the target, in that column order
base_df = pd.DataFrame(X, columns=[f'FEATURE_{i}' for i in range(base_features)])
base_df['TARGET'] = y

# Each feature family below is built as one 2-D array and concatenated once at
# the end, instead of inserting hundreds of columns one by one (which
# fragments the DataFrame and is slow).

# Step 2: Add 200 Low-Variance Features
# Sparse 0/1 indicators that are 1 only about 2% of the time ...
low_var_values = rng.choice(
    [0, 1], size=(n_samples, low_variance_features), p=[0.98, 0.02]
)
# ... except the first one, which is a fully constant column.
# (Slice instead of index so this is a no-op when there are zero columns.)
low_var_values[:, :1] = 1
low_var_df = pd.DataFrame(
    low_var_values,
    columns=[f'FEATURE_LOW_VAR_{i}' for i in range(1, low_variance_features + 1)],
)

# Step 3: Add 150 Highly Correlated Features
# Each one is a scaled copy of a base feature plus a little Gaussian noise;
# the modulo cycles through the base features if more are requested than exist.
source_positions = np.arange(correlated_features) % base_features
corr_values = X[:, source_positions] * 0.95 + rng.normal(
    0, 0.01, size=(n_samples, correlated_features)
)
corr_df = pd.DataFrame(
    corr_values,
    columns=[f'FEATURE_CORR_{i}' for i in range(1, correlated_features + 1)],
)

# Step 4: Add 5 Specific Categorical Columns with realistic values
cat_df = pd.DataFrame({
    'CAT_1': rng.choice(['Male', 'Female'], size=n_samples),
    'CAT_2': rng.choice(['online', 'retail'], size=n_samples),
    'CAT_3': rng.choice(['tier_1', 'tier_2', 'tier_3'], size=n_samples),
    'CAT_4': rng.choice(['credit', 'debit'], size=n_samples),
    'CAT_5': rng.choice(['single', 'family'], size=n_samples),
})

# Assemble in the same column order as before:
# FEATURE_*, TARGET, FEATURE_LOW_VAR_*, FEATURE_CORR_*, CAT_*
df = pd.concat([base_df, low_var_df, corr_df, cat_df], axis=1)

# Step 5: Add Missing Values
def add_missing_values(df, cols, fraction=0.05, random_state=42):
    """Blank out a random share of rows in each of the given columns.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame to modify.
        cols: Names of the columns that should receive missing values.
        fraction: Share of rows (0 to 1) set to NaN in each column.
        random_state: Non-negative integer seed. Combined with each column
            name so that every column gets its own, reproducible set of
            missing rows.

    Returns:
        The same DataFrame object, with NaNs injected.
    """
    n_missing = round(fraction * len(df))
    for col in cols:
        # Seeding from (random_state, column name) keeps the result
        # reproducible while avoiding the same rows being blanked in every
        # column, which is what a single fixed seed per column produces.
        rng = np.random.default_rng([random_state, *str(col).encode("utf-8")])
        positions = rng.choice(len(df), size=n_missing, replace=False)
        missing_indices = df.index[positions]
        # NaN cannot be stored in an integer column; cast up front so the
        # assignment below does not depend on implicit upcasting.
        if pd.api.types.is_integer_dtype(df[col]):
            df[col] = df[col].astype("float64")
        df.loc[missing_indices, col] = np.nan
    return df

# Columns that receive injected NaNs, grouped by kind
numeric_missing = ['FEATURE_0', 'FEATURE_10', 'FEATURE_50', 'FEATURE_LOW_VAR_2', 'FEATURE_CORR_1']
categorical_missing = ['CAT_1', 'CAT_3', 'CAT_5']

# Numeric columns are processed first, then the categorical ones
for missing_group in (numeric_missing, categorical_missing):
    df = add_missing_values(df, missing_group)

# Step 6: Add MEMBER_ID Column (one sequential id per row: member_0, member_1, ...)
df['MEMBER_ID'] = [f'member_{i}' for i in range(len(df))]

# Step 7: Add REF_MMYY Column, each row randomly tagged '042025' or '052025'
df['REF_MMYY'] = np.random.choice(['042025', '052025'], size=n_samples)

# Final shape check: 500 numeric features + 5 categorical columns
# + TARGET + MEMBER_ID + REF_MMYY = 508 columns
print(f"Final Data Shape: {df.shape}")

# Optional: Preview the data
# print(df.head())


In [None]:
# Re-display the (rows, columns) shape of the generated dataset
print(f"Final Data Shape: {df.shape}") 

In [None]:
import numpy as np

# Collect the numeric feature columns. The base, low-variance and correlated
# families all share the 'FEATURE_' prefix, which leaves out the categorical
# columns, TARGET, MEMBER_ID and REF_MMYY.
numeric_feature_cols = [col for col in df.columns if col.startswith('FEATURE_')]

# Sanity check
assert len(numeric_feature_cols) == 500, f"Expected 500 numeric features, found {len(numeric_feature_cols)}"

# Cut the feature list into four consecutive, equally sized chunks
feature_splits = np.array_split(numeric_feature_cols, 4)

# Every frame carries its feature chunk, the two key columns, and its share
# of the categorical columns (CAT_1 and CAT_2 both go to the first frame).
key_cols = ['MEMBER_ID', 'REF_MMYY']
df1 = df[[*feature_splits[0], *key_cols, 'CAT_1', 'CAT_2']].copy()
df2 = df[[*feature_splits[1], *key_cols, 'CAT_3']].copy()
df3 = df[[*feature_splits[2], *key_cols, 'CAT_4']].copy()
df4 = df[[*feature_splits[3], *key_cols, 'CAT_5']].copy()

# Report the resulting shapes
print(f"df1 shape: {df1.shape}")
print(f"df2 shape: {df2.shape}")
print(f"df3 shape: {df3.shape}")
print(f"df4 shape: {df4.shape}")


In [None]:

# Target frame: the two key columns (MEMBER_ID, REF_MMYY) plus the TARGET label
r_cols = ['MEMBER_ID', 'REF_MMYY','TARGET']

df_main = df.loc[:, r_cols].copy()
print(f"df_main shape: {df_main.shape}")


In [None]:
def write_table(session, df, snf_tbl):
    """Save a DataFrame as a transient table, replacing any existing one.

    Args:
        session: Session object used to create the dataframe that is written.
        df: Data to upload.
        snf_tbl: Name of the destination table.
    """
    overwrite_writer = session.create_dataframe(df).write.mode("overwrite")
    overwrite_writer.save_as_table(snf_tbl, table_type="transient")


# Persist the four feature frames and the target frame, in this order
for frame, table_name in (
    (df1, 'DEMO_TBL_1'),
    (df2, 'DEMO_TBL_2'),
    (df3, 'DEMO_TBL_3'),
    (df4, 'DEMO_TBL_4'),
    (df_main, 'DEMO_TARGETS_TBL'),
):
    write_table(session, frame, table_name)