In [1]:
# Importing necessary libraries

import os
from datetime import datetime

import numpy as np
import pandas as pd
from pymongo import MongoClient
# enable_iterative_imputer must be imported before IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Extraction

In [2]:
# Importing the data
def extract_data():
    '''
    Read the raw customer records from MongoDB (the data lake) into a DataFrame.

    The connection string is taken from the ``MONGODB_URI`` environment
    variable when it is set, so real credentials do not have to live in the
    notebook. If the variable is not set, the original placeholder URI is used.

    Returns
    -------
    pandas.DataFrame
        Every document of ``customer_churn.churndb`` with the columns
        ``_id``, ``customer_id``, ``Name``, ``security_no`` and
        ``referral_id`` removed.

    Raises
    ------
    KeyError
        If any of the dropped columns is absent from the fetched data
        (this includes the case of an empty collection).
    '''
    # Placeholder fallback only; supply the real URI through MONGODB_URI.
    uri = os.environ.get(
        'MONGODB_URI',
        'mongodb+srv://username:password@mychurndb.bjfry.mongodb.net/',
    )

    # Using the client as a context manager closes it once the documents
    # have been fetched, even if the query raises.
    with MongoClient(uri) as client:
        # Select the customer_churn database
        db = client['customer_churn']

        # Select the churndb collection
        collection = db['churndb']

        # Materialise the cursor while the client is still open
        records = list(collection.find())

    df = pd.DataFrame(records)

    # These columns are removed before any transformation is applied
    df = df.drop(columns=['_id', 'customer_id', 'Name', 'security_no', 'referral_id'])

    return df
    

# Transformation

In [3]:
def transform_data(df, reference_date=None):
    '''
    Perform data transformation on the input (df).

    Steps, in order:
      1. cast columns to numeric / datetime / object dtypes,
      2. impute missing numeric features (scaled IterativeImputer),
      3. impute missing values of selected categorical features (KNNImputer
         on ordinal codes),
      4. derive ``tenure_months``, ``visit_hour``, ``login_spend_ratio`` and
         ``login_transaction_ratio``,
      5. collapse ``churn_risk_score`` into three classes (0, 1, 2),
      6. drop the raw date/time columns and rename the frequency / monetary /
         recency columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data as returned by ``extract_data``. It is not modified; the
        function works on a copy.
    reference_date : datetime.datetime, optional
        Date against which ``tenure_months`` is measured. Defaults to
        2 October 2024, the value that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        A new, transformed DataFrame.
    '''
    # Work on a copy so the caller's frame is left untouched and re-running
    # the cell on the same input gives the same result.
    df = df.copy()

    if reference_date is None:
        reference_date = datetime(2024, 10, 2)

    # Convert data to appropriate datatypes
    numerical_columns = ['age','days_since_last_login','avg_time_spent',
                         'avg_transaction_value','avg_frequency_login_days',
                         'points_in_wallet','churn_risk_score']
    # errors='coerce' turns non-numeric entries into NaN so they get imputed below
    df[numerical_columns] = df[numerical_columns].apply(pd.to_numeric, errors='coerce')
    df['last_visit_time'] = pd.to_datetime(df['last_visit_time'], format='%H:%M:%S')
    categorical_columns = ['gender','region_category','membership_category','joined_through_referral',
                           'preferred_offer_types','medium_of_operation','internet_option','used_special_discount',
                           'offer_application_preference','past_complaint','complaint_status','feedback']
    df[categorical_columns] = df[categorical_columns].astype('object')
    df['joining_date'] = pd.to_datetime(df['joining_date'])

    # Impute missing values of the numeric features (the target is excluded).
    # Features are standardised before imputation and mapped back afterwards.
    target_column = 'churn_risk_score'
    numeric_columns = df.select_dtypes(include='number').columns.drop(target_column)
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df[numeric_columns])
    iterative_imputer = IterativeImputer(random_state=42)
    imputed_values = iterative_imputer.fit_transform(scaled_values)
    df[numeric_columns] = scaler.inverse_transform(imputed_values)

    # KNN Imputer for categorical columns
    df['gender'] = df['gender'].replace('Unknown', np.nan)
    knn_columns = ['gender','region_category','joined_through_referral',
                   'medium_of_operation','preferred_offer_types']

    ## Handling ''
    df[knn_columns] = df[knn_columns].replace('', np.nan)

    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    encoded_values = encoder.fit_transform(df[knn_columns])
    imputer = KNNImputer(n_neighbors=5, metric='nan_euclidean', weights='distance')
    imputed_codes = imputer.fit_transform(encoded_values)
    # The imputer returns distance-weighted averages of neighbouring codes,
    # which are generally fractional. Round to the nearest code explicitly;
    # otherwise the integer cast inside inverse_transform truncates them
    # (e.g. 1.9 would map to category 1 instead of 2).
    imputed_codes = np.rint(imputed_codes)
    df[knn_columns] = encoder.inverse_transform(imputed_codes)
    df[knn_columns] = df[knn_columns].astype('object')

    # Feature Engineering
    # Tenure counts whole calendar months between joining and the reference date
    df['tenure_months'] = ((reference_date.year - df['joining_date'].dt.year) * 12 +
                           (reference_date.month - df['joining_date'].dt.month)).astype('int64')
    df['visit_hour'] = df['last_visit_time'].dt.hour.astype('int64')
    df['login_spend_ratio'] = df['avg_time_spent'] / df['avg_frequency_login_days']
    df['login_transaction_ratio'] = df['avg_frequency_login_days'] / df['avg_transaction_value']
    # A zero denominator yields +/-inf; store those as missing instead
    ratio_columns = ['login_spend_ratio', 'login_transaction_ratio']
    df[ratio_columns] = df[ratio_columns].replace([np.inf, -np.inf], np.nan)

    # Target column class distribution: scores -1, 1, 2 -> 0; 3 -> 1; 4, 5 -> 2.
    # Any score not listed here (including NaN) maps to NaN.
    mapping = {
        -1: 0,
        1: 0,
        2: 0,
        3: 1,
        4: 2,
        5: 2
    }
    df['churn_risk_score'] = df['churn_risk_score'].map(mapping)

    # The raw date/time columns are no longer needed once the features exist
    df = df.drop(columns=['joining_date', 'last_visit_time'])

    # Rename columns
    rename_mapping = {
        'avg_frequency_login_days': 'frequency',
        'avg_transaction_value': 'monetary',
        'days_since_last_login': 'recency'
    }
    df = df.rename(columns=rename_mapping)

    return df

# Load

In [6]:
def load_data_to_mongo(df):
    '''
    Save the transformed DataFrame to the ``transformed_churndb`` collection
    of the ``customer_churn`` MongoDB database.

    The connection string is taken from the ``MONGODB_URI`` environment
    variable when it is set, so real credentials do not have to live in the
    notebook. If the variable is not set, the original placeholder URI is used.

    Rows are written with ``insert_many``, i.e. inserted rather than upserted,
    so calling this function again with the same data inserts the rows again.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to store; each row becomes one document.
    '''
    # Convert DataFrame to a list of per-row dictionaries
    data_dict = df.to_dict('records')

    # insert_many rejects an empty list, so handle an empty frame explicitly
    # (and skip opening a connection that would not be used).
    if not data_dict:
        print("No rows to save; MongoDB collection 'transformed_churndb' left unchanged")
        return

    # Placeholder fallback only; supply the real URI through MONGODB_URI.
    uri = os.environ.get(
        'MONGODB_URI',
        'mongodb+srv://username:password@mychurndb.bjfry.mongodb.net/',
    )

    # Using the client as a context manager closes it after the insert,
    # even if the insert raises.
    with MongoClient(uri) as client:
        # Select the customer_churn database
        db = client['customer_churn']

        # New collection
        collection = db['transformed_churndb']

        collection.insert_many(data_dict)

    print("Data saved to MongoDB collection 'transformed_churndb'")


# Main

In [7]:
if __name__=='__main__':
    # Run the pipeline end to end: extract -> transform -> load.
    # `df` is rebound at each stage, so once this cell finishes it refers to
    # the transformed frame rather than the raw extract.
    df = extract_data()
    df = transform_data(df)
    # Writes the transformed rows to MongoDB; the function returns nothing.
    load_data_to_mongo(df)

Data saved to MongoDB collection 'transformed_churndb'
