In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
import warnings




In [18]:
data_df= pd.read_csv("../raw_data/churn_df_underbalanced.csv")

## Import data

In [19]:
data_df = data_df.sample(frac=1, random_state=42)

In [20]:
data_df = data_df.drop(['Unnamed: 0','msno', 'bd', 'payment_method_id', 'city', 'registered_via'], axis=1)

## Which scaler to use for which feature

In [21]:
robust_features = [               'remaining_plan_duration', 
          'usage_from_ltd',
              'payment_plan_days',
              'plan_list_price',
              'actual_amount_paid',
             'num_50',
              'num_75',
              'num_985',
              
              'expire_year',
              'last_transaction_year'
             ]
minmax_features = ['registration_year']
normal_features = [ 'num_25',
              
              'num_100',
              'num_unq',
              'total_secs',]

## Pipeline

In [22]:
# Build the pipeline with the different steps
robust_pipeline = make_pipeline(RobustScaler())
minmax_pipeline = make_pipeline(MinMaxScaler())
log_pipeline = make_pipeline( 
    FunctionTransformer(np.log1p, validate=True),
    StandardScaler())

preprocessor_new = ColumnTransformer(
    transformers=[
        ('Robust', robust_pipeline, robust_features),
        ('MinMax', minmax_pipeline, minmax_features),
        ('Log', log_pipeline, normal_features)
    ], remainder='passthrough'
)
    


In [23]:
preprocessor_new

In [24]:
#create X and y
X = data_df.drop(['is_churn'], axis=1)
y= data_df['is_churn']

In [25]:
# Use fit on X_train and then transform both datasets
preprocessor_new.fit(X)
X_train_transformed = preprocessor_new.transform(X)

In [26]:
# Convert transformed data back to DataFrame to ensure column order is maintained
X_train_transformed = pd.DataFrame(
    X_train_transformed, 
    columns=X.columns
)

## Logistic Regresssion model

In [27]:
model = LogisticRegression(C= 2.58069,
                           solver='liblinear',
 max_iter= 2,
penalty= 'l1',
 class_weight= 'balanced')

model_no_hyperparams = LogisticRegression()

cv_results = cross_validate(model, X_train_transformed, y, cv=5, scoring='precision')

precision = cv_results['test_score'].mean()

warnings.filterwarnings('ignore')


In [28]:
cv_results

{'fit_time': array([0.05884385, 0.03540301, 0.05247521, 0.04340196, 0.03587604]),
 'score_time': array([0.00842595, 0.00480223, 0.01079893, 0.00376892, 0.0036819 ]),
 'test_score': array([0.95571679, 0.97522961, 0.95062924, 0.97351367, 0.963241  ])}

In [29]:
precision

0.9636660612373207