In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
import warnings




In [2]:
data_df= pd.read_csv("../notebooks/.test_transformed_df1.csv")

## Import data

In [3]:
data_df = data_df.sample(frac=1, random_state=42)

In [4]:
data_df = data_df.drop(['Unnamed: 0','msno', 'bd', 'payment_method_id', 'city', 'registered_via'], axis=1)

## Which scaler to use for which feature

In [5]:
robust_features = [               'remaining_plan_duration', 
          'usage_from_ltd',
              'payment_plan_days',
              'plan_list_price',
              'actual_amount_paid',
             'num_50',
              'num_75',
              'num_985',
              
              'expire_year',
              'last_transaction_year'
             ]
minmax_features = ['registration_year']
normal_features = [ 'num_25',
              
              'num_100',
              'num_unq',
              'total_secs',]

## Pipeline

In [6]:
# Build the pipeline with the different steps
robust_pipeline = make_pipeline(RobustScaler())
minmax_pipeline = make_pipeline(MinMaxScaler())
log_pipeline = make_pipeline( 
    FunctionTransformer(np.log1p, validate=True),
    StandardScaler())

preprocessor_new = ColumnTransformer(
    transformers=[
        ('Robust', robust_pipeline, robust_features),
        ('MinMax', minmax_pipeline, minmax_features),
        ('Log', log_pipeline, normal_features)
    ], remainder='passthrough'
)
    


In [7]:
preprocessor_new

In [8]:
#create X and y
X = data_df.drop(['is_churn'], axis=1)
y= data_df['is_churn']

In [10]:
X

Unnamed: 0,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,is_cancel,remaining_plan_duration,is_discount,num_25,num_50,num_75,...,expire_year,expire_month_sin,expire_month_cos,expire_day_sin,expire_day_cos,registration_year,registration_month_sin,registration_month_cos,registration_day_sin,registration_day_cos
39371,7.0,0.0,0.0,0.0,0.0,451,0,69.0,16.0,10.0,...,2018,0.500000,-8.660254e-01,-0.299363,-0.954139,2016,5.000000e-01,8.660254e-01,0.571268,0.820763
20602,30.0,149.0,149.0,1.0,0.0,30,0,6.0,2.0,4.0,...,2017,0.866025,-5.000000e-01,-0.998717,-0.050649,2006,5.000000e-01,-8.660254e-01,0.998717,-0.050649
29020,30.0,180.0,180.0,0.0,0.0,32,0,19.0,5.0,5.0,...,2017,0.866025,-5.000000e-01,-0.485302,-0.874347,2016,8.660254e-01,5.000000e-01,0.101168,-0.994869
13871,30.0,149.0,149.0,1.0,0.0,51,0,1.0,1.0,0.0,...,2017,0.500000,-8.660254e-01,-0.897805,-0.440394,2007,1.224647e-16,-1.000000e+00,-0.485302,-0.874347
44798,30.0,180.0,180.0,1.0,1.0,-2,0,0.0,1.0,0.0,...,2017,1.000000,6.123234e-17,0.998717,-0.050649,2009,1.000000e+00,6.123234e-17,-0.937752,0.347305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44732,30.0,149.0,149.0,0.0,0.0,33,0,386.0,112.0,69.0,...,2017,0.866025,-5.000000e-01,0.201299,0.979530,2015,5.000000e-01,8.660254e-01,0.790776,-0.612106
54343,30.0,149.0,149.0,1.0,1.0,1,0,6.0,4.0,4.0,...,2017,1.000000,6.123234e-17,-0.988468,0.151428,2016,1.224647e-16,-1.000000e+00,-0.201299,0.979530
38158,30.0,99.0,99.0,1.0,1.0,0,0,7.0,1.0,4.0,...,2017,1.000000,6.123234e-17,-0.394356,0.918958,2012,-2.449294e-16,1.000000e+00,-0.988468,0.151428
860,30.0,99.0,99.0,1.0,0.0,31,0,123.0,22.0,8.0,...,2017,0.866025,-5.000000e-01,-0.897805,-0.440394,2012,5.000000e-01,-8.660254e-01,-0.101168,-0.994869


In [9]:
# Use fit on X_train and then transform both datasets
preprocessor_new.fit(X)
X_train_transformed = preprocessor_new.transform(X)

In [11]:
X_train_transformed

array([[ 1.61538462e+01, -2.64705882e-01, -2.30000000e+01, ...,
         8.66025404e-01,  5.71268215e-01,  8.20763441e-01],
       [-3.84615385e-02,  4.85294118e-01,  0.00000000e+00, ...,
        -8.66025404e-01,  9.98716507e-01, -5.06491688e-02],
       [ 3.84615385e-02,  0.00000000e+00,  0.00000000e+00, ...,
         5.00000000e-01,  1.01168322e-01, -9.94869323e-01],
       ...,
       [-1.19230769e+00,  5.33823529e+00,  0.00000000e+00, ...,
         1.00000000e+00, -9.88468324e-01,  1.51427778e-01],
       [ 0.00000000e+00,  2.38235294e+00,  0.00000000e+00, ...,
        -8.66025404e-01, -1.01168322e-01, -9.94869323e-01],
       [ 0.00000000e+00, -1.76470588e-01,  0.00000000e+00, ...,
         6.12323400e-17, -2.99363123e-01, -9.54139256e-01]])

In [12]:
# Convert transformed data back to DataFrame to ensure column order is maintained
X_train_transformed = pd.DataFrame(
    X_train_transformed, 
    columns=X.columns
)

In [13]:
X_train_transformed

Unnamed: 0,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,is_cancel,remaining_plan_duration,is_discount,num_25,num_50,num_75,...,expire_year,expire_month_sin,expire_month_cos,expire_day_sin,expire_day_cos,registration_year,registration_month_sin,registration_month_cos,registration_day_sin,registration_day_cos
0,16.153846,-0.264706,-23.0,-7.45,-3.040816,0.588235,0.545455,0.333333,1.0,0.0,...,-7.907757e-01,-0.612106,0.500000,-8.660254e-01,-0.299363,-0.954139,5.000000e-01,8.660254e-01,0.571268,0.820763
1,-0.038462,0.485294,0.0,0.00,0.000000,-0.235294,0.000000,-0.333333,0.0,0.0,...,-9.884683e-01,0.151428,0.866025,-5.000000e-01,-0.998717,-0.050649,5.000000e-01,-8.660254e-01,0.998717,-0.050649
2,0.038462,0.000000,0.0,1.55,0.632653,-0.058824,0.090909,0.166667,0.0,0.0,...,-2.993631e-01,-0.954139,0.866025,-5.000000e-01,-0.485302,-0.874347,8.660254e-01,5.000000e-01,0.101168,-0.994869
3,0.769231,-0.382353,0.0,0.00,0.000000,-0.294118,-0.363636,-0.333333,0.0,0.0,...,-2.449294e-16,1.000000,0.500000,-8.660254e-01,-0.897805,-0.440394,1.224647e-16,-1.000000e+00,-0.485302,-0.874347
4,-1.269231,-0.397059,0.0,1.55,0.632653,-0.294118,-0.363636,-0.333333,0.0,0.0,...,8.978045e-01,-0.440394,1.000000,6.123234e-17,0.998717,-0.050649,1.000000e+00,6.123234e-17,-0.937752,0.347305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55989,0.076923,2.529412,0.0,0.00,0.000000,6.235294,5.909091,4.833333,0.0,0.0,...,-7.247928e-01,0.688967,0.866025,-5.000000e-01,0.201299,0.979530,5.000000e-01,8.660254e-01,0.790776,-0.612106
55990,-1.153846,8.470588,0.0,0.00,0.000000,-0.117647,0.000000,-0.083333,0.0,0.0,...,-9.987165e-01,-0.050649,1.000000,6.123234e-17,-0.988468,0.151428,1.224647e-16,-1.000000e+00,-0.201299,0.979530
55991,-1.192308,5.338235,0.0,-2.50,-1.020408,-0.294118,0.000000,-0.250000,0.0,0.0,...,-3.943559e-01,0.918958,1.000000,6.123234e-17,-0.394356,0.918958,-2.449294e-16,1.000000e+00,-0.988468,0.151428
55992,0.000000,2.382353,0.0,-2.50,-1.020408,0.941176,0.363636,1.000000,0.0,0.0,...,-8.978045e-01,-0.440394,0.866025,-5.000000e-01,-0.897805,-0.440394,5.000000e-01,-8.660254e-01,-0.101168,-0.994869


## Logistic Regresssion model

In [27]:
model = LogisticRegression(C= 2.58069,
                           solver='liblinear',
 max_iter= 2,
penalty= 'l1',
 class_weight= 'balanced')

model_no_hyperparams = LogisticRegression()

cv_results = cross_validate(model, X_train_transformed, y, cv=5, scoring='precision')

precision = cv_results['test_score'].mean()

warnings.filterwarnings('ignore')


In [28]:
cv_results

{'fit_time': array([0.05884385, 0.03540301, 0.05247521, 0.04340196, 0.03587604]),
 'score_time': array([0.00842595, 0.00480223, 0.01079893, 0.00376892, 0.0036819 ]),
 'test_score': array([0.95571679, 0.97522961, 0.95062924, 0.97351367, 0.963241  ])}

In [29]:
precision

0.9636660612373207