In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
import warnings

In [3]:
org_data_df= pd.read_csv("../raw_data/churn_df_underbalanced.csv")

In [4]:
org_data_df = org_data_df.sample(frac=1, random_state=42)

In [5]:
org_data_df.head(2)

Unnamed: 0.1,Unnamed: 0,msno,is_churn,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,is_cancel,remaining_plan_duration,...,expire_year,expire_month_sin,expire_month_cos,expire_day_sin,expire_day_cos,registration_year,registration_month_sin,registration_month_cos,registration_day_sin,registration_day_cos
39371,35364,8OTTrmgne+Jb45ryTJXOv9IIZgiraVKD8QvbOsKVndE=,1,35.0,7.0,0.0,0.0,0.0,0.0,451,...,2018,0.5,-0.866025,-0.299363,-0.954139,2016,0.5,0.866025,0.571268,0.820763
20602,957230,0/LAmLtLKEnNkG3Xr8RfGvhjeNJ6EoHbfSKBzjbcqF0=,0,37.0,30.0,149.0,149.0,1.0,0.0,30,...,2017,0.866025,-0.5,-0.998717,-0.050649,2006,0.5,-0.866025,0.998717,-0.050649


In [6]:
data_df = org_data_df.drop(['Unnamed: 0','msno', 'bd', 'payment_method_id', 'city', 'registered_via'], axis=1)
data_df.is_churn.value_counts()
ids = org_data_df.msno

In [7]:
robust_features = ['remaining_plan_duration', 
          'usage_from_ltd',
              'payment_plan_days',
              'plan_list_price',
              'actual_amount_paid',
             'num_50',
              'num_75',
              'num_985',
              'expire_year',
              'last_transaction_year'
             ]
minmax_features = ['registration_year']
normal_features = [ 'num_25',
              
              'num_100',
              'num_unq',
              'total_secs',]

In [8]:
# Build the pipeline with the different steps
robust_pipeline = make_pipeline(RobustScaler())
minmax_pipeline = make_pipeline(MinMaxScaler())
log_pipeline = make_pipeline( 
    FunctionTransformer(np.log1p, validate=True),
    StandardScaler())

preprocessor_new = ColumnTransformer(
    transformers=[
        ('Robust', robust_pipeline, robust_features),
        ('MinMax', minmax_pipeline, minmax_features),
        ('Log', log_pipeline, normal_features)
    ], remainder='passthrough'
)

preprocessor_new

In [9]:
X = data_df.drop(['is_churn'], axis=1)
y= data_df['is_churn']

In [10]:
# Use fit on X_train and then transform both datasets
preprocessor_new.fit(X)
X_train_transformed = preprocessor_new.transform(X)

X_train_transformed = pd.DataFrame(
    X_train_transformed, 
    columns=X.columns
)

X_train_transformed.head(2)


Unnamed: 0,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,is_cancel,remaining_plan_duration,is_discount,num_25,num_50,num_75,...,expire_year,expire_month_sin,expire_month_cos,expire_day_sin,expire_day_cos,registration_year,registration_month_sin,registration_month_cos,registration_day_sin,registration_day_cos
0,16.153846,-0.264706,-23.0,-7.45,-3.040816,0.588235,0.545455,0.333333,1.0,0.0,...,-0.790776,-0.612106,0.5,-0.866025,-0.299363,-0.954139,0.5,0.866025,0.571268,0.820763
1,-0.038462,0.485294,0.0,0.0,0.0,-0.235294,0.0,-0.333333,0.0,0.0,...,-0.988468,0.151428,0.866025,-0.5,-0.998717,-0.050649,0.5,-0.866025,0.998717,-0.050649


In [11]:
model = LogisticRegression(C= 2.58069,
                           solver='liblinear',
 max_iter= 2,
penalty= 'l1',
 class_weight= 'balanced')

In [12]:
X_train, X_test, y_train, y_test, ids_train, ids_test = train_test_split(X_train_transformed, y, ids, test_size=0.2, stratify=y, random_state=42)

In [13]:
model.fit(X_train, y_train)



In [14]:
y_pred = model.predict_proba(X_test)
# y_pred = pd.DataFrame(y_pred)
y_pred= y_pred[:,1]


In [16]:
# from sklearn.metrics import accuracy_score
# accuracy_score(y_test, y_pred)

In [17]:
pd.DataFrame({'id': ids_test, 'prediction': y_pred, 'check': y_test})

Unnamed: 0,id,prediction,check
29784,RimQ2deT5cLaSJpBXGxsfUFQgrKDtTpMNR+LI/gCP+E=,0.362682,1
54227,p2jsQmpCzKg89Rmyj+mskzeDYgp6Vs5iDP+oNb32hDk=,0.253484,1
33069,OiC+Sb7HwFPLg5knh0+FoYKeHPca9UsARMCRh4S9yaM=,0.917324,1
10646,oiU/Xn0b7SqMG/rkKIbc/iL6HwCK8YL2hNtbtb9nMhs=,0.168704,0
11161,itxTVsNqGnQV4e/pJHDWfdq1tWJfecwbrcj+3nW4T5s=,0.248096,0
...,...,...,...
22469,ewx/ju9oF+xtaX+7pR36o2rYdMEeGtx2/4LkpIomJBE=,0.202245,0
11980,4RGMZZ15vO/I4qlI2rAlsa2a1JH7hovrDc6iZJvR5BI=,0.264740,0
45499,JCodGh7XZXEbkGE24za+0qaDc8oDGouMrfxicf/R6NM=,0.389913,1
42462,PfynSfWrNQ7FQBvhmLscWk6t/q+CWmAemWnQlhbXQ1Q=,0.614611,1


In [18]:
X_tr, X_te, y_tr, y_te, ids_tra, ids_te = train_test_split(X,y, ids, test_size=0.2, stratify=y, random_state=42)

In [19]:
merged = X_te.merge(ids_te, left_index=True, right_index=True)
merged

Unnamed: 0,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,is_cancel,remaining_plan_duration,is_discount,num_25,num_50,num_75,...,expire_month_sin,expire_month_cos,expire_day_sin,expire_day_cos,registration_year,registration_month_sin,registration_month_cos,registration_day_sin,registration_day_cos,msno
29784,30.0,99.0,99.0,1.0,0.0,72,0,74.0,13.0,9.0,...,0.500000,-0.866025,-0.848644,0.528964,2013,-8.660254e-01,5.000000e-01,0.485302,-0.874347,RimQ2deT5cLaSJpBXGxsfUFQgrKDtTpMNR+LI/gCP+E=
54227,30.0,149.0,149.0,1.0,0.0,35,0,24.0,9.0,7.0,...,0.866025,-0.500000,-0.485302,-0.874347,2010,8.660254e-01,-5.000000e-01,0.968077,-0.250653,p2jsQmpCzKg89Rmyj+mskzeDYgp6Vs5iDP+oNb32hDk=
33069,30.0,149.0,149.0,1.0,1.0,41,0,35.0,13.0,10.0,...,0.866025,-0.500000,-0.848644,0.528964,2016,8.660254e-01,5.000000e-01,-0.968077,-0.250653,OiC+Sb7HwFPLg5knh0+FoYKeHPca9UsARMCRh4S9yaM=
10646,30.0,99.0,99.0,1.0,0.0,31,0,0.0,0.0,0.0,...,0.866025,-0.500000,-0.968077,-0.250653,2016,-1.000000e+00,-1.836970e-16,-0.998717,-0.050649,oiU/Xn0b7SqMG/rkKIbc/iL6HwCK8YL2hNtbtb9nMhs=
11161,30.0,129.0,129.0,1.0,0.0,31,0,6.0,3.0,7.0,...,0.866025,-0.500000,0.848644,0.528964,2013,-1.000000e+00,-1.836970e-16,-0.998717,-0.050649,itxTVsNqGnQV4e/pJHDWfdq1tWJfecwbrcj+3nW4T5s=
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22469,30.0,99.0,99.0,1.0,0.0,31,0,15.0,10.0,8.0,...,0.866025,-0.500000,-0.897805,-0.440394,2016,5.000000e-01,8.660254e-01,-0.897805,-0.440394,ewx/ju9oF+xtaX+7pR36o2rYdMEeGtx2/4LkpIomJBE=
11980,30.0,180.0,180.0,1.0,0.0,43,0,0.0,0.0,0.0,...,0.500000,-0.866025,0.651372,-0.758758,2013,-5.000000e-01,8.660254e-01,0.848644,0.528964,4RGMZZ15vO/I4qlI2rAlsa2a1JH7hovrDc6iZJvR5BI=
45499,30.0,149.0,149.0,1.0,0.0,68,0,55.0,9.0,6.0,...,0.500000,-0.866025,-0.299363,-0.954139,2010,5.000000e-01,8.660254e-01,0.651372,-0.758758,JCodGh7XZXEbkGE24za+0qaDc8oDGouMrfxicf/R6NM=
42462,30.0,149.0,149.0,1.0,0.0,126,0,16.0,10.0,5.0,...,-0.500000,-0.866025,-0.937752,0.347305,2015,1.224647e-16,-1.000000e+00,-0.848644,0.528964,PfynSfWrNQ7FQBvhmLscWk6t/q+CWmAemWnQlhbXQ1Q=


In [20]:
to_test = merged.head(10)
to_test.to_csv("to_test3.csv")

to_test

Unnamed: 0,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,is_cancel,remaining_plan_duration,is_discount,num_25,num_50,num_75,...,expire_month_sin,expire_month_cos,expire_day_sin,expire_day_cos,registration_year,registration_month_sin,registration_month_cos,registration_day_sin,registration_day_cos,msno
29784,30.0,99.0,99.0,1.0,0.0,72,0,74.0,13.0,9.0,...,0.5,-0.866025,-0.848644,0.528964,2013,-0.866025,0.5,0.485302,-0.874347,RimQ2deT5cLaSJpBXGxsfUFQgrKDtTpMNR+LI/gCP+E=
54227,30.0,149.0,149.0,1.0,0.0,35,0,24.0,9.0,7.0,...,0.866025,-0.5,-0.485302,-0.874347,2010,0.866025,-0.5,0.968077,-0.250653,p2jsQmpCzKg89Rmyj+mskzeDYgp6Vs5iDP+oNb32hDk=
33069,30.0,149.0,149.0,1.0,1.0,41,0,35.0,13.0,10.0,...,0.866025,-0.5,-0.848644,0.528964,2016,0.866025,0.5,-0.968077,-0.250653,OiC+Sb7HwFPLg5knh0+FoYKeHPca9UsARMCRh4S9yaM=
10646,30.0,99.0,99.0,1.0,0.0,31,0,0.0,0.0,0.0,...,0.866025,-0.5,-0.968077,-0.250653,2016,-1.0,-1.83697e-16,-0.998717,-0.050649,oiU/Xn0b7SqMG/rkKIbc/iL6HwCK8YL2hNtbtb9nMhs=
11161,30.0,129.0,129.0,1.0,0.0,31,0,6.0,3.0,7.0,...,0.866025,-0.5,0.848644,0.528964,2013,-1.0,-1.83697e-16,-0.998717,-0.050649,itxTVsNqGnQV4e/pJHDWfdq1tWJfecwbrcj+3nW4T5s=
19168,30.0,149.0,149.0,1.0,0.0,30,0,321.0,64.0,181.0,...,0.866025,-0.5,0.897805,-0.440394,2015,-1.0,-1.83697e-16,0.724793,0.688967,de5YnJjPfHjsKnGhuTLBxBEdckqIlgNwQ62w+AZh64I=
3672,30.0,99.0,99.0,1.0,0.0,31,0,41.0,3.0,3.0,...,0.866025,-0.5,-0.897805,-0.440394,2014,-0.866025,-0.5,-0.394356,0.918958,/PtkirFI6zuradaatMBakk5UosmB6WFRf3PpbwGVTzc=
8338,30.0,149.0,149.0,1.0,0.0,30,0,35.0,2.0,2.0,...,0.866025,-0.5,-0.201299,0.97953,2004,1.0,6.123234000000001e-17,-0.394356,0.918958,uGMY46v47o+v/c1f36ao2NUimSnODMJKncV3Ca4OvU8=
37743,195.0,894.0,894.0,0.0,0.0,195,0,9.0,2.0,3.0,...,-0.866025,0.5,0.394356,0.918958,2013,-1.0,-1.83697e-16,0.201299,0.97953,yYxkPKBEXyqK7KWbBo0OTy8k/UJWmHPoOmYvVI4bnK8=
21082,30.0,180.0,180.0,1.0,0.0,30,0,82.0,41.0,29.0,...,0.866025,-0.5,0.724793,0.688967,2012,-0.866025,-0.5,-0.988468,0.151428,rFNMExdWBTFyUi8nOh0d7rT0RaBBRpORZFF/m9o9MJo=


In [21]:
import joblib

In [22]:
package = {
    'model': model,
    'preprocessor': preprocessor_new  
}

joblib.dump(package, 'package.pkl')

['package.pkl']