In [25]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import pandas as pd
from datetime import datetime, timedelta
import numpy as np

In [26]:
# Load the raw training table; the path is relative to the current working directory.
train_df = pd.read_csv("data/training_data.csv")

In [27]:
def get_age(dob, now=None):
    """Return the approximate age in years for a date of birth.

    Parameters
    ----------
    dob : datetime-like
        Date of birth; any value that can be subtracted from a ``datetime``
        (e.g. ``datetime`` or ``pandas.Timestamp``).
    now : datetime, optional
        Reference instant the age is measured against. When omitted, the
        current local time (``datetime.now()``) is used, which matches the
        one-argument call. Pass a fixed value to get reproducible ages.

    Returns
    -------
    float
        Whole elapsed days divided by 365 (leap days are not accounted for).
    """
    if now is None:
        now = datetime.now()
    return (now - dob).days / 365

In [28]:
# Parse DOB into datetimes, then derive an `age` column from the parsed values.
# assign() evaluates its keyword arguments in order, so the `age` callable
# already sees the parsed DOB column.
train_df = train_df.assign(
    DOB=lambda frame: pd.to_datetime(frame["DOB"]),
    age=lambda frame: frame["DOB"].apply(get_age),
)

In [29]:
# DOB has been converted to `age`; remove the raw column.
train_df = train_df.drop(columns=["DOB"])

In [30]:
# Convert both subscription date columns to datetime values.
train_df = train_df.assign(
    Start_Date=pd.to_datetime(train_df["Start_Date"]),
    Validity_End=pd.to_datetime(train_df["Validity_End"]),
)

In [31]:
def get_le_dict(le):
    """Print the label -> integer code mapping of a fitted encoder.

    Parameters
    ----------
    le : fitted encoder
        Object exposing ``classes_`` and ``transform`` (e.g. a fitted
        ``sklearn.preprocessing.LabelEncoder``).

    Returns
    -------
    None
        The mapping is printed, not returned.
    """
    classes = np.asarray(le.classes_)
    # Encode every class with a single transform call instead of one call per class.
    codes = np.asarray(le.transform(classes))
    # tolist() yields plain Python scalars, so the printed dict reads
    # {'Email': 0, ...} instead of showing NumPy scalar reprs.
    ret_dict = dict(zip(classes.tolist(), codes.tolist()))
    print(ret_dict)

In [32]:
# Swap the Contact_type labels for integer codes; `le` holds the fitted encoder.
le = LabelEncoder()
train_df = train_df.assign(Contact_type=le.fit_transform(train_df["Contact_type"]))

In [33]:
# Print which integer code each Contact_type label received.
get_le_dict(le)

{'Email': 0, 'Phone': 1, 'Telegram': 2, 'WhatsApp': 3}


In [34]:
# Swap the Gender labels for integer codes; `le` now holds the Gender encoder.
le = LabelEncoder()
train_df = train_df.assign(Gender=le.fit_transform(train_df["Gender"]))

In [35]:
# Print which integer code each Gender label received.
get_le_dict(le)

{'Agender': 0, 'Bigender': 1, 'Female': 2, 'Genderfluid': 3, 'Genderqueer': 4, 'Male': 5, 'Non-binary': 6, 'Polygender': 7}


In [36]:
# Store Pack_ID as integers.
train_df = train_df.astype({"Pack_ID": int})

In [37]:
# Swap the Status labels for integer codes; `le` now holds the Status encoder.
le = LabelEncoder()
train_df = train_df.assign(Status=le.fit_transform(train_df["Status"]))

In [38]:
# Print which integer code each Status label received.
get_le_dict(le)

{'ACTIVE': 0, 'INACTIVE': 1}


In [39]:
# Swap the State labels for integer codes; `le` now holds the State encoder.
le = LabelEncoder()
train_df = train_df.assign(State=le.fit_transform(train_df["State"]))

In [40]:
# Print which integer code each State label received.
get_le_dict(le)

{'Andhra Pradesh': 0, 'Arunachal Pradesh': 1, 'Assam': 2, 'Bihar': 3, 'Chhattisgarh': 4, 'Goa': 5, 'Gujarat': 6, 'Haryana': 7, 'Himachal Pradesh': 8, 'Jharkhand': 9, 'Karnataka': 10, 'Kerala': 11, 'Madhya Pradesh': 12, 'Maharashtra': 13, 'Manipur': 14, 'Meghalaya': 15, 'Mizoram': 16, 'Nagaland': 17, 'Odisha': 18, 'Punjab': 19, 'Rajasthan': 20, 'Sikkim': 21, 'Tamil Nadu': 22, 'Telangana': 23, 'Tripura': 24, 'Uttar Pradesh': 25, 'Uttarakhand': 26, 'West Bengal': 27}


In [41]:
# Swap the "Billing Type" labels for integer codes; the column name contains a
# space, so it is passed to assign() through a dict instead of a keyword.
le = LabelEncoder()
train_df = train_df.assign(**{"Billing Type": le.fit_transform(train_df["Billing Type"])})

In [42]:
# Print which integer code each "Billing Type" label received.
get_le_dict(le)

{'NetBanking': 0, 'Paytm': 1}


In [43]:
# Swap the Upgradable labels for integer codes; `le` now holds the Upgradable encoder.
le = LabelEncoder()
train_df = train_df.assign(Upgradable=le.fit_transform(train_df["Upgradable"]))

In [44]:
# Print which integer code each Upgradable label received.
get_le_dict(le)

{'NO': 0, 'YES': 1}


In [45]:
# Swap the freq_used_dev labels for integer codes; `le` now holds that encoder.
le = LabelEncoder()
train_df = train_df.assign(freq_used_dev=le.fit_transform(train_df["freq_used_dev"]))

In [46]:
# Print which integer code each freq_used_dev label received.
get_le_dict(le)

{'Android Phone': 0, 'Android TV': 1, 'Apple TV': 2, 'FireTV': 3, 'PC/Laptop': 4, 'iPad': 5, 'iPhone': 6}


In [47]:
# Country is not used as a feature; remove the column.
train_df = train_df.drop(columns=["Country"])

In [48]:
# Load the churn table; the path is relative to the current working directory.
churn_df = pd.read_csv("data/churn.csv")

In [49]:
# Join the churn_df columns onto the training rows by profile_ID. how="left"
# keeps every training row, so a profile_ID with no match in churn_df gets NaN
# in the columns that come from churn_df.
# NOTE(review): nothing here checks that profile_ID is unique in churn_df;
# duplicates would multiply training rows. Consider validate="many_to_one".
train_df = train_df.merge(churn_df, on = "profile_ID", how = "left")

In [50]:
# Feature columns fed to the model, in the order they appear in X.
cols2train = [
    "Contact_type",
    "Gender",
    "Pack_ID",
    "Status",
    "Upgradable",
    "State",
    "avg_dur",
    "num_devs",
    "freq_used_dev",
    "Subscription Type",
    "Billing Type",
    "Customer Longevity",
    "age",
]

In [51]:
# Feature matrix (the selected columns) and target vector (the churn column).
X, y = train_df[cols2train], train_df["churn"]

In [52]:
# Keep 70% of the rows for training and hold out the rest for evaluation.
# A fixed random_state makes the shuffle, and therefore the reported score,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42
)

In [53]:
# Logistic-regression classifier with scikit-learn's default hyperparameters.
# NOTE(review): the features are passed unscaled (the X_train preview shows
# Pack_ID around 4e5 and avg_dur around 1e3), and label-encoded nominal columns
# such as State are treated as ordinal numbers. Consider scaling / one-hot
# encoding; TODO confirm this is intentional.
lr_mod = LogisticRegression()

In [54]:
# Display the training features (the rendered output is truncated by pandas).
X_train

Unnamed: 0,Contact_type,Gender,Pack_ID,Status,Upgradable,State,avg_dur,num_devs,freq_used_dev,Subscription Type,Billing Type,Customer Longevity,age
335668,3,2,432802,0,0,1,1518.000000,30,4,12,1,4,20.347945
294643,3,2,432813,1,1,20,1344.454545,22,4,12,1,4,30.734247
109481,1,5,432866,1,1,21,1114.187500,16,4,3,0,2,31.005479
152127,1,7,432231,1,0,4,1326.950000,20,3,1,1,1,21.282192
194920,2,5,432458,0,0,6,1045.950000,20,3,12,1,4,1.068493
...,...,...,...,...,...,...,...,...,...,...,...,...,...
246778,2,5,432486,1,1,1,1507.173913,23,3,1,1,1,7.654795
244882,2,2,432674,1,1,8,795.560000,25,3,6,0,3,29.219178
257937,3,5,432574,1,1,11,1179.450000,20,4,3,0,2,25.942466
97806,1,2,432588,1,0,11,1125.700000,20,4,12,1,4,30.191781


In [55]:
# Fit the classifier on the training split.
lr_mod.fit(X_train, y_train)

LogisticRegression()

In [56]:
# Evaluate on the held-out split; for scikit-learn classifiers score() is mean accuracy.
# NOTE(review): compare against the majority-class rate of y_test before
# reading this number as model skill.
lr_mod.score(X_test, y_test)

0.7989219290336005

In [57]:
import pickle

In [58]:
# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed even if pickling raises.
with open('lr_mod.p', 'wb') as model_file:
    pickle.dump(lr_mod, model_file)

In [34]:
import numpy as np

In [35]:
# Row-major NumPy copy of the test features. Iterating a DataFrame yields its
# column labels, so np.vstack(X_test) would stack the feature names rather than
# the rows; to_numpy() returns the actual (n_rows, n_features) values.
X_test1 = X_test.to_numpy()

In [48]:
# Renumber the test rows 0..n-1, discarding the shuffled original index.
X_test = X_test.reset_index(drop=True)

In [51]:
# Class probabilities for the first test row. iloc[[0]] keeps the row as a
# one-row DataFrame, so the model receives the same feature names it was
# fitted with (a bare ndarray triggers scikit-learn's feature-name warning),
# and the lookup is positional, independent of the current index labels.
lr_mod.predict_proba(X_test.iloc[[0]])[0]

array([0.73531531, 0.26468469])