Data available for download from Kaggle: https://www.kaggle.com/dimitaryanev/mobilechurndataxlsx

In [1]:
import pandas as pd

# The Kaggle spreadsheet was converted to TSV for faster load times; if you
# are working from the file at the link above instead, use read_excel().
f_path = "data/mobile-churn-data.tsv"
df = pd.read_csv(f_path, sep='\t')

In [2]:
# Drop the account ID (personally identifying, no predictive value) and 'year'
# (the same for every row, so it carries no information).
# errors='ignore' keeps the cell idempotent: re-running it after the columns
# are already gone is a no-op instead of a KeyError.
df = df.drop(['user_account_id', 'year'], axis=1, errors='ignore')

In [3]:
df.head()

Unnamed: 0,month,user_lifetime,user_intake,user_no_outgoing_activity_in_days,user_account_balance_last,user_spendings,user_has_outgoing_calls,user_has_outgoing_sms,user_use_gprs,user_does_reload,...,last_100_calls_outgoing_duration,last_100_calls_outgoing_to_onnet_duration,last_100_calls_outgoing_to_offnet_duration,last_100_calls_outgoing_to_abroad_duration,last_100_sms_outgoing_count,last_100_sms_outgoing_to_onnet_count,last_100_sms_outgoing_to_offnet_count,last_100_sms_outgoing_to_abroad_count,last_100_gprs_usage,churn
0,6,1000,0,1,0.05,0.0,1,1,0,0,...,75.27,0.0,63.43,0.0,210,1,84,0,0.0,0
1,6,1000,0,25,28.31,3.45,1,0,0,0,...,13.38,11.18,2.0,11.18,0,0,0,0,0.0,0
2,6,1005,0,8,15.62,1.97,1,0,0,0,...,30.0,0.0,0.0,10.45,0,0,0,0,0.0,0
3,6,1013,0,11,5.62,0.0,1,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0,0,0.0,1
4,6,1032,0,2,5.86,0.15,1,0,0,1,...,2.58,0.0,1.0,0.0,0,0,0,0,0.0,0


In [4]:
# Separate the input features from the 'churn' label. y is kept as a
# one-column DataFrame (double brackets), not a Series.
X = df.drop('churn', axis=1)
y = df[['churn']]

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler


# Hold out 10% of the rows for testing. The fixed random_state makes the
# partition reproducible across Restart & Run All; without it every run
# produces a different split and therefore different downstream results.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1, random_state=42)

In [16]:
def get_shape(clean=False):
    """Print the ``.shape`` of each train/test split, one per line.

    Reads the notebook-level splits. With a falsy ``clean`` only the raw
    splits are printed (X_train, y_train, X_test, y_test); with a truthy
    ``clean`` the processed frames X_train_clean and X_test_clean are
    printed right after their raw counterparts.
    """
    if clean:
        frames = (X_train, X_train_clean, y_train, X_test, X_test_clean, y_test)
    else:
        frames = (X_train, y_train, X_test, y_test)

    for frame in frames:
        print(frame.shape)

            
get_shape()

(59822, 63)
(59822, 1)
(6647, 63)
(6647, 1)


In [19]:
def remove_corr_cols(data, target=None, reference=None):
    """Drop columns whose correlation with ``target`` repeats another column's.

    The correlation of every column of ``reference`` with ``target`` is
    computed, the columns are sorted by that correlation (descending), and
    each column whose correlation value duplicates one that sorts before it
    is dropped from ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to drop columns from. It is not modified.
    target : str or list of str, optional
        Column(s) to correlate against. Defaults to ``['user_lifetime']``.
    reference : pandas.DataFrame, optional
        Frame the correlations are computed on. Defaults to the notebook-level
        ``X_train``, so every split passed in as ``data`` loses the same
        columns regardless of its own contents.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``data`` without the duplicated columns.

    Raises
    ------
    KeyError
        If ``target`` is missing from ``reference`` or a column selected for
        removal is missing from ``data``.
    """
    # A None sentinel avoids a mutable default argument; a bare string is
    # wrapped so the selection below always yields a DataFrame.
    if target is None:
        target = ['user_lifetime']
    elif isinstance(target, str):
        target = [target]
    else:
        target = list(target)

    if reference is None:
        # Resolved at call time from the notebook namespace.
        reference = X_train

    sorted_corr = reference.corr()[target].sort_values(target, ascending=False)
    removing_features = sorted_corr[sorted_corr.duplicated()].index
    return data.drop(removing_features, axis=1)


def process_df(data, scale=None):
    """Drop redundant columns, shorten one column name and standardize.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame to process (e.g. X_train or X_test).
    scale : fitted scaler, optional
        Scaler exposing ``transform``. When omitted, a new StandardScaler is
        fitted on the processed ``data``; pass the scaler returned for the
        training split to transform other splits with the same parameters.
        NOTE: this parameter name shadows ``sklearn.preprocessing.scale``
        inside the function; it is kept for backward compatibility.

    Returns
    -------
    (pandas.DataFrame, scaler)
        The scaled frame (same column names and row index as the processed
        input) and the scaler that produced it.
    """
    # Remove input features with same information
    data = remove_corr_cols(data)
    # Make a long col name shorter & more intuitive
    data = data.rename(columns={'user_no_outgoing_activity_in_days': 'min_outgoing_inactive_days'})

    # Explicit None check: only a missing scaler triggers a new fit.
    if scale is None:
        scale = StandardScaler()
        scale.fit(data)

    # transform() returns a bare array; restore the column names and the
    # original row index so the result stays aligned with the label frames.
    scaled = pd.DataFrame(scale.transform(data), columns=data.columns, index=data.index)
    return scaled, scale

In [20]:
# No scaler is passed for the training split, so process_df fits a new one on it.
# NOTE: binding the result to `scale` replaces the `scale` function imported
# from sklearn.preprocessing earlier in the notebook.
X_train_clean, scale = process_df(X_train)
# Reuse the training-fitted scaler so the test split is only transformed, never fitted on.
X_test_clean, scale = process_df(X_test, scale)

# Print the shapes of the raw and processed splits.
get_shape(True)

(59822, 63)
(59822, 57)
(59822, 1)
(6647, 63)
(6647, 57)
(6647, 1)


In [21]:
# Uncomment the next line if you need to install SMOTE aka imbalanced-learn
# conda install -c conda-forge imbalanced-learn

In [24]:
from imblearn.over_sampling import SMOTE

In [25]:
# Fixed seed so the synthetic minority-class samples are identical on every run.
# NOTE(review): the name `os` shadows the standard-library module of the same
# name; it is kept because later cells call `os.fit_sample` / use `os` directly.
os = SMOTE(random_state=42)
cols = X_train_clean.columns

In [36]:
# fit_resample is the supported name; the old fit_sample alias was deprecated
# and later removed from imbalanced-learn.
os_data_X, os_data_y = os.fit_resample(X_train_clean, y_train)
os_data = pd.DataFrame(data=os_data_X, columns=cols)
# NOTE(review): on some pandas versions os_data appears to share storage with
# os_data_X, so the column added below also shows up in os_data_X - confirm
# before using os_data_X as a pure feature matrix.
os_data['churn'] = os_data_y

In [37]:
os_data_X.columns

Index(['month', 'user_lifetime', 'user_intake', 'min_outgoing_inactive_days',
       'user_account_balance_last', 'user_spendings',
       'user_has_outgoing_calls', 'user_has_outgoing_sms', 'user_use_gprs',
       'user_does_reload', 'reloads_inactive_days', 'reloads_count',
       'reloads_sum', 'calls_outgoing_count', 'calls_outgoing_spendings',
       'calls_outgoing_duration', 'calls_outgoing_spendings_max',
       'calls_outgoing_duration_max', 'calls_outgoing_inactive_days',
       'calls_outgoing_to_onnet_count', 'calls_outgoing_to_onnet_spendings',
       'calls_outgoing_to_onnet_duration', 'calls_outgoing_to_offnet_count',
       'calls_outgoing_to_offnet_spendings',
       'calls_outgoing_to_offnet_duration', 'calls_outgoing_to_abroad_count',
       'calls_outgoing_to_abroad_spendings',
       'calls_outgoing_to_abroad_duration', 'sms_outgoing_count',
       'sms_outgoing_spendings', 'sms_outgoing_spendings_max',
       'sms_outgoing_inactive_days', 'sms_outgoing_to_onnet_co

In [31]:
os_data_X.shape

(94714, 58)

In [27]:
# Report the size of the oversampled set and the row count of each class.
print("Length of oversampled data is                                            ",len(os_data_X))
print("Number of churn whose value is 0 in oversampled data                     ",len(os_data_y[os_data.churn==0]))
print("Number of churn whose value is 1 in oversampled data                     ",len(os_data_y[os_data.churn==1]))

Length of oversampled data is                                             94714
Number of churn whose value is 0 in oversampled data                      47357
Number of churn whose value is 1 in oversampled data in oversampled data  47357


In [42]:
len(X_test_clean)

6647

In [43]:
len(y_test['churn'])

6647

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Baseline model trained on the processed training split, without oversampling.
logreg = LogisticRegression(max_iter=200)

# y requires a series, not a dataframe

logreg.fit(X_train_clean, y_train['churn'])
X_test_preds = logreg.predict(X_test_clean)

# Display the baseline test accuracy so the predictions are actually used and
# there is a number to compare the oversampled model against.
metrics.accuracy_score(y_test['churn'], X_test_preds)



In [38]:
# The fit above is the baseline without SMOTE; now refit the same model on the
# oversampled data and score it on the untouched test split.

# os_data_X may or may not carry the 'churn' label column, depending on whether
# it leaked in when os_data was built from it. errors='ignore' makes the drop
# safe in both cases, so the label is never used as a feature and the cell
# does not raise KeyError when the column is absent.
logreg.fit(os_data_X.drop('churn', axis=1, errors='ignore'), os_data_y['churn'])
logreg.score(X_test_clean, y_test['churn'])

0.8247329622386039

In [None]:
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination on the oversampled training data.
# Make sure the 'churn' label is not among the features: if it leaked into
# os_data_X, ranking features against it would be target leakage.
rfe_features = os_data_X.drop('churn', axis=1, errors='ignore')

# Same iteration budget as the earlier logistic-regression cell.
logreg = LogisticRegression(max_iter=200)
# n_features_to_select must be passed by keyword; current scikit-learn
# releases reject it as a positional argument.
rfe = RFE(logreg, n_features_to_select=40)
rfe = rfe.fit(rfe_features, os_data_y['churn'])

# support_: boolean mask of the selected features; ranking_: 1 for selected
# features, larger numbers for features eliminated earlier.
print(rfe.support_)
print(rfe.ranking_)