In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the whole tree under /kaggle/input and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the bare file name to get the full path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the raw Telco customer churn CSV; every later cell works on (and mutates) this frame.
df=pd.read_csv('/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [None]:
# The frame is too wide to show every column at once, so transpose the first rows:
# each column becomes a row and nothing is truncated.
df.head().T

### Uniformizing the column names and categorical data

In [None]:
# Normalise the column labels: lower-case, with underscores instead of spaces.
df.columns=df.columns.str.lower().str.replace(' ','_')

# Apply the same normalisation to the values of every object-typed column.
cate_columns=[name for name,dtype in df.dtypes.items() if dtype=='object']
for c in cate_columns:
    lowered=df[c].str.lower()
    df[c]=lowered.str.replace(' ','_')

In [None]:
df.head().T

In [None]:
df.dtypes

### `totalcharges` is stored as object but should be numerical: convert it and replace the non-numeric values with 0

In [None]:
# The totalcharges column holds objects instead of numbers.
# A plain pd.to_numeric(df.totalcharges) would convert everything; errors='coerce'
# is passed so that values which cannot be parsed become NaN instead of raising.
df['totalcharges']=pd.to_numeric(df['totalcharges'],errors='coerce')

In [None]:
# Uncomment to inspect the rows where the conversion above produced NaN:
#df[df.totalcharges.isnull()][['customerid','totalcharges']]

In [None]:
# Replace the NaN values left in totalcharges with 0.
df['totalcharges']=df['totalcharges'].fillna(0)

### Change yes/no into 1/0

In [None]:
(df.churn=='yes').astype(int).head()

In [None]:
# Encode the target as integers: 1 where churn is 'yes', 0 otherwise.
# The guard makes re-running this cell a no-op: once the column holds integers,
# comparing it with 'yes' again would silently turn every label into 0.
if not pd.api.types.is_integer_dtype(df['churn']):
    df['churn']=(df['churn']=='yes').astype(int)

### Split data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing, then split the remaining 80% 75/25 into
# train and validation (0.25 * 0.8 = 0.2 of all rows for validation).
df_full_train,df_test=train_test_split(df,test_size=0.2,random_state=1)
df_train,df_val=train_test_split(df_full_train,test_size=0.25,random_state=1)

# Renumber each part from 0.
df_train=df_train.reset_index(drop=True)
df_val=df_val.reset_index(drop=True)
df_test=df_test.reset_index(drop=True)

# pop() returns the churn column and removes it from the feature frame in one step,
# so the target cannot be picked up as a feature later. df_full_train keeps its churn column.
y_train=df_train.pop('churn').values
y_val=df_val.pop('churn').values
y_test=df_test.pop('churn').values

### Check if there is missing values

In [None]:
# Renumber the rows of the combined train+validation frame from 0.
df_full_train=df_full_train.reset_index(drop=True)
# Number of missing entries in each column.
df_full_train.isna().sum()

### Analysis of target values

In [None]:
# churn is encoded as 0/1, so its mean is the share of churned customers
# (about 26%); df_full_train.churn.value_counts(normalize=True) gives the same split.
global_churn_rate=df_full_train['churn'].mean()

In [None]:
# Columns used as numeric features (passed to the model as-is, without encoding).
numerical=['tenure','monthlycharges','totalcharges']

In [None]:
df_full_train.columns

In [None]:
# Columns treated as categorical features: analysed per group below and
# one-hot encoded with DictVectorizer before modelling.
categorical=['gender', 'seniorcitizen', 'partner', 'dependents', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod']

In [None]:
df_full_train[categorical].nunique()

In [None]:
# Churn rate among customers whose gender is 'female'.
churn_F=df_full_train.loc[df_full_train['gender']=='female','churn'].mean()
churn_F

In [None]:
# Churn rate among customers whose gender is 'male'.
churn_M=df_full_train.loc[df_full_train['gender']=='male','churn'].mean()
churn_M

In [None]:
global_churn_rate

In [None]:
# Churn rate among customers who have a partner.
churn_partner=df_full_train.loc[df_full_train['partner']=='yes','churn'].mean()
churn_partner

In [None]:
# Churn rate among customers without a partner.
churn_no_partner=df_full_train.loc[df_full_train['partner']=='no','churn'].mean()
churn_no_partner

In [None]:
global_churn_rate-churn_F

In [None]:
global_churn_rate-churn_partner

Seems like partner is more important than female/male

In [None]:
churn_no_partner/global_churn_rate

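1. difference = global - group; a value > 0 means the group is less likely to churn than average
2. risk ratio = group / global; a value > 1 means the group is more likely to churn than average

Note: the `diff` column in the tables below is computed as group - global, so its sign is the opposite of definition 1.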





In [None]:
# Per-gender churn rate and group size, plus two comparisons with the global rate:
#   diff = group rate - global rate
#   risk = group rate / global rate
df_group=(
    df_full_train
    .groupby('gender')['churn']
    .agg(['mean','count'])
    .assign(
        diff=lambda stats: stats['mean']-global_churn_rate,
        risk=lambda stats: stats['mean']/global_churn_rate,
    )
)
df_group

### Analysis of churn rate (who is more likely to churn) within groups

In [None]:
from IPython.display import display

# For every categorical feature, show the churn rate and size of each group together
# with its difference from (group - global) and ratio to (group / global) the global rate.
for c in categorical:
    print(c)
    df_group=df_full_train.groupby(c)['churn'].agg(['mean','count'])
    df_group=df_group.assign(
        diff=df_group['mean']-global_churn_rate,
        risk=df_group['mean']/global_churn_rate,
    )
    display(df_group)
    print()

## Mutual information

In [None]:
from sklearn.metrics import mutual_info_score

In [None]:
mutual_info_score(df_full_train.contract,df_full_train.churn)

In [None]:
mutual_info_score(df_full_train.gender,df_full_train.churn)

In [None]:
def mutual_info_churn_score(series):
    """Return the mutual information between `series` and the churn column.

    The churn column is read from the module-level `df_full_train`, so `series`
    is expected to be aligned with that frame (e.g. one of its columns).
    """
    target=df_full_train.churn
    return mutual_info_score(series,target)

In [None]:
# apply() calls mutual_info_churn_score once per categorical column, giving one score per feature.
mi=df_full_train[categorical].apply(mutual_info_churn_score)
# Show the features from highest to lowest score.
mi.sort_values(ascending=False)

## Correlation coefficient

In [None]:
df_full_train[numerical].corrwith(df_full_train.churn)

In [None]:
df_full_train[(df_full_train.tenure>2) & (df_full_train.tenure<=12)].churn.mean()

In [None]:
df_full_train[df_full_train.tenure<=2].churn.mean()

In [None]:
df_full_train[df_full_train.tenure>12].churn.mean()

## One-hot encoding 

In [None]:
from sklearn.feature_extraction import DictVectorizer

# One-hot encode the categorical columns. The vectorizer is fitted on the training
# records only and then reused unchanged for the validation records.
train_dicts=df_train[categorical].to_dict(orient='records')
dv=DictVectorizer(sparse=False)
dv.fit(train_dicts)
X_train_cat=dv.transform(train_dicts)
# The numeric columns are appended after the encoded ones.
X_train_num=df_train[numerical].to_numpy()
X_train=np.concatenate((X_train_cat,X_train_num),axis=1)

# Validation records get their own name; the previous chained assignment
# (val_dicts=train_dicts=...) overwrote train_dicts with the validation data.
val_dicts=df_val[categorical].to_dict(orient='records')
X_val_cat=dv.transform(val_dicts)
X_val_num=df_val[numerical].to_numpy()
X_val=np.concatenate((X_val_cat,X_val_num),axis=1)

In [None]:
# Names of the encoded columns, in the column order of X_train_cat.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
dv.get_feature_names_out()

## Logistic Regression

In [None]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), applied element-wise by NumPy.

    Accepts a scalar or an array-like that supports unary minus.
    """
    denominator=1+np.exp(-z)
    return 1/denominator

In [None]:
# Evaluate the sigmoid at 51 evenly spaced points in [-5, 5] and plot the curve.
z=np.linspace(-5,5,51)
plt.plot(z,sigmoid(z))

In [None]:
def logistic_regression(xi):
    """Return sigmoid(w0 + sum_j xi[j]*w[j]) for one feature vector `xi`.

    Reads the bias `w0`, the weight sequence `w` and the `sigmoid` helper from
    module scope; `xi` must be indexable and hold at least len(w) values.

    NOTE(review): the notebook stores the fitted coefficients under the name
    `w1`, not `w` — confirm which global this function is meant to read.
    """
    # Start the linear score from the bias term. The original assigned w0 to
    # `result`, so `score` was read before assignment (UnboundLocalError).
    score=w0

    for j in range(len(w)):
        score=score+xi[j]*w[j]

    return sigmoid(score)

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression with default settings on the encoded training matrix.
model=LogisticRegression()
model.fit(X_train,y_train)

In [None]:
model.coef_[0].round(3)

In [None]:
model.intercept_[0]

In [None]:
model.predict(X_train)

In [None]:
# Column 1 of predict_proba: the probability of the second class in model.classes_
# (churn=1, given the 0/1 labels) for every validation row.
y_pred=model.predict_proba(X_val)[:,1]
# Hard decision at the 0.5 threshold.
churn_decision=(y_pred>=0.5)

# Side-by-side table of probability, thresholded prediction and true label.
# Built in a single constructor call; the discarded `y_val==churn_decision`
# expression from the original cell had no effect and is dropped.
df_pred=pd.DataFrame({
    'probability':y_pred,
    'prediction':churn_decision.astype(int),
    'actual':y_val,
})
df_pred['correct']=df_pred.prediction==df_pred.actual
# Share of correct predictions, i.e. validation accuracy.
df_pred.correct.mean()

In [None]:
df_pred

In [None]:
# Pair every vectorizer feature name with a fitted coefficient (rounded to 3 decimals).
# zip() stops at the shorter input, so only the leading coefficients are paired when
# the model has more coefficients than the vectorizer has names.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
dict(zip(dv.get_feature_names_out(),model.coef_[0].round(3)))

In [None]:
# Reduced feature set for a small, easy-to-inspect model:
# one categorical column (contract) and two numeric ones.
small=['contract','tenure','monthlycharges']

In [None]:
# One dict per row, restricted to the reduced feature set, for train and validation.
dicts_train_small=df_train[small].to_dict(orient='records')
dicts_val_small=df_val[small].to_dict(orient='records')

In [None]:
# Separate vectorizer for the reduced feature set, fitted on the training records only.
dv_small=DictVectorizer(sparse=False)
dv_small.fit(dicts_train_small)

In [None]:
# Names of the columns produced by the small vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
dv_small.get_feature_names_out()

In [None]:
# Encode the training records with the small vectorizer.
# NOTE(review): no matching X_val_small is built from dicts_val_small in the cells shown here.
X_train_small=dv_small.transform(dicts_train_small)

In [None]:
# Fit a default logistic regression on the reduced feature matrix.
model_small=LogisticRegression()
model_small.fit(X_train_small,y_train)

In [None]:
# Intercept (bias term) of the small model; logistic_regression() reads this global.
w0=model_small.intercept_[0]
w0

In [None]:
# Coefficient vector of the small model, one weight per dv_small feature.
# NOTE(review): logistic_regression() reads a global named `w`, while the
# coefficients are stored here as `w1` — confirm which name is intended.
w1=model_small.coef_[0]
w1.round(3)

In [None]:
# Map every small-model feature name to its fitted coefficient (rounded to 3 decimals).
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
dict(zip(dv_small.get_feature_names_out(),model_small.coef_[0].round(3)))

In [None]:
y_train.dtype

In [None]:
X_train.dtype

In [None]:
# Re-fit on the full feature matrix with solver, C and random_state spelled out;
# this rebinds `model`, replacing the estimator fitted earlier.
model=LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
# Alternative solver kept for reference:
#model= LogisticRegression(solver='liblinear')
model.fit(X_train,y_train)

In [None]:
# Column 1 of predict_proba for every validation row, i.e. the probability of the
# second class in model.classes_ (churn=1, given the 0/1 labels).
y_pred=model.predict_proba(X_val)[:,1]