In [2]:
import pandas as pd
import numpy as np 
import scipy.stats as stats

In [3]:
FILE_PATH = "../data/Churn.csv"

In [4]:
df = pd.read_csv(FILE_PATH)
df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [62]:
df.columns

Index(['state', 'account length', 'area code', 'phone number',
       'international plan', 'voice mail plan', 'number vmail messages',
       'total day minutes', 'total day calls', 'total day charge',
       'total eve minutes', 'total eve calls', 'total eve charge',
       'total night minutes', 'total night calls', 'total night charge',
       'total intl minutes', 'total intl calls', 'total intl charge',
       'customer service calls', 'churn'],
      dtype='object')

In [6]:
call_mean = df['total day calls'].mean()
call_median = df['total day calls'].median()
call_std = df['total day calls'].std()
print(f'Total Day calls mean: {np.round(call_mean, 3)}')
print(f'Total Day calls median: {np.round(call_median, 3)}')
print(f'Total Day calls standard deviation: {np.round(call_std, 3)}')

Total Day calls mean: 100.436
Total Day calls median: 101.0
Total Day calls standard deviation: 20.069


In [12]:
"Churn rate is " + str(np.round(sum(df['churn'])/len(df)*100, 3)) + "%"

'Churn rate is 14.491%'

In [13]:
df.groupby('international plan')['customer service calls'].mean()

international plan
no     1.573422
yes    1.464396
Name: customer service calls, dtype: float64

No significant difference between the average number of phone calls if they have internation plan or not.

In [31]:
print("Churn rate with voice mail plan", str(np.round(len(df[(df['voice mail plan']=='yes') & (df['churn']==True)]) / len(df)*100, 2)), "%")
print("Churn rate without voice mail plan", str(np.round(len(df[(df['voice mail plan']=='no') & (df['churn']==True)]) / len(df)*100, 2)), "%")

Churn rate with voice mail plan 2.4 %
Churn rate without voice mail plan 12.09 %


In [32]:
df[['total day minutes', 'total day charge']].corr()

Unnamed: 0,total day minutes,total day charge
total day minutes,1.0,1.0
total day charge,1.0,1.0


Total day minutes and total day charge are perfectely and positively correlated.

In [60]:
df.groupby(['state'])['churn'].apply(lambda x: str(np.round(np.sum(x)/len(df)*100, 4)) + "%").sort_values(ascending=False).head()

state
NJ    0.5401%
TX    0.5401%
MD    0.5101%
MI      0.48%
MN      0.45%
Name: churn, dtype: object

New Jersey and Texas both have the same churn rate.

In [61]:
df.groupby(['state'])['churn'].apply(lambda x: str(np.round(np.sum(x)/len(df)*100, 4)) + "%").sort_values(ascending=False).tail()

state
LA    0.12%
AZ    0.12%
HI    0.09%
IA    0.09%
AK    0.09%
Name: churn, dtype: object

Alaska, Hawaii, and Iowa have lowest churn rates.

In [63]:
df[['customer service calls', 'churn']].corr()

Unnamed: 0,customer service calls,churn
customer service calls,1.0,0.20875
churn,0.20875,1.0


There is a very weak positive relationship between these two independent variables. 

In [64]:
df.groupby("churn")['customer service calls'].mean()

churn
False    1.449825
True     2.229814
Name: customer service calls, dtype: float64

In [66]:
df['total eve calls'].max() - df['total eve calls'].min()

170

In [87]:
q3 = np.quantile(df['total day minutes'], 0.75)
"Churn rate of the top 25% of high usage customers: "+str(np.round(df[df['total day minutes']>q3]['churn'].sum()/(len(df))*100, 3))+'%'

'Churn rate of the top 25% of high usage customers: 7.321%'

In [90]:
df.groupby('churn')['total intl minutes'].mean() #.diff to calc difference

churn
False    10.158877
True     10.700000
Name: total intl minutes, dtype: float64

Only a 0.541 minute distance. Not significant difference.