In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# Remove pandas' row-display limit so long outputs are shown in full rather than truncated.
pd.set_option('display.max_rows', None)

In [None]:
data = pd.read_csv('Telco_Cusomer_Churn.csv')
data.head()

## Basic understanding of data

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.duplicated().sum()

In [None]:
df = data.copy()

In [None]:
df['TotalCharges'].unique()

In [None]:
# Row labels of the entries whose TotalCharges text contains a space character.
space_index = df[df['TotalCharges'].str.contains(' ')].index

In [None]:
# Set every space-containing TotalCharges entry to 0 with a single label-based
# assignment instead of writing the rows one at a time.
df.loc[space_index, 'TotalCharges'] = 0

In [None]:
# Alternative way to handle the blank TotalCharges entries (kept for reference, not executed):
# df['TotalCharges'].replace(to_replace=' ',value= '0', inplace = True)

In [None]:
df['TotalCharges'].str.contains(' ').sum()

In [None]:
df['TotalCharges'] = df['TotalCharges'].astype(float)

In [None]:
df.info()

In [None]:
# List the distinct values of every column, one separator-delimited section per column.
for column in df.columns:
    print(column, df[column].unique(), '--------', sep='\n')

In [None]:
# Recode the 0/1 flag as 'No'/'Yes'. The result is assigned back to the column:
# calling replace(..., inplace=True) on df['SeniorCitizen'] acts on an intermediate
# object and does not update df under pandas Copy-on-Write.
df['SeniorCitizen'] = df['SeniorCitizen'].replace({1: 'Yes', 0: 'No'})

In [None]:
# Drop the identifier column. Reassigning (instead of inplace=True) with
# errors='ignore' keeps this cell safe to re-run after the column is already gone.
df = df.drop(columns='customerID', errors='ignore')

### Separating numerical and categorical data

In [None]:
# Column-name lists used by the rest of the notebook:
# `num` holds the numeric-dtype columns, `cat` the object-dtype columns.
num = df.select_dtypes(include=np.number).columns.to_list()
cat = df.select_dtypes(include=object).columns.to_list()

### fetch outliers using IQR

In [None]:
Q1 = df[num].quantile(0.25)
Q3 = df[num].quantile(0.75)

In [None]:
IQR = Q3 - Q1
IQR

In [None]:
lower_whis  = Q1 - (1.5*IQR)
lower_whis

In [None]:
upper_whis = Q3 + (1.5*IQR)
upper_whis

In [None]:
# A row is an outlier when any numeric column falls outside the IQR whiskers;
# the row mask is built once and used for both subsets.
below_lower = df[num] < lower_whis
above_upper = df[num] > upper_whis
is_outlier_row = (below_lower | above_upper).any(axis=1)
outliers = df[is_outlier_row]
non_outliers = df[~is_outlier_row]

In [None]:
len(outliers)/len(df)*100

In [None]:
plt.rcParams['figure.figsize'] = [20, 10]

In [None]:
# One boxplot per numeric column, laid out side by side in a 1x3 grid.
for slot, column in enumerate(num, start=1):
    plt.subplot(1, 3, slot)
    sns.boxplot(df[column])
    plt.title(f'Boxplot of {column}')
plt.tight_layout()
plt.show()

In [None]:
df.describe()

In [None]:
# Skewness of each of the three numeric columns, printed one per line.
print(
    'tenure-------------', df['tenure'].skew(),
    '\nMonthlyCharges-----', df['MonthlyCharges'].skew(),
    '\nTotalCharges-------', df['TotalCharges'].skew(),
)

In [None]:
df[cat].describe().T

<font size = 4px color = #00FF00>Inference</font>
* There are nearly 7000 rows with 21 columns
* Dropped customer ID for analysis
* Treated blanks in TotalCharges as 0, since their respective tenure is 0
* Converted the pre-encoded SeniorCitizen column (0/1) to a categorical column
* No evident outliers and no null values
* The numeric columns seem to be skewed

<font color = #00FFFF size = 6px><b>Deep Analysis</b></font>

### Univariate Analysis

In [None]:
# Histogram of every numeric column, placed in consecutive cells of a 3x3 grid.
for slot, column in enumerate(num, start=1):
    plt.subplot(3, 3, slot)
    sns.histplot(df[column], color='green')
    plt.title(f'Histplot of {column}')
plt.tight_layout()
plt.show()

In [None]:
# Histogram with a KDE overlay for every numeric column on a 3x3 grid.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# replacement seaborn's deprecation notice recommends.
t = 1
for i in num:
    plt.subplot(3,3,t)
    sns.histplot(df[i], kde=True, stat='density', color='red')
    plt.title(f'Distplot of {i}')
    t +=1
plt.tight_layout()
plt.show()

In [None]:
# Yellow boxplot of every numeric column, placed in consecutive cells of a 3x3 grid.
for slot, column in enumerate(num, start=1):
    plt.subplot(3, 3, slot)
    sns.boxplot(df[column], color='yellow')
    plt.title(f'Boxplot of {column}')
plt.tight_layout()
plt.show()

In [None]:
# Kernel density estimate of every numeric column on a 3x3 grid.
for slot, column in enumerate(num, start=1):
    plt.subplot(3, 3, slot)
    sns.kdeplot(df[column], color='blue')
    plt.title(f'KDEplot of {column}')
plt.tight_layout()
plt.show()

In [None]:
# Violin plot of every numeric column, side by side in a 1x3 grid.
for slot, column in enumerate(num, start=1):
    plt.subplot(1, 3, slot)
    sns.violinplot(df[column], color='#5F9EA0')
    plt.title(f'Violinplot of {column}')
plt.tight_layout()
plt.show()

<font size = 4px color = #00FF00>Inference</font>
* Tenure --> Slightly right skewed and many customers with tenure value between 0 to 10
* MonthlyCharges --> Slightly left skewed and many customers with monthly charges between 20 and 30

In [None]:
# Pie chart of the value shares of every categorical column on a 5x4 grid,
# drawn under a temporary rc_context (font size 10, 25x20 figure size).
with plt.rc_context({'font.size': 10, 'figure.figsize': (25, 20)}):
    for slot, column in enumerate(cat, start=1):
        plt.subplot(5, 4, slot)
        shares = df[column].value_counts()
        shares.plot(kind='pie', autopct='%.1f%%')
        plt.title(f'Piechart of {column}')
plt.tight_layout()
plt.show()

In [None]:
# Count plot of every categorical column on a 5x4 grid.
for slot, column in enumerate(cat, start=1):
    plt.subplot(5, 4, slot)
    sns.countplot(x=df[column], palette='plasma')
    plt.title(f'Count plot of {column}')
plt.tight_layout()
plt.show()

### Bivariate Analysis

#### Num V/s Num

In [None]:
df[num].corr()

In [None]:
# Scatter plot for every ordered pair of distinct numeric columns on a 2x3 grid.
column_pairs = [
    (x_col, y_col)
    for x_col in num
    for y_col in num
    if x_col != y_col  # skip plotting a column against itself
]
for slot, (x_col, y_col) in enumerate(column_pairs, start=1):
    plt.subplot(2, 3, slot)
    sns.scatterplot(x=df[x_col], y=df[y_col])
plt.tight_layout()
plt.show()

#### Num V/s cat

In [None]:
# Mean of every numeric column within each Churn group.
# (The subplot counter `t` was never used in this cell and has been removed.)
for i in num:
    print(f'Mean {i} for churn column')
    print(df.groupby('Churn')[i].mean())
    print('----------------------------')

In [None]:
# Bar plot of each numeric column against Churn, side by side in a 1x3 grid.
for slot, column in enumerate(num, start=1):
    plt.subplot(1, 3, slot)
    sns.barplot(x=df['Churn'], y=df[column])
plt.tight_layout()
plt.show()

#### Cat v/s cat (churn)

In [None]:
# Frequency table of Churn against every categorical column.
for column in cat:
    churn_table = pd.crosstab(df['Churn'], df[column])
    print(
        f'frequency summary between churn and {column} ',
        churn_table,
        '---------------------------------',
        sep='\n',
    )

In [None]:
# Churn counts split by every other categorical column on a 6x3 grid.
# countplot already draws the bars; the argument-less `.plot(kind='bar')`
# previously chained onto the returned Axes was redundant and has been removed.
t = 1
for i in cat:
    if i != 'Churn':
        plt.subplot(6,3,t)
        sns.countplot(x = df['Churn'], hue = df[i])
        plt.title(f'count of Churn v/s {i} ')
        t+=1
plt.tight_layout()
plt.show()

### Multivariate Analysis

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
# Colormaps that can be passed as cmap=: 'plasma', 'magma', 'viridis', 'Greens',
# 'Blues', 'Oranges', 'coolwarm', 'cividis'
numeric_corr = df[num].corr()
sns.heatmap(numeric_corr, annot=True)
plt.show()

In [None]:
sns.pairplot(df[num], diag_kind='kde')
plt.show()

<font color='red' size=4px><b><i>Statistical Inference</i></b></font>

***1.Check whether gender has any dependency on churn***

In [None]:
f_obs = pd.crosstab(df['Churn'],df['gender'])
f_obs

In [None]:
from scipy import stats

In [None]:
# Chi-square test of independence on the Churn x gender contingency table.
# Returns the statistic, p-value, degrees of freedom and expected frequencies.
chi_stats, p_value, dof, f_exp = stats.chi2_contingency(f_obs)
# The labels previously carried a stray double quote ('Chi stats:"').
print('Chi stats:', chi_stats)
print('p value:', p_value)

In [None]:
# Critical chi-square value at the 5% significance level, using the degrees of
# freedom reported by chi2_contingency instead of a hard-coded 1.
chi_crit = round(stats.chi2.isf(0.05, dof), 3)
chi_crit

***Conclusion***
* p_value > sig value
* stats < crit value --->>>> Fails to reject Ho
* ----:::::::: Gender and churn are independent

***2.Check whether tenure is statistically significant in predicting churn***

In [None]:
# Tenure values of churned ('Yes') and retained ('No') customers.
churned = df['Churn'] == 'Yes'
retained = df['Churn'] == 'No'
yes = df.loc[churned, 'tenure']
no = df.loc[retained, 'tenure']

* Ho: Tenure is not statistically significant in predicting churn
* Ha: tenure is statistically significant in predicting churn
----------------------------------------------------
* Ho: tenure and churn are independent
* Ha: tenure and churn are dependent
------------------------------------------------------
* Ho: Means are equal
* Ha: Means are not equal


****Assume data is normal and of equal variance****

In [None]:
stats.ttest_ind(yes,no)

In [None]:
# Two-tailed critical t value at the 5% level. The degrees of freedom for the
# pooled two-sample t-test are n1 + n2 - 2, taken from the samples themselves
# rather than a hard-coded 7041.
stats.t.isf(0.05/2, df=len(yes) + len(no) - 2)

***Conclusion***
* p_value < sig value
* stats > crit value --->>>>  reject Ho
* ---->>>>>>>>>>>>>>> tenure is statistically significant in predicting churn

***3.Check whether contract has any dependency with monthly charges***

In [None]:
df['Contract'].unique()

In [None]:
# Monthly charges for each of the three contract types (inputs to the one-way ANOVA).
contract = df['Contract']
s1 = df.loc[contract == 'Month-to-month', 'MonthlyCharges']
s2 = df.loc[contract == 'One year', 'MonthlyCharges']
s3 = df.loc[contract == 'Two year', 'MonthlyCharges']

In [None]:
# ANOVA degrees of freedom derived from the data instead of hard-coded group counts:
# between groups = k - 1, within groups = N - k, with k the number of contract types.
n_contract_groups = df['Contract'].nunique()
dfb = n_contract_groups - 1
dfw = len(df['Contract']) - n_contract_groups

In [None]:
stats.f_oneway(s1,s2,s3)

In [None]:
# Critical F value at the 5% level. The ANOVA F-test is right-tailed, so the
# whole alpha sits in the upper tail and must not be halved.
stats.f.isf(0.05, dfb, dfw)

***Conclusion***
* p_value < sig value
* stats > crit value --->>>>  reject Ho
* ---->>>>>>>>>>>>>>> contract has dependency with monthly charges

In [None]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [None]:
pairwise_tukeyhsd(df['MonthlyCharges'],df['Contract']).summary()

The Month-to-month and One year pair have the same mean; the other pairs have different means (Ho is rejected for those pairs).

**4.Check tenure has any impact on monthly charges**

In [None]:
stats.pearsonr(df['tenure'],df['MonthlyCharges'])

***Conclusion***
* p_value < sig value--->>> Reject Ho
* stats > crit value --->>>>  reject Ho
* ---->>>>>>>>>>>>>>> tenure has impact on monthly charges

<font color = #00FFFF size = 6px><b>Encoding</b></font>

In [None]:
df1 = df.copy()
df1.head()

In [None]:
# Show the distinct levels of every categorical column before choosing an encoding.
for column in cat:
    print(column, df[column].unique(), '--------------------------', sep='\n')

**One hot encoding**

In [None]:
one_hot = ['InternetService','PaymentMethod']

In [None]:
# Preview of full one-hot encoding (the result is not stored). Only the first
# rows are shown: display.max_rows is unlimited in this notebook, so displaying
# the whole frame would render every row.
pd.get_dummies(data = df1, columns=one_hot, dtype=int).head() # one hot encoding

In [None]:
df1 = pd.get_dummies(data = df1, columns=one_hot, dtype = int, drop_first=True) # N-1 dummy encoding

**Frequency encoding**

In [None]:
freq = df1['gender'].value_counts(normalize = True)

In [None]:
# Replace each gender label with its relative frequency. map() returns a float
# column directly, whereas replace() on an object column depends on pandas
# re-inferring the dtype afterwards (a deprecated behaviour).
df1['gender'] = df1['gender'].map(freq)

In [None]:
df1['gender'].unique()

**Ordinal encoding**

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
ordinal = OrdinalEncoder(categories=[['Month-to-month','One year','Two year']])

In [None]:
df1['Contract']=ordinal.fit_transform(df1[['Contract']])

**Label Encoding**

In [None]:
cat1 = ['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling',  'Churn']

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Integer-encode each column listed in cat1; the encoder is refit for every column.
label_encoder = LabelEncoder()
for column in cat1:
    encoded = label_encoder.fit_transform(df1[column])
    df1[column] = encoded

In [None]:
df1.info()

<font color = #00FFFF size = 6px><b>Transformation</b></font>

In [None]:
df1[num].skew()

Transform TotalCharges using the Yeo-Johnson transformation (the column contains 0 values, so Box-Cox cannot be used)

In [None]:
from sklearn.preprocessing import PowerTransformer
p = PowerTransformer()

In [None]:
df['TotalCharges'] = p.fit_transform(df[['TotalCharges']])

In [None]:
df['TotalCharges'].skew()

<font color = #00FFFF size = 6px><b>Scaling</b></font>

### Standard scaling

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Standardise tenure; the fitted scaler is kept in `s`.
s = StandardScaler()

df1['tenure'] = s.fit_transform(df1[['tenure']])

In [None]:
# Min-max scale MonthlyCharges; the fitted scaler is kept in `m`.
m = MinMaxScaler()

df1['MonthlyCharges'] = m.fit_transform(df1[['MonthlyCharges']])