In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

In [2]:
# Read the telecom churn CSV into a DataFrame and preview the first five rows.
churn_df = pd.read_csv(
    '../input/telecom-churn-datasets/churn-bigml-80.csv'
)
churn_df.head(n=5)

In [3]:
# Class balance of the target: number of rows per Churn value.
churn_df['Churn'].value_counts()

In [4]:
# Column names, dtypes and non-null counts of the loaded frame.
churn_df.info()

In [5]:
# Summary statistics of the frame's columns.
churn_df.describe()

In [6]:
# Number of rows per distinct area code.
churn_df['Area code'].value_counts()

In [7]:
# Basic visualization: density of 'Account length', one curve per Churn value.
sns.displot(churn_df, x='Account length', kind='kde', hue='Churn')

Account length appears to be approximately normally distributed.

In [8]:
# Stacked histograms of the location columns, coloured by Churn value.
# The same plot is drawn for each column, so loop instead of repeating the call.
for column in ('State', 'Area code'):
    sns.displot(data=churn_df, x=column, hue='Churn', multiple='stack', height=8, aspect=2)
    # plt.xticks(rotation=45)  # optional: rotate tick labels if they overlap

In [9]:
# Stacked histograms of the two plan columns, coloured by Churn value.
# kind='hist' is displot's default, so it is passed explicitly for both columns.
for column in ('International plan', 'Voice mail plan'):
    sns.displot(data=churn_df, x=column, hue='Churn', kind='hist', multiple='stack')

The churn rate is comparatively higher among customers with an International plan.

In [10]:
# Density of 'Number vmail messages', one curve per Churn value.
sns.displot(churn_df, x='Number vmail messages', kind='kde', hue='Churn')

Bimodal distribution: the density has two peaks, so it is not a normal distribution.

In [11]:
# Day usage: density of each metric, one curve per Churn value.
# One loop replaces three copy-pasted calls that differed only in the column name.
for column in ('Total day minutes', 'Total day calls', 'Total day charge'):
    sns.displot(data=churn_df, x=column, hue='Churn', kind='kde')

In [12]:
# Evening usage: density of each metric, one curve per Churn value.
# One loop replaces three copy-pasted calls that differed only in the column name.
for column in ('Total eve minutes', 'Total eve calls', 'Total eve charge'):
    sns.displot(data=churn_df, x=column, hue='Churn', kind='kde')

In [13]:
# Night usage: density of each metric, one curve per Churn value.
# One loop replaces three copy-pasted calls that differed only in the column name.
for column in ('Total night minutes', 'Total night calls', 'Total night charge'):
    sns.displot(data=churn_df, x=column, hue='Churn', kind='kde')

In [14]:
# International usage: density of each metric, one curve per Churn value.
# One loop replaces three copy-pasted calls that differed only in the column name.
for column in ('Total intl minutes', 'Total intl calls', 'Total intl charge'):
    sns.displot(data=churn_df, x=column, hue='Churn', kind='kde')

In [15]:
# Pairwise correlations between the numeric/boolean columns.
# String columns (e.g. 'International plan') are excluded explicitly:
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, while older
# versions dropped such columns silently. select_dtypes works on both.
corr_matrix = churn_df.select_dtypes(include=['number', 'bool']).corr()

# Correlation of every column with the target, strongest positive first.
corr_matrix['Churn'].sort_values(ascending=False)

In [16]:
# Columns shown in the pairwise scatter-plot matrix.
# NOTE(review): 'Total night charge' is absent although the day/eve/intl
# charge columns are listed; confirm whether the omission is intentional.
attributes = [
    'Account length',
    'Number vmail messages',
    'Total day minutes',
    'Total day calls',
    'Total day charge',
    'Total eve minutes',
    'Total eve calls',
    'Total eve charge',
    'Total night minutes',
    'Total night calls',
    'Total intl minutes',
    'Total intl calls',
    'Total intl charge',
    'Customer service calls',
]
axes = scatter_matrix(churn_df[attributes], figsize=(20, 20))

# Make x labels vertical and y labels horizontal and right-aligned on every panel.
for ax in axes.ravel():
    ax.yaxis.label.set_ha('right')
    ax.yaxis.label.set_rotation(0)
    ax.xaxis.label.set_rotation(90)

plt.show()

In [17]:
# Annotated heatmap of the correlations between numeric/boolean columns.
fig, ax = plt.subplots(figsize=(20, 10))
# String columns (e.g. 'International plan') are removed before corr():
# pandas >= 2.0 raises on non-numeric data instead of dropping it silently.
# The axes created above is passed explicitly rather than relying on the
# current-axes default.
ax = sns.heatmap(
    churn_df.select_dtypes(include=['number', 'bool']).corr(),
    annot=True,
    linewidths=.5,
    ax=ax,
)

In [18]:
# Churned customers, split by whether they have an international plan.
intl_plan = churn_df.loc[
    churn_df['Churn'].eq(True) & churn_df['International plan'].eq('Yes')
]
reg_plan = churn_df.loc[
    churn_df['Churn'].eq(True) & churn_df['International plan'].eq('No')
]

In [19]:
# Churned international-plan customers per state, largest count first.
(
    intl_plan['State']
    .value_counts()
    .sort_values(ascending=False)
)

In [20]:
# Number of distinct states among churned customers without an international plan.
(
    reg_plan['State']
    .value_counts()
    .count()
)

In [21]:
# Customers per state across the whole dataset, largest count first.
(
    churn_df['State']
    .value_counts()
    .sort_values(ascending=False)
)

**Plan**

- Train a model on the class-imbalanced data.
- Apply the SMOTE technique to balance the dataset.
- Compare the results.

In [22]:
# Data preparation: build the binary target, then split into train and validation sets.
# Target is 1 where Churn is True and 0 otherwise.
y = (churn_df['Churn'] == True).astype(int)

# NOTE: churn_df is rebound without the 'Churn' column, so re-running this cell
# on the same kernel raises KeyError on the line above; reload the data first.
churn_df = churn_df.drop(columns=['Churn'])

# train_test_split returns (X_train, X_test, y_train, y_test) for two inputs.
# The previous unpacking order (X_train, y_train, X_val, y_val) bound the
# validation features to y_train and the training labels to X_val.
X_train, X_val, y_train, y_val = train_test_split(
    churn_df, y, test_size=0.2, random_state=42
)

In [23]:
# Preview the first rows of X_train.
X_train.head()

In [24]:
# Helper for encoding the categorical 'International plan' column
def cat_conv_intl(X):
    """Encode the 'International plan' column as a 0/1 indicator.

    Args:
        X: DataFrame with an 'International plan' column holding 'Yes'/'No' values.

    Returns:
        Series named 'Yes', aligned with X's index: 1 where the plan is 'Yes', else 0.
    """
    # Compare directly instead of pd.get_dummies(...)['Yes']: the dummy column
    # 'Yes' only exists when at least one row is 'Yes', so the lookup raised
    # KeyError on frames without any international-plan customer.
    return (X['International plan'] == 'Yes').astype(int).rename('Yes')

In [35]:
# One-hot encoding of the State column
def cat_conv_state(X):
    """Return one indicator column per distinct value found in X['State']."""
    state_column = X['State']
    return pd.get_dummies(state_column)

In [40]:
# Not used in the visible cells; presumably the columns intended for ordinal
# encoding later on (TODO confirm).
ordinal_attribs = ['State', 'Area code', 'Number vmail messages']

# Add the international-plan indicator. assign() returns a new frame instead of
# writing into the frame produced by train_test_split.
X_train = X_train.assign(intl_plan=cat_conv_intl(X_train))

# Attach the one-hot State columns column-wise. The previous call,
# X_train.append(...), stacked the dummies as extra rows, discarded the result,
# and DataFrame.append no longer exists in pandas >= 2.0.
state_dummies = cat_conv_state(X_train)
X_train = pd.concat(
    # Drop dummy columns left by an earlier run so re-running the cell does not duplicate them.
    [X_train.drop(columns=state_dummies.columns, errors='ignore'), state_dummies],
    axis=1,
)
X_train
# TODO: Area code encoding
#ac_encoder = OrdinalEncoder()