<a href="https://colab.research.google.com/github/Samuel-Njoroge/Churn_Prediction/blob/main/Churn_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***CHURN PREDICTION.***
Churn is the rate at which customers stop doing business with an entity or stop using its services. The goal here is to predict which customers will churn.

In [None]:
#Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#Loading the Telco customer-churn CSV into a DataFrame
df = pd.read_csv(
    '/content/drive/MyDrive/Machine Learning Zoomcamp/WA_Fn-UseC_-Telco-Customer-Churn.csv',
    on_bad_lines='skip',  # malformed rows are skipped instead of raising
)

In [None]:
#Number of rows in the dataset
df.shape[0]

7043

In [None]:
#Previewing the first five rows
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
#Transposing the preview so every column is shown as a row
df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [None]:
#Standardising text: column names and the values of every text column
#become lower case, with spaces replaced by underscores
df.columns = [name.lower().replace(' ', '_') for name in df.columns]
categorical_columns = df.dtypes[df.dtypes == 'object'].index.tolist()

for col in categorical_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

In [None]:
#Transposed preview after the text normalisation
df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [None]:
#Checking the data type of every column
df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [None]:
#Parsing totalcharges as numbers; values that cannot be parsed become NaN
#(errors='coerce'). The result is kept separately in tc.
tc = pd.to_numeric(df['totalcharges'], errors='coerce')

In [None]:
#Storing the numeric version of totalcharges back in the DataFrame
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')

In [None]:
#Replacing the missing totalcharges values with 0
df['totalcharges'] = df['totalcharges'].fillna(0)

In [None]:
#Rows where tc is NaN, shown with their current totalcharges value
df.loc[tc.isnull(), ['customerid', 'totalcharges']]

Unnamed: 0,customerid,totalcharges
488,4472-lvygi,0.0
753,3115-czmzd,0.0
936,5709-lvoeq,0.0
1082,4367-nuyao,0.0
1340,1371-dwpaz,0.0
3331,7644-omvmy,0.0
3826,3213-vvolg,0.0
4380,2520-sgtta,0.0
5218,2923-arzlg,0.0
6670,4075-wkniu,0.0


In [None]:
#Previewing the first values of the churn column
df['churn'].head(5)

0     no
1     no
2    yes
3     no
4    yes
Name: churn, dtype: object

In [None]:
#Replacing the values with numbers where yes = 1 , no = 0
#The result is assigned back to the column: the comparison on its own only
#builds a temporary Series, which would leave churn as text that numeric
#aggregations such as mean cannot use.
#isin(['yes', 1]) keeps the cell safe to re-run: an already encoded 1 stays 1.
df['churn'] = df['churn'].isin(['yes', 1]).astype(int)
df['churn'].head()

0    0
1    0
2    1
3    0
4    1
Name: churn, dtype: int64

# **Setting the Validation Framework.**

In [None]:
#Performing the train split using scikitlearn
from sklearn.model_selection import train_test_split

In [None]:
#Holding out 20% of the rows as the test set; random_state fixes the split
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

In [None]:
#Row counts of the full-train and test parts
tuple(map(len, (df_full_train, df_test)))

(5634, 1409)

In [None]:
#Splitting the full-train part again: 25% of it becomes the validation set
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=1,
)

In [None]:
#Row counts of the train, validation and test parts
tuple(map(len, (df_train, df_val, df_test)))

(4225, 1409, 1409)

In [None]:
#Renumbering the rows of each part from 0 and dropping the old index
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)


In [None]:
#Extracting the churn column of each part as an array of target values
y_train, y_val, y_test = (
    part['churn'].values for part in (df_train, df_val, df_test)
)


In [None]:
#Removing the target column from the feature frames
df_train, df_val, df_test = (
    part.drop(columns='churn') for part in (df_train, df_val, df_test)
)

# **EDA**
- Check for missing values
- Look at the target variable
- Look at the numerical and categorical variables

In [None]:
#Renumbering the rows of the full training set from 0 (the old index is dropped)
df_full_train = df_full_train.reset_index(drop = True)

In [None]:
#Counting the missing values in each column
df_full_train.isna().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [None]:
#Share of each churn value in the full training set
df_full_train['churn'].value_counts(normalize=True)

no     0.730032
yes    0.269968
Name: churn, dtype: float64

In [None]:
#Names of the numerical feature columns
numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

In [None]:
#Listing all column names of the full training set
df_full_train.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [None]:
#Names of the categorical feature columns
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

In [None]:
#Number of distinct values in each categorical column
df_full_train.loc[:, categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

# **Feature Importance: Churn rate and risk ratio**

Identifying which features affect the target variable (churn).

**Churn rate**

In [None]:
#Previewing the first five rows of the full training set
df_full_train.head(5)

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,5442-pptjy,male,0,yes,yes,12,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.7,258.35,no
1,6261-rcvns,female,0,no,no,42,yes,no,dsl,yes,...,yes,yes,no,yes,one_year,no,credit_card_(automatic),73.9,3160.55,yes
2,2176-osjuv,male,0,yes,no,71,yes,yes,dsl,yes,...,no,yes,no,no,two_year,no,bank_transfer_(automatic),65.15,4681.75,no
3,6161-erdgd,male,0,yes,yes,71,yes,yes,dsl,yes,...,yes,yes,yes,yes,one_year,no,electronic_check,85.45,6300.85,no
4,2364-ufrom,male,0,no,no,30,yes,no,dsl,yes,...,no,yes,yes,no,one_year,no,electronic_check,70.4,2044.75,no


In [None]:
#Churn rate (mean) and group size (count) per gender.
#mean needs a numeric target, so a churn column that is still text
#('yes'/'no') is encoded as 1/0 first; an already numeric column is used as is.
churn_flag = df_full_train['churn']
if not pd.api.types.is_numeric_dtype(churn_flag):
    churn_flag = (churn_flag == 'yes').astype(int)

df_group = churn_flag.groupby(df_full_train['gender']).agg(['mean', 'count'])

TypeError: ignored

In [None]:
class Pokemon:
    """A Pokemon identified by a name and a type."""

    def __init__(self, name, type):
        """Store the given name and type on the instance."""
        # The parameter is called `type` (shadowing the builtin inside this
        # method only); the name is kept so keyword callers keep working.
        self.name, self.type = name, type

    def stringPokemon(self):
        """Print a one-line description with the name and the type."""
        description = f"Pokemon name is {self.name} and type is {self.type}"
        print(description)

 

class GrassType(Pokemon):
    """Pokemon subclass that prints its own description line."""

    def stringPokemon(self):
        """Override Pokemon.stringPokemon(): print a line with the name only."""
        message = f"Grass type pokemon name is {self.name}"
        print(message)

 

# Demonstrates method overriding: the subclass instance prints its own
# message, the base-class instance prints the generic one.
# The bare `poke1.stringPokemon` / `poke2.stringPokemon` references were
# removed: without parentheses they only look the method up and do nothing.
# NOTE(review): this cell looks unrelated to the churn analysis — consider
# moving it to its own notebook.
poke1 = GrassType('Bulbasaur', 'Grass')
poke1.stringPokemon()

poke2 = Pokemon('Charizard', 'Fire')
poke2.stringPokemon()

Grass type pokemon name is Bulbasaur
Pokemon name is Charizard and type is Fire
