# Customer Churn - Decision tree model using NumPy

In [38]:
import numpy as np
import openml
import pandas as pd

# Load the customer churn dataset from OpenML
Here I have used the dataset with id 45568.

In [39]:
# Fetch the customer-churn dataset (id 45568) from OpenML.
dataset = openml.datasets.get_dataset(45568)
# get_data() returns several values, not just one (per the original author's
# note: the DataFrame, attribute names, class names, ...).
# Only the first value - the 2-D table of data - is needed here, so it is bound
# to `df`; the starred target collects all remaining values into `_`, which is
# used as a throwaway variable.
df, *_= dataset.get_data()
print(df.head())  # show the first few rows of the DataFrame as a sanity check

   gender SeniorCitizen Partner Dependents  tenure PhoneService  \
0  Female             0     Yes         No       1           No   
1    Male             0      No         No      34          Yes   
2    Male             0      No         No       2          Yes   
3    Male             0      No         No      45           No   
4  Female             0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity OnlineBackup  \
0  No phone service             DSL             No          Yes   
1                No             DSL            Yes           No   
2                No             DSL            Yes          Yes   
3  No phone service             DSL            Yes           No   
4                No     Fiber optic             No           No   

  DeviceProtection TechSupport StreamingTV StreamingMovies        Contract  \
0               No          No          No              No  Month-to-month   
1              Yes          No        

# Cleaning up the dataset
As we can see, the features do not contain only digits but also string values, which makes comparison and categorisation difficult, so let's map the strings to digits.
Example: for gender, Female/Male is converted to 0/1;
for Contract, Month-to-month is 0, One year is 1, Two year is 2, ...

### Kindly note that it does not matter which value is mapped to which number, since we store the mapping and can always translate the numbers back to the original values

In [40]:
# Used to map categorical (string) values to integers
def auto_encode(df):
    """Integer-encode categorical columns and impute missing values.

    The work is done on a copy, so the DataFrame passed in is left untouched
    and the cell calling this function can be re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. Columns with pandas ``category`` dtype are encoded,
        floating-point columns are imputed, all other columns pass through
        unchanged.

    Returns
    -------
    encoded : pandas.DataFrame
        Copy of ``df`` where every categorical column has been replaced by
        its integer codes (starting at 0, following the column's category
        order) and NaNs in float columns have been replaced by the column
        mean.
    encodings : dict
        ``{column name: {code: original category}}`` for every encoded
        column, so the codes can be translated back later.

    Notes
    -----
    Missing categorical values are filled with the column's mode before
    encoding. A categorical column that contains no values at all has no
    mode; it is left unfilled and its entries are encoded as ``-1``.
    """
    encoded = df.copy()  # never mutate the caller's frame
    encodings = {}  # remembers which integer stands for which category

    for col in encoded.columns:
        column = encoded[col]

        if isinstance(column.dtype, pd.CategoricalDtype):
            # Fill missing values with the most frequent category (if any).
            modes = column.mode()
            if not modes.empty:
                column = column.fillna(modes.iloc[0])

            # Store the code -> category mapping for later decoding.
            encodings[col] = dict(enumerate(column.cat.categories))

            # cat.codes assigns integers starting from 0 in category order.
            encoded[col] = column.cat.codes

        elif pd.api.types.is_float_dtype(column.dtype):
            # Covers every float width (float32, float64, ...), not only float64.
            encoded[col] = column.fillna(column.mean())

    return encoded, encodings

# A train/test split helper is also available directly from sklearn; here we implement it manually
def train_test_split(X, y, test_ratio=0.2, random_state=None):
    """Randomly split features and labels into training and test sets.

    Parameters
    ----------
    X : array-like
        Feature rows; converted with ``np.asarray``. Split along the first axis.
    y : array-like
        Labels, one per row of ``X``; converted with ``np.asarray``.
    test_ratio : float, default 0.2
        Fraction of the samples placed in the test set. The test-set size is
        ``int(n_samples * test_ratio)``, i.e. it is rounded down.
    random_state : int or None, default None
        Seed for a dedicated random generator, giving the same split on every
        call. When ``None`` the shuffle uses NumPy's global random state
        (controllable with ``np.random.seed``), as this function did before
        the parameter existed.

    Returns
    -------
    X_train, y_train, X_test, y_test : numpy.ndarray
        Note the order: both training arrays come first, then both test arrays.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``test_ratio`` is outside [0, 1].
    """
    X = np.asarray(X)
    y = np.asarray(y)

    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same number of samples, got {n_samples} and {len(y)}"
        )
    if not 0.0 <= test_ratio <= 1.0:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio}")

    if random_state is None:
        # Legacy behaviour: shuffle [0, 1, ..., n_samples-1] with the global RNG.
        indices = np.arange(n_samples)
        np.random.shuffle(indices)
    else:
        # Seeded generator that does not touch the global random state.
        indices = np.random.default_rng(random_state).permutation(n_samples)

    test_size = int(n_samples * test_ratio)

    # The first `test_size` shuffled positions form the test set, the rest train.
    test_indices = indices[:test_size]
    train_indices = indices[test_size:]

    return X[train_indices], y[train_indices], X[test_indices], y[test_indices]

# Final cleanup and split data

In [41]:
# NOTE: an earlier draft of this cell tried to convert every column with
# pd.to_numeric and then dropped all rows containing NaN. That step is no
# longer run; auto_encode fills missing values instead of dropping rows.

# Encode automatically: map categorical values to integer codes and keep the
# code -> category mapping in `encodings`.
df, encodings = auto_encode(df)

# Separate the features from the 'Churn' target and convert from a pandas
# DataFrame to NumPy arrays.
X = df.drop(columns=['Churn']).values
y = df['Churn'].values

# Seed NumPy's global random state so the shuffle inside train_test_split
# gives the same split on every Restart & Run All.
np.random.seed(42)

# Shuffle and split into training and test sets.
X_train, y_train, X_test, y_test = train_test_split(X, y)

In [43]:
# Show the code -> category mapping one column per line; printing the whole
# dict at once produces a single, very long line that is hard to read.
for column, mapping in encodings.items():
    print(f"{column}: {mapping}")

# Preview the encoded table instead of printing the entire frame.
print(df.head())
print(df.shape)

{'gender': {0: 'Female', 1: 'Male'}, 'SeniorCitizen': {0: '0', 1: '1'}, 'Partner': {0: 'No', 1: 'Yes'}, 'Dependents': {0: 'No', 1: 'Yes'}, 'PhoneService': {0: 'No', 1: 'Yes'}, 'MultipleLines': {0: 'No', 1: 'No phone service', 2: 'Yes'}, 'InternetService': {0: 'DSL', 1: 'Fiber optic', 2: 'No'}, 'OnlineSecurity': {0: 'No', 1: 'No internet service', 2: 'Yes'}, 'OnlineBackup': {0: 'No', 1: 'No internet service', 2: 'Yes'}, 'DeviceProtection': {0: 'No', 1: 'No internet service', 2: 'Yes'}, 'TechSupport': {0: 'No', 1: 'No internet service', 2: 'Yes'}, 'StreamingTV': {0: 'No', 1: 'No internet service', 2: 'Yes'}, 'StreamingMovies': {0: 'No', 1: 'No internet service', 2: 'Yes'}, 'Contract': {0: 'Month-to-month', 1: 'One year', 2: 'Two year'}, 'PaperlessBilling': {0: 'No', 1: 'Yes'}, 'PaymentMethod': {0: 'Bank transfer (automatic)', 1: 'Credit card (automatic)', 2: 'Electronic check', 3: 'Mailed check'}, 'Churn': {0: 'No', 1: 'Yes'}}
      gender  SeniorCitizen  Partner  Dependents  tenure  Pho

## Now we will implement the different functions, starting from the math logic that is required