
# Task 1 : random forest model
Thawdar Swe Zin (8039276)


## Import libraries

In [1]:
import pandas as pd
import numpy as np
import random
import statistics

## Read in data file

In [2]:
# Reading in dataset and Creating dataframes
churnTest_df = pd.read_csv("customer_churn_dataset-testing-master.csv")
churnTrain_df = pd.read_csv("customer_churn_dataset-training-master.csv")


## Data Preproccessing

### Cleaning null values and unwanted columns

In [3]:
churnTrain_df.dropna(how='any', inplace=True)
churnTrain_df = churnTrain_df.iloc[: , 1:]

# There is no NA in test dataset
churnTest_df = churnTest_df.iloc[: , 1:]

In [4]:
churnTrain_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 440832 entries, 0 to 440832
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Age                440832 non-null  float64
 1   Gender             440832 non-null  object 
 2   Tenure             440832 non-null  float64
 3   Usage Frequency    440832 non-null  float64
 4   Support Calls      440832 non-null  float64
 5   Payment Delay      440832 non-null  float64
 6   Subscription Type  440832 non-null  object 
 7   Contract Length    440832 non-null  object 
 8   Total Spend        440832 non-null  float64
 9   Last Interaction   440832 non-null  float64
 10  Churn              440832 non-null  float64
dtypes: float64(8), object(3)
memory usage: 40.4+ MB


In [5]:
churnTest_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64374 entries, 0 to 64373
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Age                64374 non-null  int64 
 1   Gender             64374 non-null  object
 2   Tenure             64374 non-null  int64 
 3   Usage Frequency    64374 non-null  int64 
 4   Support Calls      64374 non-null  int64 
 5   Payment Delay      64374 non-null  int64 
 6   Subscription Type  64374 non-null  object
 7   Contract Length    64374 non-null  object
 8   Total Spend        64374 non-null  int64 
 9   Last Interaction   64374 non-null  int64 
 10  Churn              64374 non-null  int64 
dtypes: int64(8), object(3)
memory usage: 5.4+ MB


### Converting category to numeric 

In [6]:
# Converting category to binary
contract_categories = churnTrain_df['Contract Length'].unique()
sub_type = churnTrain_df['Subscription Type'].unique()
Age_categories = churnTrain_df['Gender'].unique()

churnTrain_df['Contract Length'].replace(contract_categories,
                        [0, 1 , 2], inplace=True)
churnTrain_df['Subscription Type'].replace(sub_type,
                        [0 , 1 , 2], inplace=True)
churnTrain_df['Gender'].replace(Age_categories,
                        [0 , 1], inplace=True)


churnTest_df['Contract Length'].replace(contract_categories,
                        [0, 1 , 2], inplace=True)
churnTest_df['Subscription Type'].replace(sub_type,
                        [0 , 1 , 2], inplace=True)
churnTest_df['Gender'].replace(Age_categories,
                        [0 , 1], inplace=True)

### Creating a bin range and labels for continuous variables

In [7]:
attribute_to_bin = 'Total Spend' 
# Age is a continuous variable. So, we will be using it.
# Hot encoding is not done after binning because random forest models are non-linear.

churnTrain_df['Total Spend Bin'] = pd.qcut(churnTrain_df['Total Spend'], 5, labels=False) + 1
churnTrain_df['Age Bin'] = pd.qcut(churnTrain_df['Age'], 5, labels=False) + 1

churnTest_df['Total Spend Bin'] = pd.qcut(churnTrain_df['Total Spend'], 5, labels=False) + 1
churnTest_df['Age Bin'] = pd.qcut(churnTrain_df['Age'], 5, labels=False) + 1

# Drop initial column after binning
churnTrain_df.drop('Total Spend', axis=1, inplace=True)
churnTrain_df.drop('Age', axis=1, inplace=True)

churnTest_df.drop('Total Spend', axis=1, inplace=True)
churnTest_df.drop('Age', axis=1, inplace=True)

In [8]:
churnTrain_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 440832 entries, 0 to 440832
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Gender             440832 non-null  int64  
 1   Tenure             440832 non-null  float64
 2   Usage Frequency    440832 non-null  float64
 3   Support Calls      440832 non-null  float64
 4   Payment Delay      440832 non-null  float64
 5   Subscription Type  440832 non-null  int64  
 6   Contract Length    440832 non-null  int64  
 7   Last Interaction   440832 non-null  float64
 8   Churn              440832 non-null  float64
 9   Total Spend Bin    440832 non-null  int64  
 10  Age Bin            440832 non-null  int64  
dtypes: float64(6), int64(5)
memory usage: 56.5 MB


In [9]:
churnTest_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64374 entries, 0 to 64373
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Gender             64374 non-null  int64
 1   Tenure             64374 non-null  int64
 2   Usage Frequency    64374 non-null  int64
 3   Support Calls      64374 non-null  int64
 4   Payment Delay      64374 non-null  int64
 5   Subscription Type  64374 non-null  int64
 6   Contract Length    64374 non-null  int64
 7   Last Interaction   64374 non-null  int64
 8   Churn              64374 non-null  int64
 9   Total Spend Bin    64374 non-null  int64
 10  Age Bin            64374 non-null  int64
dtypes: int64(11)
memory usage: 5.4 MB


### Convert float to int for **standardlization**

In [13]:
# Function to convert float dtype to integers dtype
def converting_float_to_int(df):
    for column in df.columns:
        if df[column].dtype == 'float64':
            df[column] = df[column].astype(np.uint8)
    return df
churnTrain_df = converting_float_to_int(churnTrain_df)

In [14]:
churnTrain_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 440832 entries, 0 to 440832
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype
---  ------             --------------   -----
 0   Gender             440832 non-null  int64
 1   Tenure             440832 non-null  uint8
 2   Usage Frequency    440832 non-null  uint8
 3   Support Calls      440832 non-null  uint8
 4   Payment Delay      440832 non-null  uint8
 5   Subscription Type  440832 non-null  int64
 6   Contract Length    440832 non-null  int64
 7   Last Interaction   440832 non-null  uint8
 8   Churn              440832 non-null  uint8
 9   Total Spend Bin    440832 non-null  int64
 10  Age Bin            440832 non-null  int64
dtypes: int64(5), uint8(6)
memory usage: 38.8 MB


In [15]:
churnTest_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64374 entries, 0 to 64373
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Gender             64374 non-null  int64
 1   Tenure             64374 non-null  int64
 2   Usage Frequency    64374 non-null  int64
 3   Support Calls      64374 non-null  int64
 4   Payment Delay      64374 non-null  int64
 5   Subscription Type  64374 non-null  int64
 6   Contract Length    64374 non-null  int64
 7   Last Interaction   64374 non-null  int64
 8   Churn              64374 non-null  int64
 9   Total Spend Bin    64374 non-null  int64
 10  Age Bin            64374 non-null  int64
dtypes: int64(11)
memory usage: 5.4 MB


## Implement Decision Tree Classifier
This approach employs binary splits to efficiently streamline data processing by utilizing techniques like categorical binning and normalization. Based on the prior task, the Z-score normalization was found to be less beneficial for this decision tree model due to its binary splitting nature.

<strong>The simplified general procedure can be summarized as follows:</strong>

1. Determination of best split <br>
   In this implementation, models will make splits based on various measures of impurity and uncertainty, including but not limited to entropy gain ratio, Gini index, and variance, depending on the specific requirements.

2. Induction using the best splitting function.

3. Repeat until we arrive at the leaf node or satisfy a stopping condition.

### The followings functions have been implemented to fulfill our objectives:

1. A function to compute entropy, Gini index, gain ratio, and information gain.
2. A function to determine the optimal split using Gini index, gain ratio, or information gain as specified through a split criterion parameter.
3. A function for prediction and scoring.
4. Establish a Decision Tree (DT) class and instantiate three DT models with user-defined split criteria using the provided functions.

### Decision trees as classifiers
Employing data preparation techniques like categorical binning and normalization can substantially decrease computational overhead and streamline data processing in this approach. Notably, the decision tree model developed in the prior task does not benefit from z-score normalization, given its inherent binary splitting structure.

Here's a streamlined overview of the general process:

1. Selecting the Optimal Splitting Criterion:
   There are different ways to check how mixed or uncertain our data is, like using variance, the Gini index, or the entropy gain ratio. In our setup, we'll split each model based on what works best for the task, which could be the Gini index, Gini ratio, or information gain.

2. Using the Optimal Splitting Function for Induction.

3. Repeating the process until reaching a stopping criterion or the leaf node.

### To achieve our objectives, the following processes are employed:

1. A function for calculating entropy, the Gini index, gain ratio, and information gain.

2. A unified function for determining the optimal split, capable of handling the Gini index, gain ratio, and information gain based on the specified splitting criterion.

3. A function for prediction and scoring.

4. Implementing the decision tree model within a DT class using the newly created functions. Three instances of the DT model are generated based on the user's input for the split criterion.

#### Calculate Entropy function

In [16]:
def entropy(y):
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    entropy = -np.sum(probabilities * np.log2(probabilities))
    return entropy

#### Create information gain function 

info gain = H(Y) - P(left) * H(Y|left) - P(right) * H(Y|right)

In [17]:
def information_gain(y_parent, y_left, y_right):
    parent_entropy = entropy(y_parent)
    left_entropy = entropy(y_left)
    right_entropy = entropy(y_right)

    parent_weight = len(y_parent) / (len(y_left) + len(y_right))
    left_weight = len(y_left) / len(y_parent)
    right_weight = len(y_right) / len(y_parent)

    information_gain = parent_entropy - (parent_weight * left_entropy) - (parent_weight * right_entropy)
    return information_gain

#### Create gain_ratio function 

gain ratio = information gain / splitInfo <br>
split info = -sum(P(subset) * log2(P(subset)))

1. Compute the information gain and split information for every feature.
2. Contrast the gain ratios for each feature.
3. Choose the feature with the highest gain ratio as the one for splitting.


In [18]:
def gain_ratio(y_parent, y_left, y_right):
    info_gain = information_gain(y_parent, y_left, y_right)
    parent_entropy = entropy(y_parent)

    split_information = entropy(np.concatenate([y_left, y_right]))
    if split_information == 0:
        return 0  # To avoid division by zero

    gain_ratio = info_gain / split_information
    return gain_ratio

#### Create gini index function

In [19]:
def gini_index(y_parent, y_left, y_right):
    parent_gini = gini(y_parent)
    left_gini = gini(y_left)
    right_gini = gini(y_right)

    parent_weight = len(y_parent) / (len(y_left) + len(y_right))
    left_weight = len(y_left) / len(y_parent)
    right_weight = len(y_right) / len(y_parent)

    gini_index = parent_gini - (parent_weight * left_gini) - (parent_weight * right_gini)
    return gini_index

def gini(y):
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    gini = 1 - np.sum(probabilities ** 2)
    return gini

#### Create find_best_split function for info gain, gain ratio and gini index

In [20]:
def find_best_split(X, y, split_criterion='information_gain'):
    best_split_feature = None
    best_split_value = None
    best_gain = -1

    for feature_idx in range(X.shape[1]):
        feature_values = X[:, feature_idx]
        unique_values = np.unique(feature_values)
        for value in unique_values:
            left_indices = feature_values < value
            right_indices = ~left_indices

            if split_criterion == 'information_gain':
                current_gain = information_gain(y, y[left_indices], y[right_indices])
            elif split_criterion == 'gain_ratio':
                current_gain = gain_ratio(y, y[left_indices], y[right_indices])
            elif split_criterion == 'gini_index':
                current_gain = gini_index(y, y[left_indices], y[right_indices])
            else:
                raise ValueError("Invalid split criterion. Supported options are: 'information_gain', 'gain_ratio', and 'gini_index'.")

            if current_gain > best_gain:
                best_gain = current_gain
                best_split_feature = feature_idx
                best_split_value = value

    return best_split_feature, best_split_value

#### Create class to implement all the functions to for a tree and predict

In [21]:
class DecisionTree:
    def __init__(self, split_criterion='information_gain', max_depth=None, min_samples_split=2):
        self.split_criterion = split_criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y, depth=0):
        self.tree = self._build_tree(X, y, depth)

    def predict(self, X):
        if self.tree is None:
            raise ValueError("Model has not been trained yet. Call 'fit' before 'predict'.")

        return np.array([self._predict_sample(sample, self.tree) for sample in X])

    def _build_tree(self, X, y, depth):
        if len(np.unique(y)) == 1:
            # If all labels are the same, return a leaf node with the label
            return np.unique(y)[0]

        if self.max_depth is not None and depth >= self.max_depth:
            # If the maximum depth is reached, return the most common label
            return np.bincount(y).argmax()

        if X.shape[0] < self.min_samples_split:
            # If the number of samples is below the minimum required for splitting, return the most common label
            return np.bincount(y).argmax()

        if X.shape[1] == 0:
            # If there are no features left, return the most common label
            return np.bincount(y).argmax()

        best_split_feature, best_split_value = find_best_split(X, y, self.split_criterion)
        left_indices = X[:, best_split_feature] < best_split_value
        right_indices = ~left_indices

        # Check if left or right subtree has no samples
        if np.all(left_indices) or np.all(right_indices):
            return np.bincount(y).argmax()

        left_subtree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return (best_split_feature, best_split_value, left_subtree, right_subtree)

    def _predict_sample(self, sample, node):
        if not isinstance(node, tuple):
            return node

        split_feature, split_value, left_subtree, right_subtree = node
        if sample[split_feature] < split_value:
            return self._predict_sample(sample, left_subtree)
        else:
            return self._predict_sample(sample, right_subtree)

Create the ensemble model.

We merge three DT models—Info Gain, Gini Index, and Gain Ratio—constructed using diverse splitting criteria, which may lead to longer processing times but improved accuracy. 

The ensemble model will employ a simple voting method to decide the final prediction's outcome.

The ultimate forecast from the ensemble model will be determined by majority vote. For instance, if two DT models and one DT model all predict that an individual will churn (churn 1), the final prediction will also be churn 1 due to the majority consensus.



In [22]:
class DecisionTreeEnsemble:
    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        for model in self.models:
            model.fit(X, y)

    def predict(self, X):
        predictions = [model.predict(X) for model in self.models]
        return self._majority_vote(predictions)

    def _majority_vote(self, predictions):
        final_predictions = np.zeros(predictions[0].shape)
        for pred in predictions:
            final_predictions += pred

        final_predictions = (final_predictions >= (len(self.models) / 2)).astype(int)
        return final_predictions

#### Evaluation function 

In [42]:
def accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred)

def precision(y_true, y_pred):
    true_positive = np.sum((y_true == 1) & (y_pred == 1))
    false_positive = np.sum((y_true == 0) & (y_pred == 1))
    return true_positive / (true_positive + false_positive)

def recall(y_true, y_pred):
    true_positive = np.sum((y_true == 1) & (y_pred == 1))
    false_negative = np.sum((y_true == 1) & (y_pred == 0))
    return true_positive / (true_positive + false_negative)

def f1_score(y_true, y_pred):
    prec = precision(y_true, y_pred)
    rec = recall(y_true, y_pred)
    return 2 * (prec * rec) / (prec + rec)

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)

    acc = accuracy(y_test, y_pred)
    prec = precision(y_test, y_pred)
    rec = recall(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    return acc, prec, rec, f1

#### Implementation of random forest models created

##### Step 1: Extract features and target variable from DataFrames
1.Implementation assumes data is preprocessed

In [23]:
# Convert the DataFrame to numpy arrays
X_train = churnTrain_df.drop('Churn', axis=1)
y_train = churnTrain_df['Churn']
X_test = churnTest_df.drop('Churn', axis=1)
y_test = churnTest_df['Churn']

X_train = churnTrain_df.values
y_train = churnTrain_df.values
X_test = churnTest_df.values
y_test = churnTest_df.values

Step 2: Create and train each of the three Decision Tree models using a unique split criteria.

In [24]:
# Create Decision Tree models
dt_info_gain = DecisionTree(split_criterion='information_gain', max_depth=2, min_samples_split=6)
dt_gain_ratio = DecisionTree(split_criterion='gain_ratio', max_depth=2, min_samples_split=6)
dt_gini_index = DecisionTree(split_criterion='gini_index', max_depth=2, min_samples_split=6)
ensemble = DecisionTreeEnsemble(models=[dt_info_gain, dt_gain_ratio, dt_gini_index])

In [45]:
dt_info_gain.fit(X_train, y_train)
dt_gain_ratio.fit(X_train, y_train)
dt_gini_index.fit(X_train, y_train)
ensemble.fit(X_train, y_train)

In [46]:
models = [dt_info_gain, dt_gain_ratio, dt_gini_index, ensemble]
model_names = ["Information Gain DT", "Gain Ratio DT", "Gini Index DT", "Ensemble"]

for model, name in zip(models, model_names):
    acc, prec, rec, f1 = evaluate_model(model, X_test, y_test)
    print(f"Metrics for {name}:")
    print(f"Accuracy: {acc:.2f}")
    print(f"Precision: {prec:.2f}")
    print(f"Recall: {rec:.2f}")
    print(f"F1-score: {f1:.2f}")
    print()

Metrics for Information Gain DT:
Accuracy: 0.58
Precision: 0.54
Recall: 0.78
F1-score: 0.64

Metrics for Gain Ratio DT:
Accuracy: 0.58
Precision: 0.54
Recall: 0.78
F1-score: 0.64

Metrics for Gini Index DT:
Accuracy: 0.60
Precision: 0.55
Recall: 0.86
F1-score: 0.67

Metrics for Ensemble:
Accuracy: 0.58
Precision: 0.54
Recall: 0.78
F1-score: 0.64



In [47]:
models = [dt_info_gain, dt_gain_ratio, dt_gini_index, ensemble]
model_names = ["Information Gain DT", "Gain Ratio DT", "Gini Index DT", "Ensemble"]

for model, name in zip(models, model_names):
    acc, prec, rec, f1 = evaluate_model(model, X_train, y_train)
    print(f"Metrics for {name}:")
    print(f"Accuracy: {acc:.2f}")
    print(f"Precision: {prec:.2f}")
    print(f"Recall: {rec:.2f}")
    print(f"F1-score: {f1:.2f}")
    print()

Metrics for Information Gain DT:
Accuracy: 0.81
Precision: 1.00
Recall: 0.67
F1-score: 0.80

Metrics for Gain Ratio DT:
Accuracy: 0.81
Precision: 1.00
Recall: 0.67
F1-score: 0.80

Metrics for Gini Index DT:
Accuracy: 0.84
Precision: 0.99
Recall: 0.73
F1-score: 0.84

Metrics for Ensemble:
Accuracy: 0.81
Precision: 1.00
Recall: 0.67
F1-score: 0.80



## Implementing a random forest

I'm using Bagging, a standard technique in Random Forests. It involves training multiple decision trees on random data subsets, and the final prediction is based on averaging (for regression) or majority vote (for classification) from these trees.

The dataset is relatively straightforward and has undergone comprehensive cleaning. Employing a random forest in this context may not yield significant benefits.


In [None]:
class RandomForestFromScratch:
    def __init__(self, decision_tree, n_estimators=100, max_samples=None, max_depth=None, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.decision_tree = decision_tree
        self.estimators = []

    def fit(self, X, y):
        for _ in range(self.n_estimators):
            estimator = self.decision_tree  # Use the provided Decision Tree instance
            if self.max_samples is not None:
                bootstrap_indices = np.random.choice(len(X), size=self.max_samples, replace=True)
                X_bootstrap = X[bootstrap_indices]
                y_bootstrap = y[bootstrap_indices]
                estimator.fit(X_bootstrap, y_bootstrap)
            else:
                estimator.fit(X, y)

            self.estimators.append(estimator)

    def predict(self, X):
        predictions = np.array([estimator.predict(X) for estimator in self.estimators])
        return np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=predictions)


# Create a RandomForest classifier based on the Decision Tree model using Information Gain
rf_gini_index = RandomForestFromScratch(decision_tree=dt_gini_index, n_estimators=5, max_depth=2, min_samples_split=6)

# Fit the model
rf_gini_index.fit(X_train, y_train)

# Make predictions
y_pred_rf_gini_index = rf_gini_index.predict(X_test)

# Evaluate the model using the functions provided
accuracy_rf_gini_index, precision_rf_gini_index, recall_rf_gini_index, f1_rf_gini_index = evaluate_model(rf_gini_index, X_test, y_test)

# Print evaluation results
print("Random Forest Classifier (Gini index):")
print(f"Accuracy: {accuracy_rf_gini_index:.4f}")
print(f"Precision: {precision_rf_gini_index:.4f}")
print(f"Recall: {recall_rf_gini_index:.4f}")
print(f"F1-Score: {f1_rf_gini_index:.4f}")
