## Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Data Loading and Exploration

#### Loading the data

In [None]:
# Absolute path inside the Kaggle input directory; adjust it when running elsewhere.
path = '/kaggle/input/sunbase-data-ml-intern-assignment/customer_churn_large_dataset.csv'
data = pd.read_csv(path)
# Preview the first rows as the cell output.
data.head()

#### Summary statistics

In [None]:
# Summary statistics of the dataset's numeric columns, to get a feel for the min/max/variation of the features.

data.describe()

#### Data-types of columns

In [None]:
# information regarding datatypes of columns in dataset

data.info()

#### CHECKING FOR NULL VALUES: Since there are no missing values, we do not need any imputation (mean, mode, etc.).

In [None]:
# Missing-value check: visualise the null mask, then print the null count per column.
null_mask = data.isnull()

sns.heatmap(null_mask, yticklabels=False, cbar=False, cmap='YlGnBu')

print(null_mask.sum())

#### Dataset shape

In [None]:
data.shape

#### Dataset HISTOGRAM PLOTS: The histograms show no apparent outliers, so no outlier handling is needed.

In [None]:
data.hist(figsize=(8,8))
plt.show()

#### Number of unique values in each column

In [None]:
print(len(pd.unique(data['Location'])))

In [None]:
# Distinct locations in order of first appearance. pd.unique does this in one
# vectorised pass instead of a Python-level loop with a list membership test
# for every row.
locations = list(pd.unique(data['Location']))
print(locations)

In [None]:
# no. of unique values in each column

data.nunique()

## Data Preparation

#### CustomerID and Name do not influence the customer churn, so they can be dropped from our dataset

In [None]:
## Dropping unnecessary columns

# errors='ignore' keeps this cell re-runnable: once the columns are gone, a
# second execution is a no-op instead of raising KeyError.
data = data.drop(columns=['CustomerID', 'Name'], errors='ignore')

In [None]:
data.info()

In [None]:
data.head()

#### Percentage of people churned by location

In [None]:
## Churn percentage for each location

# One loop instead of one copy-pasted stanza per city. The cities are listed
# explicitly so the printed lines keep their order.
for location in ('Los Angeles', 'New York', 'Miami', 'Chicago', 'Houston'):
    # Churn labels of the customers living in this location.
    churn_in_location = data.Churn[data.Location == location]
    customer_count = churn_in_location.count()
    churned_count = churn_in_location[churn_in_location == 1].count()
    print(f"Percent of People Who Churned from {location} --->", churned_count*100/customer_count,'%')

#### Percentage of people churned by gender

In [None]:
## Churn percentage for each gender

# One loop instead of a copy-pasted stanza per gender. Each entry pairs the
# value stored in the Gender column with the plural used in the printed line.
for gender, plural in (('Male', 'Males'), ('Female', 'Females')):
    # Churn labels of the customers with this gender.
    churn_for_gender = data.Churn[data.Gender == gender]
    customer_count = churn_for_gender.count()
    churned_count = churn_for_gender[churn_for_gender == 1].count()
    print(f"Percent of {plural} Who Churned --->", churned_count*100/customer_count,'%')

#### 1. Gender is categorical data, so we apply label encoding.

#### 2. There are 5 unique locations --- 'Los Angeles', 'New York', 'Miami', 'Chicago', 'Houston' ---- so locations can also be label encoded.

In [None]:
from sklearn.preprocessing import StandardScaler,LabelEncoder

#### Label Encoding on Gender and Location columns

In [None]:
# Converting categorical features into numerical features using LabelEncoder.
# The fitted encoders are kept (rather than thrown away) so the integer code
# of each category can be looked up later, e.g.
# label_encoders['Location'].classes_, when building inputs by hand.
label_encoders = {}
for column in ('Gender', 'Location'):
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

In [None]:
# NOTE(review): disabled alternative to the label encoding used in this
# notebook. One-hot encoding adds one column per category, so re-enabling it
# would change the number of feature columns the later cells work with.

# One-hot encoding on Gender Column


#data_encoded = pd.get_dummies(data, columns=['Gender'], prefix=['Gender'])
#data_encoded.head()



# One-hot encoding on Location column

#data_encoded = pd.get_dummies(data_encoded, columns=['Location'], prefix=['Location'])
#data_encoded.head()

In [None]:
data.info()

#### HeatMap Analysis to check for correlation among the columns

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
correlations = data.corr()
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(data=correlations, annot=True, cmap="GnBu", ax=ax)

## Split dataset into train-test

In [None]:
data.info()

In [None]:
data

In [None]:
# shuffle all rows

data_encoded = data.sample(frac=1, random_state=42)

In [None]:
data_encoded.shape

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict, cross_val_score, train_test_split

# Separate the target column from the feature columns.
y = data_encoded['Churn']
X = data_encoded.drop(columns='Churn')

# Hold out 20% of the rows for testing; the remaining 80% is used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

# Feature scaling is currently disabled.
#from sklearn.preprocessing import StandardScaler
#ss = StandardScaler()
#X_train = ss.fit_transform(X_train)
#X_test = ss.transform(X_test)

## Building ML models

### 1. Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# A fixed random_state makes the fitted tree, and therefore the reported
# accuracy, repeatable across runs.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)
y_pred = dtree.predict(X_test)
print("Accuracy Score :", accuracy_score(y_test, y_pred)*100, "%")

### 2. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the bootstrap samples and feature subsets, and
# therefore the reported accuracy, repeatable across runs.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
print("Accuracy Score :", accuracy_score(y_test, y_pred)*100, "%")

### 3. Support Vector Machine

In [None]:
# Import the class directly instead of the `svm` module: binding the classifier
# to the name `svm` then no longer shadows the module, so this cell can be
# re-run without an AttributeError.
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
print("Accuracy Score :", accuracy_score(y_test, y_pred)*100, "%")

### 4. MLP

In [None]:
X_train.shape[1]

In [None]:
import tensorflow as tf
from tensorflow import keras





# Custom accuracy metric
def custom_accuracy(y_true, y_pred):
    """Binary accuracy computed on predictions rounded to 0/1.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted scores; they are rounded before the comparison.

    Returns:
        The result of ``tf.keras.metrics.binary_accuracy`` on the rounded scores.
    """
    return tf.keras.metrics.binary_accuracy(y_true, tf.round(y_pred))

# Custom precision metric
def custom_precision(y_true, y_pred):
    """Precision of the rounded predictions: TP / (TP + FP).

    Args:
        y_true: ground-truth 0/1 labels (any numeric dtype).
        y_pred: predicted scores; they are rounded to 0/1 before counting.

    Returns:
        Scalar tensor with the precision. A Keras epsilon in the denominator
        avoids division by zero when nothing is predicted positive.
    """
    # Flatten both tensors so a (n,) label vector and a (n, 1) prediction
    # column are multiplied element-wise instead of broadcasting to (n, n).
    y_pred_binary = tf.reshape(tf.round(y_pred), [-1])
    # Integer labels cannot be multiplied with float predictions; cast first.
    y_true = tf.reshape(tf.cast(y_true, y_pred_binary.dtype), [-1])
    true_positives = tf.math.reduce_sum(y_true * y_pred_binary)
    predicted_positives = tf.math.reduce_sum(y_pred_binary)
    return true_positives / (predicted_positives + tf.keras.backend.epsilon())

# Custom recall metric
def custom_recall(y_true, y_pred):
    """Recall of the rounded predictions: TP / (TP + FN).

    Args:
        y_true: ground-truth 0/1 labels (any numeric dtype).
        y_pred: predicted scores; they are rounded to 0/1 before counting.

    Returns:
        Scalar tensor with the recall. A Keras epsilon in the denominator
        avoids division by zero when there are no positive labels.
    """
    # Flatten both tensors so a (n,) label vector and a (n, 1) prediction
    # column are multiplied element-wise instead of broadcasting to (n, n).
    y_pred_binary = tf.reshape(tf.round(y_pred), [-1])
    # Integer labels cannot be multiplied with float predictions; cast first.
    y_true = tf.reshape(tf.cast(y_true, y_pred_binary.dtype), [-1])
    true_positives = tf.math.reduce_sum(y_true * y_pred_binary)
    actual_positives = tf.math.reduce_sum(y_true)
    return true_positives / (actual_positives + tf.keras.backend.epsilon())

# Custom F1-score metric
def custom_f1(y_true, y_pred):
    """F1 score: harmonic mean of custom_precision and custom_recall.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted scores, passed unchanged to both helper metrics.

    Returns:
        2 * P * R / (P + R + epsilon), where epsilon is the Keras backend epsilon.
    """
    p = custom_precision(y_true, y_pred)
    r = custom_recall(y_true, y_pred)
    numerator = p * r
    denominator = p + r + tf.keras.backend.epsilon()
    return 2 * (numerator / denominator)

In [None]:
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



# Seed the Python, NumPy and backend random generators so that weight
# initialisation and batch shuffling are repeatable between runs.
keras.utils.set_random_seed(42)

# Small fully connected network: two hidden ReLU layers and a sigmoid output.
model = keras.Sequential([
    keras.layers.Input(shape=(X_train.shape[1],)),   # one input per feature column
    keras.layers.Dense(8, activation='relu'),
    keras.layers.Dense(4, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')      # Output layer with 1 neuron and sigmoid activation
])

learning_rate = 0.0001
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Compile the model
# model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=[custom_accuracy, custom_precision, custom_recall, custom_f1])
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train the model on the training data; validation_split=0.1 sets aside 10% of
# it for validation.
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Predict churn on the test data. The sigmoid output layer yields one score in
# [0, 1] per row, not a class label.
y_pred = model.predict(X_test)

In [None]:
# Calculate and print accuracy score.
# y_pred holds sigmoid scores in [0, 1]. Casting them straight to int truncates
# every value below 1.0 to 0, so the scores are thresholded at 0.5 first and
# flattened to one label per row.
y_pred_labels = (np.asarray(y_pred) >= 0.5).astype(int).ravel()

accuracy = accuracy_score(np.asarray(y_test, dtype = int), y_pred_labels)
print("Accuracy Score:", accuracy * 100, "%")

In [None]:
#import pickle
#pickle_out = open("model.pkl", "wb")
#pickle.dump(model, pickle_out)
#pickle_out.close()

In [None]:
model.save("model.h5")

In [None]:
# Load from the same relative path the model was saved to ("model.h5"), so the
# round trip does not depend on the working directory being /kaggle/working.
model = tf.keras.models.load_model("model.h5")

In [None]:
# Hand-built example customer. Gender and Location are the integer codes
# produced by the label encoding step, not the original strings.
Age = 24
Gender = 1
Location = 2
Subscription_Length_Months = 21
Monthly_Bill = 1000
Total_Usage_GB = 1200

# Build an explicit (1, n_features) float array: nested Python lists are not
# accepted as model input by every Keras version.
# NOTE(review): assumes this order matches the column order of X_train - confirm.
features = np.array(
    [[Age, Gender, Location, Subscription_Length_Months, Monthly_Bill, Total_Usage_GB]],
    dtype=np.float32,
)

prediction = model.predict(features)

In [None]:
# Threshold the sigmoid score at 0.5. A bare int() cast would truncate every
# score below 1.0 to 0, whatever the model predicted.
int(prediction[0][0] >= 0.5)

In [None]:
# Turn the predicted score into a class label using a 0.5 cut-off.
result = 1 if prediction[0][0] >= 0.5 else 0
result