# Import Libraries

In [None]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows',100)
pd.set_option('display.max_rows',100)
%matplotlib inline
sns.set()

In [None]:
# read the data
# The data folder is named once so the notebook can be pointed at another
# location by editing a single line.
DATA_DIR = '/content/drive/MyDrive/data'
Test_data = pd.read_csv(f'{DATA_DIR}/Telecom_Test.csv')
Train_data = pd.read_csv(f'{DATA_DIR}/Telecom_Train.csv')

# EDA

In [None]:
df_train=Test_data.copy()

In [None]:
df_test=Train_data.copy()

In [None]:
# Preview the first rows of df_train.
df_train.head()

In [None]:
# Display df_test (pandas truncates the output according to the display options).
df_test

In [None]:
# Drop the 'Unnamed: 0' column from both frames.
# errors='ignore' keeps this cell re-runnable: a second execution (or a CSV
# that has no such column) is a no-op instead of a KeyError.
df_train = df_train.drop(columns='Unnamed: 0', errors='ignore')
df_test = df_test.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
df_train.info()

In [None]:
# check duplicated values
df_train.duplicated().sum()

In [None]:
# Check null values
df_train.isnull().sum()

In [None]:
# plot the graph with null values
sns.heatmap(df_train.isnull())
plt.show()

In [None]:
# Use describe function to generate descriptive statistics
df_train.describe()

In [None]:
# Lets see unique values in each variable
df_train.nunique().sort_values()

In [None]:
df_train['area_code'].value_counts()

In [None]:
df_train.dtypes.sort_values()

In [None]:
df_train['churn'].value_counts()*100/len(df_train)


### Correlation


In [None]:
# Correlation is computed on the numeric columns only.
df_train_numeric = df_train.select_dtypes(include='number')
corr_matrix = df_train_numeric.corr()

# Annotated heatmap of the pairwise correlations.
fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap of Continuous Variables')
plt.show()


##### As we can see, there is multicollinearity. Let's drop the columns 'total_intl_minutes', 'total_night_minutes', 'total_eve_minutes', 'total_day_minutes', and 'number_vmail_messages'.

In [None]:
df_train.drop(['total_intl_minutes', 'total_night_minutes', 'total_eve_minutes', 'total_day_minutes', 'number_vmail_messages'], axis=1, inplace=True)
df_test.drop(['total_intl_minutes', 'total_night_minutes', 'total_eve_minutes', 'total_day_minutes', 'number_vmail_messages'], axis=1, inplace=True)

In [None]:
df_train.shape

## Univariate analysis

In [None]:
# Pie chart of the churn label counts.
Churn = df_train['churn'].value_counts(sort=True)
colorss = ["pink", "blue"]
fig, ax = plt.subplots()
ax.pie(
    Churn,
    labels=Churn.index.values,
    explode=[0, 0.2],  # offsets the second wedge; expects exactly two labels
    colors=colorss,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
ax.set_title('%Churn in Training Data')
plt.show()

In [None]:
# Numeric columns whose distributions are inspected below.
num_var = df_train[['account_length', 'total_day_calls', 'total_day_charge',
          'total_eve_calls', 'total_eve_charge', 'total_night_calls',
          'total_night_charge', 'total_intl_calls', 'total_intl_charge',
          'number_customer_service_calls']]


def distplots(col, data=None):
    """Draw and show the distribution plot of a single column.

    Parameters
    ----------
    col : str
        Name of the column to plot.
    data : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level
        ``num_var`` frame, which is what existing one-argument calls use.
    """
    frame = num_var if data is None else data
    sns.displot(frame[col])
    plt.show()


# One distribution plot per column of num_var.
for col in num_var.columns:
    distplots(col)


In [None]:
sns.boxplot(y='total_night_charge', data=df_train)

In [None]:
sns.boxplot(y='total_intl_calls', data=df_train)

### Boxplot

In [None]:
plt.figure(figsize=(25,6))
sns.boxplot(data=num_var)

##### There are outliers; we can use the capping approach

In [None]:
# Data wrangling
# Rows whose churn label is 'yes'; the filter is computed once and reused.
churn_df = df_train.loc[df_train['churn'] == 'yes']
print('No. of customers churning:', churn_df['churn'].count())
print('No of unique account_length:', df_train['account_length'].nunique())

churn_df

In [None]:
# % of total customer churning train data
print('Total no. of customer:', df_train.churn.count())
perc_churn = (len(churn_df)/len(df_train))*100
print(f'percentage od custonmer churning:{round(perc_churn,2)}%')

In [None]:
print('No. of customers churning:', df_test[df_test['churn']=='yes'].churn.count())
churn_df_test = df_test[df_test['churn']=='yes']

In [None]:
# % of total customer churning test data
print('Total no. of customer:', df_test.churn.count())
perc_churn_test = (len(churn_df_test)/len(df_test))*100
print(f'percentage od custonmer churning:{round(perc_churn_test,2)}%')

In [None]:
# binary categorical variables
cat_var = ['international_plan','voice_mail_plan','churn']

In [None]:
df_train[cat_var] = df_train[cat_var].replace({'yes':1, 'no':0})

In [None]:
df_test[cat_var] = df_test[cat_var].replace({'yes':1, 'no':0})

In [None]:
# Extract numeric part of the area code
df_train['area_code'] = df_train['area_code'].str.split('_').str[-1]
df_test['area_code'] = df_test['area_code'].str.split('_').str[-1]

In [None]:
# convert object to integer type
df_train['area_code']=df_train['area_code'].astype(int)
df_test['area_code'] =df_test['area_code'] .astype(int)


In [None]:
# Apply one-hot encoding to the categorical column
df_train_encoded = pd.get_dummies(df_train, columns=['state'], prefix='object')
df_test_encoded = pd.get_dummies(df_test, columns=['state'], prefix='object')



In [None]:
df_train_encoded = df_train_encoded.astype(int)
df_test_encoded = df_test_encoded.astype(int)

In [None]:
df_train_encoded.shape

In [None]:
df_test_encoded.shape

# Model Building

In [None]:
# split the data
X_train = df_train_encoded.drop('churn',axis=1)
Y_train = df_train_encoded['churn']
X_test = df_test_encoded.drop('churn', axis = 1)
Y_test = df_test_encoded['churn']

In [None]:
X_train.shape, X_test.shape

In [None]:
Y_train.shape, Y_test.shape

### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_train_scaled =scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [None]:
X_train_df =pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [None]:
X_test_df

### Feature Selection

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a default random forest on the scaled training features
# (fit() returns the fitted estimator itself).
rf_label = RandomForestClassifier().fit(X_train_df, Y_train)

# One row per feature with its importance, most important first.
df1 = pd.DataFrame(
    {"Feature": X_train_df.columns, "RF_importance": rf_label.feature_importances_}
).sort_values(by="RF_importance", ascending=False)

print(df1)


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
GB_label = GradientBoostingClassifier()

# Fit the classifier to the training data
GB_label.fit(X_train_df, Y_train)

# Create a DataFrame to store the gradient-boosting feature importances
df2 = pd.DataFrame({"Feature": X_train_df.columns, "GB_importance": GB_label.feature_importances_})

# Sort the gradient-boosting table itself by importance, descending
df2 = df2.sort_values(by="GB_importance", ascending=False)

# Display the DataFrame
print(df2)

#### Per RF and GB, 'area_code' is one of the less important features

In [None]:
# imbalance treatment for train data
import imblearn
from imblearn.over_sampling import SMOTE
# Fixed seed so the synthetic samples — and everything derived from them —
# are the same on every run.
smote = SMOTE(random_state=42)
x_smote, y_smote = smote.fit_resample(X_train_df, Y_train)
print('Original Value Counts:', Y_train.value_counts())
print('********** SMOTE Method********')
print("After Smote Value Count:", y_smote.value_counts())

In [None]:
# imbalance treatment for test data
# NOTE(review): x_smote_test / y_smote_test are not referenced by any later
# cell visible in this notebook. Oversampling held-out data would also distort
# evaluation metrics if it were used — confirm whether this cell is needed.
import imblearn
from imblearn.over_sampling import SMOTE
smote = SMOTE()
# Resample the scaled test features and labels, then print the label counts
# before and after resampling.
x_smote_test, y_smote_test = smote.fit_resample(X_test_df,Y_test)
print('Original Value Counts:', Y_test.value_counts())
print('********** SMOTE Method********')
print("After Smote Value Count:", y_smote_test.value_counts())

# Traditional Machine Learning Algorithms (RandomForest, XGBoost)

# Advanced Techniques (Deep Learning Algorithm, H2O.AI Auto Method - automation method)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, Y_train)
y_pred_train = rf.predict(X_train)
y_pred_test = rf.predict(X_test)


# Evaluation matrix

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
print('****** classification_report*****')
print(classification_report(Y_train,y_pred_train))
print(classification_report(Y_test,y_pred_test))

print('****** accuracy_score ***********')
print(accuracy_score(Y_train,y_pred_train))
print(accuracy_score(Y_test,y_pred_test))




In [None]:
from xgboost import XGBClassifier
XGB = XGBClassifier()
XGB.fit(X_train, Y_train)
y_pred_train_xgb = XGB.predict(X_train)
y_pred_test_xgb = XGB.predict(X_test)


# Evaluation matrix

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
print('****** classification_report*****')
print(classification_report(Y_train,y_pred_train_xgb))
print(classification_report(Y_test,y_pred_test_xgb))

print('****** accuracy_score ***********')
print(accuracy_score(Y_train,y_pred_train_xgb))
print(accuracy_score(Y_test,y_pred_test_xgb))

In [None]:
# CrossValidation method
from sklearn.model_selection import cross_val_score
training_accuracy = cross_val_score(XGB, X_train, Y_train, cv=10)
print(training_accuracy.mean())
print("*********************")
print(training_accuracy.max())

In [None]:
# CrossValidation method
from sklearn.model_selection import cross_val_score
training_accuracy = cross_val_score(rf, X_train, Y_train, cv=10)
print(training_accuracy.mean())
print("*********************")
print(training_accuracy.max())

# Deep Learning Method_MultiLayer Perceptron Method

In [None]:


import tensorflow as tf
from tensorflow import keras
from keras.layers import *
from keras import  Sequential

In [None]:
# Small feed-forward binary classifier: 32 -> 16 -> 1 (sigmoid).
# The input width is taken from the data instead of being hard-coded, so the
# model keeps working if the number of encoded columns changes.
n_features = X_train_df.shape[1]
model = Sequential()
model.add(Dense(32, input_shape=(n_features,), activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train on the standardized features prepared in the Feature Scaling section;
# gradient-based training is sensitive to features on very different scales.
# NOTE(review): the test split doubles as validation data here, so the
# reported validation accuracy is not an independent estimate.
model.fit(X_train_df, Y_train, batch_size=32, epochs=100, validation_data=(X_test_df, Y_test))

# H2O AutoML Model

In [None]:
!pip install requests
!pip install tabulate
! pip install "colorama>=0.3.8"
! pip install future

In [None]:
! pip install h2o

In [None]:
import h2o
from h2o.automl import H2OAutoML
h2o.init(max_mem_size='16G')

In [None]:
# loading the dataset
dataset = h2o.import_file('/content/drive/MyDrive/data/Telecom_Train.csv')
dataset.head()

In [None]:
df_train, df_test = dataset.split_frame(ratios=[0.8])
df_train

In [None]:
df_test

In [None]:
y ='churn'
x=dataset.columns
x.remove(y)
x.remove('C1')


In [None]:
x

In [None]:
# buikding h2o AutoMl model


In [None]:
aml=H2OAutoML(max_runtime_secs=300, max_models=10, seed=10, verbosity='info', nfolds=2, )

In [None]:
aml.train(x=x,y=y, training_frame=df_train)

In [None]:
lb = aml.leaderboard

In [None]:
lb

In [None]:
model_ids = list(aml.leaderboard['model_id'].as_data_frame().iloc[:,0])

In [None]:
model_ids

In [None]:
aml.leader.model_performance(df_test)

In [None]:
output = h2o.get_model([mid for mid in model_ids if 'StackedEnsemble' in mid][0])

In [None]:
output

In [None]:
aml.leader

In [None]:
y_pred = aml.leader.predict(df_test)

In [None]:
y_pred