In [None]:
# Import libraries and set notebook-wide plotting defaults
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sb
sb.set_style('darkgrid')
# Default figure size (inches) for every plot that does not set its own
rcParams['figure.figsize'] = 8,8
import warnings
# NOTE(review): this silences ALL warnings for the whole notebook, including
# pandas FutureWarning/SettingWithCopyWarning messages -- consider narrowing it.
warnings.filterwarnings("ignore")
%matplotlib inline
import tensorflow as tf

In [None]:
# Load the training set, the test set and the sample submission template
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('Train.csv', 'Test.csv', 'SampleSubmission.csv')
)

In [None]:
# Preview the first rows of the training data
train.head()

In [None]:
# Preview the last rows of the training data
train.tail()

In [None]:
# (rows, columns) of the training data
train.shape

In [None]:
# View the column dtypes and non-null counts of the training data
train.info()

In [None]:
# Preview the first rows of the test data
test.head()

In [None]:
# Preview the last rows of the test data
test.tail()

In [None]:
# (rows, columns) of the test data
test.shape

In [None]:
# View the column dtypes and non-null counts of the test data
test.info()

In [None]:
# Preview the layout of the sample submission file
submission.head()

In [None]:
# (rows, columns) of the sample submission file
submission.shape

In [None]:
#Check how many levels are involved in each of the categorical features (object)

In [None]:
# Print the raw level counts for REGION, then chart each level's share
print(train['REGION'].value_counts())
fig, ax = plt.subplots(figsize=(10, 5))
train['REGION'].value_counts(normalize=True).plot(kind='bar', ax=ax)
# normalize=True plots fractions rather than counts, so label the axis accordingly
ax.set_ylabel('proportion')
ax.set_xlabel('REGION');

In [None]:
# Print the raw level counts for TENURE, then chart each level's share
print(train['TENURE'].value_counts())
fig, ax = plt.subplots(figsize=(10, 5))
train['TENURE'].value_counts(normalize=True).plot(kind='bar', ax=ax)
# normalize=True plots fractions rather than counts, so label the axis accordingly
ax.set_ylabel('proportion')
ax.set_xlabel('TENURE');

In [None]:
# Print the raw level counts for MRG, then chart each level's share
print(train['MRG'].value_counts())
fig, ax = plt.subplots(figsize=(10, 5))
train['MRG'].value_counts(normalize=True).plot(kind='bar', ax=ax)
# normalize=True plots fractions rather than counts, so label the axis accordingly
ax.set_ylabel('proportion')
ax.set_xlabel('MRG');

In [None]:
# Probably not the best way to visualize this: every TOP_PACK level gets its own bar
print(train['TOP_PACK'].value_counts())
fig, ax = plt.subplots(figsize=(10, 5))
train['TOP_PACK'].value_counts(normalize=True).plot(kind='bar', ax=ax)
# normalize=True plots fractions rather than counts, so label the axis accordingly
ax.set_ylabel('proportion')
ax.set_xlabel('TOP_PACK');

In [None]:
# Check whether the target class (CHURN) is balanced
print(train['CHURN'].value_counts())
fig, ax = plt.subplots(figsize=(10, 5))
train['CHURN'].value_counts(normalize=True).plot(kind='bar', ax=ax)
# normalize=True plots fractions rather than counts, so label the axis accordingly
ax.set_ylabel('proportion')
ax.set_xlabel('Churn');

In [None]:
# Summary statistics for the int64/float64 columns, one row per column
train.select_dtypes(include=['int64', 'float64']).describe().transpose()

In [None]:
# Count the missing values in each column of the training data
train.isna().sum()

In [None]:
# Count the missing values in each column of the test data
test.isna().sum()

In [None]:
#We will drop TOP_PACK here; REGION and MRG are excluded later, when the feature matrix is built
#We will also replace the missing values in the numerical columns with their means (averages)

In [None]:
# Drop TOP_PACK from the training data. Reassigning (instead of inplace=True)
# with errors='ignore' keeps the cell safe to re-run: a second run is a no-op
# rather than a KeyError on the already-removed column.
train = train.drop(columns=['TOP_PACK'], errors='ignore')

In [None]:
# Preview the training data after dropping TOP_PACK
train.head()

In [None]:
# Drop TOP_PACK from the test data. Reassigning (instead of inplace=True)
# with errors='ignore' keeps the cell safe to re-run: a second run is a no-op
# rather than a KeyError on the already-removed column.
test = test.drop(columns=['TOP_PACK'], errors='ignore')

In [None]:
# Preview the test data after dropping TOP_PACK
test.head()

In [None]:
#Fill NAs for train data

In [None]:
# Mean-impute the numeric columns that contain missing values.
# The result is assigned back to the frame: the previous
# `train[col].fillna(..., inplace=True)` form is chained in-place assignment,
# which pandas deprecates and which leaves `train` untouched under Copy-on-Write.
train_fill_cols = [
    'MONTANT', 'FREQUENCE_RECH', 'REVENUE', 'ARPU_SEGMENT', 'FREQUENCE',
    'DATA_VOLUME', 'ON_NET', 'ORANGE', 'TIGO', 'ZONE1', 'ZONE2',
    'FREQ_TOP_PACK',
]
# fillna with a Series fills each column with the value stored under its name
train[train_fill_cols] = train[train_fill_cols].fillna(train[train_fill_cols].mean())

In [None]:
# Preview the training data after mean imputation
train.head()

In [None]:
# Re-count missing values per column in the training data after imputation
train.isnull().sum()

In [None]:
#Fill NAs for test data

In [None]:
# Mean-impute the numeric columns of the test data.
# The result is assigned back to the frame: the previous
# `test[col].fillna(..., inplace=True)` form is chained in-place assignment,
# which pandas deprecates and which leaves `test` untouched under Copy-on-Write.
test_fill_cols = [
    'MONTANT', 'FREQUENCE_RECH', 'REVENUE', 'ARPU_SEGMENT', 'FREQUENCE',
    'DATA_VOLUME', 'ON_NET', 'ORANGE', 'TIGO', 'ZONE1', 'ZONE2',
    'FREQ_TOP_PACK',
]
# Use the TRAINING means so the test set is preprocessed with statistics learned
# from training data only (as the scaler does later). Mean imputation does not
# change a column's mean, so this is the same whether or not `train` has
# already been imputed.
test[test_fill_cols] = test[test_fill_cols].fillna(train[test_fill_cols].mean())

In [None]:
# Preview the test data after mean imputation
test.head()

In [None]:
# Re-count missing values per column in the test data after imputation
test.isnull().sum()

## Machine Learning

In [None]:
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score,confusion_matrix,recall_score,precision_recall_curve, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [None]:
# Separate the target from the features and align the test set with them.
dropcols = ['user_id', 'CHURN', 'MRG', 'REGION']
y = train['CHURN']
x = train.drop(columns=dropcols)
# Remove from `test` whichever of the same non-feature columns it contains.
# Previously only user_id was dropped, so MRG and REGION stayed in `test` but
# not in `x`, leaving the two frames with different (and non-numeric) columns
# at prediction time.
test = test.drop(columns=[col for col in dropcols if col in test.columns]) #you will use this for predicting and submitting the resulting
print(x.shape)
print(y.shape)
print(test.shape)

In [None]:
#Split training data into train and test split

In [None]:
# Hold out half of the labelled rows as a test split (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.5,
    random_state=5,
)
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

In [None]:
#Further split X_train and y_train into train and validation sets

In [None]:
# Move 30% of the current training split into a validation split (seeded).
# NOTE: X_train/y_train are overwritten with the remaining 70%, so re-running
# this cell on its own shrinks them again -- re-run the previous split first.
X_train,X_val,y_train,y_val = train_test_split(X_train,y_train,test_size = 0.3, random_state=5)

In [None]:
# Report the feature/target shapes of each split, separated by a "+++++++" rule
split_report = (
    ("train", X_train, y_train),
    ("test", X_test, y_test),
    ("validation", X_val, y_val),
)
for position, (label, features, target) in enumerate(split_report):
    if position > 0:
        print("+"*7)
    print(label)
    print(features.shape)
    print(target.shape)

In [None]:
#Standardize numeric columns

In [None]:
# Numeric feature columns that get standardized
num_cols = [
    'MONTANT',
    'FREQUENCE_RECH',
    'REVENUE',
    'ARPU_SEGMENT',
    'FREQUENCE',
    'DATA_VOLUME',
    'ON_NET',
    'ORANGE',
    'TIGO',
    'ZONE1',
    'ZONE2',
    'REGULARITY',
    'FREQ_TOP_PACK',
]

In [None]:
# Scaler shared by every split; it is fitted on the training split only
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training split and replace its numeric columns with the scaled values.
# NOTE: overwrites X_train, so re-running this cell re-fits on already-scaled data.
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])

In [None]:
# Preview the scaled training features
X_train.head()

In [None]:
# Scale the held-out split with the statistics learned from the training split (no re-fit)
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [None]:
# Preview the scaled held-out features
X_test.head()

In [None]:
# Scale the submission test set with the statistics learned from the training split (no re-fit)
test[num_cols] = scaler.transform(test[num_cols])

In [None]:
# Preview the scaled submission test set
test.head()

In [None]:
# Scale the validation split with the statistics learned from the training split (no re-fit)
X_val[num_cols] = scaler.transform(X_val[num_cols])

In [None]:
# Preview the scaled validation features
X_val.head()

In [None]:
#Encode the TENURE column

In [None]:
# Fit the encoder on the training split's TENURE values and replace them with integer codes.
# NOTE(review): scikit-learn documents LabelEncoder for target labels; OrdinalEncoder
# is the feature-side equivalent. The codes follow sorted label order, which may not
# match the real ordering of the tenure bands -- confirm this is acceptable.
encoder = LabelEncoder()
X_train["TENURE"] = encoder.fit_transform(X_train["TENURE"])

In [None]:
# Encode TENURE in the held-out split with the mapping learned from the training split
X_test["TENURE"] = encoder.transform(X_test["TENURE"])

In [None]:
# Encode TENURE in the validation split with the mapping learned from the training split
X_val['TENURE'] = encoder.transform(X_val["TENURE"])

In [None]:
# Encode TENURE in the submission test set with the mapping learned from the training split.
# NOTE(review): LabelEncoder.transform raises ValueError for labels it did not see
# during fit -- verify the test set has no TENURE level missing from X_train.
test['TENURE'] = encoder.transform(test["TENURE"])

In [None]:
# Preview the training features after scaling and TENURE encoding
X_train.head()

In [None]:
##Neural network model (Keras dense classifier)

In [None]:
# Feed-forward binary classifier: two 160-unit ReLU layers and one sigmoid output.
# `shape` has to be a tuple -- `(14)` is just the int 14 -- and taking the width
# from X_train keeps the input layer in sync with the feature matrix.
inputs = tf.keras.Input(shape=(X_train.shape[1],))
x1 = tf.keras.layers.Dense(160, activation='relu')(inputs)
x2 = tf.keras.layers.Dense(160, activation='relu')(x1)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x2)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Binary cross-entropy loss, with AUC reported as the training/validation metric
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC(name='auc')])

batch_size = 300
epochs = 10

# Fit the model. Validation uses the dedicated X_val/y_val split (already scaled
# and encoded) instead of carving another 20% out of X_train via
# validation_split, which left that prepared split unused.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
)

In [None]:
#Evaluate the trained model on the held-out X_test data (reports the loss and the compiled AUC metric)
#randpred = randmodel.predict(X_test)

model.evaluate(X_test, y_test)

In [None]:
#print("Accuracy")
#accuracy_score(y_test, randpred)

In [None]:
#print("Recall")
#recall_score(y_test, randpred)

In [None]:
#print("F1 Score")
#f1_score(y_test, randpred)

In [None]:
#confusion_matrix(y_test, randpred)

In [None]:
# Making a submission

In [None]:
# Reminder of the submission template layout before filling it in
submission.head()

In [None]:
# Final look at the preprocessed test features before predicting
test.head()

In [None]:
# Model outputs for the submission test set; the output layer is a single sigmoid
# unit, so each prediction is a value between 0 and 1
subpred = model.predict(test)

In [None]:
# Inspect the raw predictions
subpred

In [None]:
# Write the predictions into the CHURN column of the submission frame.
# NOTE(review): this relies on submission rows being in the same order as the
# rows of `test` -- presumably true for the provided files; confirm.
submission["CHURN"] = subpred

In [None]:
# Preview the submission with the CHURN column filled in
submission.head()

In [None]:
#submission.to_csv('submission_A.csv', index=False)

In [None]:
# 1. Do more feature engineering
# 2. Handle the imbalanced nature of the target class (CHURN)
# 3. Use other algorithms
# 4. Tune hyperparameters of this model
# 5. Handle missing values properly
# 6. Any other thing you feel can improve the performance of the model is good to go


### Good Luck !!!