# Telecommunication Churn Prediction Model
## Importing all the important libraries

In [None]:
import pandas as pd 
import numpy as np      
import seaborn as sns                 
import matplotlib.pyplot as plt       
%matplotlib inline 
#pip install plotly
import plotly.express as px
plt.rcParams.update(plt.rcParamsDefault)

import warnings  
warnings.filterwarnings("ignore")

## Loading the dataset

In [None]:
# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)

# Read the raw dataset and preview its first three rows.
data = pd.read_csv("data_summer_course.csv")
data.head(3)

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
data.shape

In [None]:
# Array of all column names in the dataset.
data.columns.values

In [None]:
# Data type of each column.
data.dtypes

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Descriptive summary statistics of the dataset.
data.describe()

## Data Preprocessing
### Handling missing values and dropping columns

In [None]:
# Percentage of missing values per column: missing cells / total rows * 100.
null_counts = data.isnull().sum()
row_count = data.shape[0]
percent_of_null_values = (null_counts / row_count) * 100
print(percent_of_null_values)

In [None]:
# Correlation heatmap of the raw dataset.
# Only numeric/boolean columns are correlated: the frame still contains text
# and date columns here, and DataFrame.corr() on such a frame raises in
# pandas >= 2.0 instead of silently skipping them. select_dtypes keeps the
# behaviour the same on every pandas version.
numeric_data = data.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_data.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix Heatmap")
plt.show()

In [None]:
column1 = 'SUMDATAUSG4GCD1_30'
column2 = 'SUMDATAUSGCD1_30'

# Add the 'SUMDATAUSG4GCD1_15' and 'SUMDATAUSG4GCD15_30' columns into one column.
first_part = data['SUMDATAUSG4GCD1_15']
second_part = data['SUMDATAUSG4GCD15_30']
data[column1] = first_part + second_part

# Correlation between the combined column and 'SUMDATAUSGCD1_30'.
correlation1 = data[column1].corr(data[column2])
print(correlation1)

In [None]:
# Remove the combined 4G usage column together with its two source columns.
column_name = [
    'SUMDATAUSG4GCD1_30',
    'SUMDATAUSG4GCD1_15',
    'SUMDATAUSG4GCD15_30',
]
data.drop(columns=column_name, inplace=True)

In [None]:
column1, column2, column3 = 'MAINACTBAL1', 'MAINACTBAL', 'SUBSRIBERLASTBALANCE'

# Correlate MAINACTBAL1 against each of the other two balance columns.
reference_balance = data[column1]
correlation1 = reference_balance.corr(data[column3])
correlation2 = reference_balance.corr(data[column2])
print(correlation1, correlation2)

In [None]:
# Drop MAINACTBAL1 and SUBSRIBERLASTBALANCE (MAINACTBAL stays), then
# summarise the remaining columns.
columns_name = [
    'MAINACTBAL1',
    'SUBSRIBERLASTBALANCE',
]
data.drop(columns=columns_name, inplace=True)
data.info()

In [None]:
# Numerical columns whose missing values are imputed further down.
missing_numerical_var = [
    'SUMVCEREVCD1_30',
    'SUMDATAREVCD1_30',
    'SUMDATAUSGCD1_30',
    'SUMVOICEUSGCD1_30',
]
plt.figure(figsize=(10, 10))
sns.set()

# One distribution plot per column, laid out on a 2x2 grid.
# NOTE(review): sns.distplot is deprecated in seaborn; behaviour is kept as-is here.
for position, var in enumerate(missing_numerical_var, start=1):
    plt.subplot(2, 2, position)
    sns.distplot(
        data[var],
        bins=50,
        kde_kws={'linewidth': 2, 'color': '#DC143C'},
    )
plt.show()

In [None]:
# Numerical columns: build two imputed copies of the data, one filling the
# missing cells with the column mean and one with the column median.
copy_data_mean = data.copy()
copy_data_median = data.copy()

# Summary statistics of the columns about to be imputed.
pp = data[missing_numerical_var].describe()
print(pp)

# Mean imputation: one fill value per column, applied in a single fillna call.
column_means = {col: copy_data_mean[col].mean() for col in missing_numerical_var}
copy_data_mean = copy_data_mean.fillna(value=column_means)

# Median imputation, same approach.
column_medians = {col: copy_data_median[col].median() for col in missing_numerical_var}
copy_data_median = copy_data_median.fillna(value=column_medians)

In [None]:
# Disabled comparison plot, kept as a bare string so it is never executed:
# it overlays the original, mean-imputed and median-imputed distributions.
# NOTE(review): `sns.get()` inside looks like a typo for `sns.set()` — confirm before re-enabling.
'''plt.figure(figsize=(10,10))
sns.get()
for i, var in enumerate(missing_numerical_var):
    plt.subplot(2,2,i+1)
    sns.distplot(data[var], bins=50, kde_kws={'linewidth':2, 'color':'red'}, label = 'original')
    sns.distplot(copy_data_mean[var], bins=50, kde_kws={'linewidth':2, 'color':'green'}, label ='mean')
    sns.distplot(copy_data_median[var], bins=50, kde_kws={'linewidth':2, 'color':'purple'}, label = 'median')
    plt.legend()  
plt.show()   ''' 

In [None]:
# Continue with the median-imputed copy as the working dataset.
data = copy_data_median.copy()

In [None]:
# Snapshot of the full dataset taken before any columns are removed below.
dup_data = data.copy()

# These columns are copied into 'deleted_columns' for later use if needed,
# and then removed from the working DataFrame.
columns_to_extract = [
    'DEVICEDUALSIMFLAG',
    'DEVICEMODELC',
    'DEVICENETWORK',
    'last_app_used',
    'FIRSTCALLDATE',
    'Mobile_Number',
    'FIRSTREVGENEVTDATE',
]
deleted_columns = data[columns_to_extract].copy()
data.drop(columns=columns_to_extract, inplace=True)

In [None]:

# Parse the last-usage and last-recharge dates and fill their missing values
# with the most frequent date of the same column.
# The filled result is assigned back to the column: calling
# fillna(..., inplace=True) on a column selection acts on an intermediate
# object and is not guaranteed to update the DataFrame (it does not under
# pandas Copy-on-Write).
for date_col in ('LASTVCEUSGDATE', 'LASTRECHRGDATE'):
    data[date_col] = pd.to_datetime(data[date_col])
    mode_date = data[date_col].mode()[0]
    data[date_col] = data[date_col].fillna(mode_date)

# This column is only parsed; its missing values are left untouched.
data['LASTREVGENEVTDATE'] = pd.to_datetime(data['LASTREVGENEVTDATE'])

In [None]:
# Frequency of each non-missing value in the handset-change flag column.
data['HANDSETCHANGESFLAGD1_30'].value_counts()

In [None]:
column_name = 'HANDSETCHANGESFLAGD1_30'

# Fill the missing (NaN) values in the column with 0.
# The result is assigned back rather than using inplace=True on the column
# selection, which is not guaranteed to write through to the DataFrame.
data[column_name] = data[column_name].fillna(0)

In [None]:
# Binary label: 1 where CNTCHURND1_30 holds a value, 0 where it is missing.
has_churn_value = data['CNTCHURND1_30'].notna()
data['CHURN_PREDICTION'] = has_churn_value.astype('int64')

In [None]:
# churn has been stored in this series for later use: churn_deleted_col
# An independent copy is taken so that filling the column below cannot alter
# the backup (a plain column selection can share data with the DataFrame).
churn_deleted_col = data['CNTCHURND1_30'].copy()

# Missing churn counts become 0; assigned back instead of a chained inplace fill.
data['CNTCHURND1_30'] = data['CNTCHURND1_30'].fillna(0)

In [None]:
# Rename the zero-filled churn count column to CNTCHURN_FLAG.
data.rename(columns={'CNTCHURND1_30': 'CNTCHURN_FLAG'}, inplace=True)

In [None]:
# Column names, non-null counts and dtypes after the cleaning steps above.
data.info()

In [None]:
# Correlation heatmap after cleaning.
# Restricted to numeric/boolean columns: text and datetime columns are still
# present, and DataFrame.corr() raises on them in pandas >= 2.0.
numeric_data = data.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_data.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix Heatmap")
plt.show()

In [None]:
# Number of missing values left in CNTCHURN_FLAG.
data['CNTCHURN_FLAG'].isnull().sum()

In [None]:
# Proportion of rows for each CNTCHURN_FLAG value.
prop_response = data['CNTCHURN_FLAG'].value_counts(normalize=True)

# Bar plot of those proportions on a single 12x6 axes.
fig, ax = plt.subplots(figsize=(12, 6))
prop_response.plot(kind='bar', ax=ax, color=['darkorange'])
ax.set_title('Proportion of observations of the response variable', fontsize=18, loc='left')
ax.set_xlabel('Churning days ', fontsize=14)
ax.set_ylabel('proportion of observations of churn in next 30 days', fontsize=14)
ax.tick_params(rotation='auto')

# Hide all four frame lines around the plot.
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)
plt.show()

In [None]:
# Share of each CHURN_PREDICTION class as a pie chart.
pie_data = data['CHURN_PREDICTION'].value_counts()
pie_data.plot(kind='pie', autopct='%1.1f%%', startangle=90)

# The title names the plotted column (it was a left-over 'COLUMN_NAME'
# placeholder); the y-axis label is blanked.
plt.title('Pie Chart of CHURN_PREDICTION')
plt.ylabel('')
plt.show()


In [None]:
# Share of each CNTCHURN_FLAG value as a pie chart.
pie_data = data['CNTCHURN_FLAG'].value_counts()
pie_data.plot(kind='pie', autopct='%1.1f%%', startangle=90)

# The title names the plotted column (it was a left-over 'COLUMN_NAME'
# placeholder); the y-axis label is blanked.
plt.title('Pie Chart of CNTCHURN_FLAG')
plt.ylabel('')
plt.show()


In [None]:
# Work on a copy so that feature engineering does not modify 'data'.
df = data.copy()
df.info()

# Feature Engineering

In [None]:
# 1. Date Features
# Date columns used below to derive the "<column>_since" day counts.

date_columns = ['LASTVCEUSGDATE', 'LASTRECHRGDATE', 'LASTREVGENEVTDATE']

# 2. Age on Network
def age_group(age):
    """Return the tenure band label for an age-on-network value.

    The value is presumably a number of days, judging by the band labels.
    Each band includes its upper bound: up to 180 -> '0-6 months', up to
    365 -> '6-12 months', up to 730 -> '1-2 years'. Anything that passes
    none of these comparisons (larger values, and NaN) -> '2+ years'.
    """
    upper_bounds = (
        (180, '0-6 months'),
        (365, '6-12 months'),
        (730, '1-2 years'),
    )
    for limit, label in upper_bounds:
        if age <= limit:
            return label
    return '2+ years'

# Bucket the raw age-on-network value into its tenure band label.
df['AGEONNETWORK_group'] = df['AGEONNETWORK'].apply(age_group)

# 3. Usage Ratio (data revenue / voice revenue)
# A zero voice revenue would turn the ratio into +/-inf; the zero is treated
# as missing so the ratio comes out as NaN instead of an infinite value.
voice_revenue = df['SUMVCEREVCD1_30'].replace(0, np.nan)
df['USAGE_RATIO'] = df['SUMDATAREVCD1_30'] / voice_revenue

# 4. Total Revenue
df['TOTAL_REVENUE'] = df['SUMVCEREVCD1_30'] + df['SUMDATAREVCD1_30'] + df['ARPUD1_30']

# 5. Interaction Features
df['DATA_USAGE_IMPACT'] = df['SUMDATAREVCD1_30'] * df['SUMDATAUSGCD1_30']

# 6. Time Since Last Event
# NOTE: the reference point is the moment the cell runs, so the "_since"
# values change from one run to the next.
current_date = pd.to_datetime('today')

for column in date_columns:
    df[column + '_since'] = (current_date - df[column]).dt.days

# 7. Usage Percentiles
# rank(pct=True) expresses each value as its percentile rank within the column.
usage_columns = ['SUMDATAREVCD1_30', 'SUMDATAUSGCD1_30', 'SUMVCEREVCD1_30', 'SUMVOICEUSGCD1_30']

for column in usage_columns:
    df[column + '_percentile'] = df[column].rank(pct=True)

# Now, 'df' contains the dataset with the new engineered features.

# EDA

## Univariate Analysis

In [None]:
# Column overview including the engineered features.
df.info()

In [None]:
# All column labels of the feature-engineered frame.
df.columns

In [None]:
# Numeric features plotted in the univariate and bivariate analysis.
numeric_columns = [
    'MAINACTBAL',
    'ARPUD1_30',
    'SUMVCEREVCD1_30',
    'SUMDATAREVCD1_30',
    'SUMDATAUSGCD1_30',
    'SUMVOICEUSGCD1_30',
    'TOTAL_REVENUE',
    'DATA_USAGE_IMPACT',
    'LASTVCEUSGDATE_since',
    'LASTRECHRGDATE_since',
    'LASTREVGENEVTDATE_since',
    'SUMDATAREVCD1_30_percentile',
    'SUMDATAUSGCD1_30_percentile',
    'SUMVCEREVCD1_30_percentile',
    'SUMVOICEUSGCD1_30_percentile',
]

# Categorical features plotted against the churn columns.
categorical_columns = [
    'DEVICETYPE',
    'ISDEVICE3GENABLED',
    'VASSUBSCRIBERFLAG',
    'ISDEVICEDATAENABLED',
    'AGEONNETWORK_group',
    'SMARTPHONEFLAG',
    'HANDSETCHANGESFLAGD1_30',
    'CUSTSEGMENT',
]

In [None]:
# One histogram (20 bins, with a KDE curve) per numeric feature.
for column in numeric_columns:
    hist_fig, hist_ax = plt.subplots(figsize=(8, 5))
    sns.histplot(df[column], bins=20, kde=True, ax=hist_ax)
    hist_ax.set_title(f'Distribution of {column}')
    hist_ax.set_xlabel(column)
    hist_ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# Step 3: Univariate Analysis - Categorical Features
# Disabled: the count plots below are wrapped in a bare string, so they are never executed.
'''for i, column in enumerate(categorical_columns, 1):
    plt.figure(figsize=(5, 3))
    sns.countplot(df[column])
    plt.title(column)

plt.tight_layout()
plt.show()'''

## Bivariate Analysis

In [None]:
# Step 4: Bivariate Analysis - Churn vs. Numeric Features
# One boxplot per numeric feature, split by CNTCHURN_FLAG.
# tight_layout() only acts on the current figure, so it is applied inside the
# loop (after the loop it would reach just the last figure).
for column in numeric_columns:
    plt.figure(figsize=(8, 5))
    sns.boxplot(x='CNTCHURN_FLAG', y=column, data=df)
    plt.title(f'Churn vs. {column}')
    plt.tight_layout()
    plt.show()



In [None]:
# Step 5: Bivariate Analysis - Churn vs. Categorical Features
# One count plot per categorical feature, coloured by CNTCHURN_FLAG.
# tight_layout() only acts on the current figure, so it is applied inside the
# loop (after the loop it would reach just the last figure).
for column in categorical_columns:
    plt.figure(figsize=(8, 5))
    sns.countplot(x=column, hue='CNTCHURN_FLAG', data=df)
    plt.title(f'Churn vs. {column}')
    plt.tight_layout()
    plt.show()


In [None]:
# Count plot of every categorical feature, coloured by CHURN_PREDICTION.
for column in categorical_columns:
    count_fig, count_ax = plt.subplots(figsize=(10, 5))
    sns.countplot(data=df, x=column, hue='CHURN_PREDICTION', ax=count_ax)
    count_ax.set_title(f'Distribution of {column} by Churn')
    count_ax.set_xlabel(column)
    count_ax.set_ylabel('Count')
    plt.xticks(rotation=45)
    # The legend entries are overridden with readable class names.
    count_ax.legend(title='Churn Prediction', labels=['Non-Churn', 'Churn'])
    plt.show()


In [None]:
# All numeric features side by side in a single boxplot figure.
box_fig, box_ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=df[numeric_columns], ax=box_ax)
box_ax.set_title('Boxplot of Numerical Features')
plt.xticks(rotation=45)
plt.show()


In [None]:

# Step 6: Correlation Matrix
# Restricted to numeric/boolean columns: 'df' still holds text and datetime
# columns, and DataFrame.corr() raises on them in pandas >= 2.0.
numeric_df = df.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_df.corr()
plt.figure(figsize=(20, 15))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Disabled: pair plot of the numeric features, wrapped in a bare string so it is never executed.
'''sns.pairplot(df[numeric_columns])
plt.suptitle('Pair Plot of Numerical Features')
plt.tight_layout()
plt.show()'''

In [None]:
# The raw date columns are removed; their "<column>_since" day counts remain.
df.drop(columns=date_columns,inplace=True)

In [None]:
# Print the feature-engineered frame.
print(df)

In [None]:
# Column overview after the date columns were dropped.
df.info()

In [None]:
# Save the feature-engineered frame to 'new_data.csv' without the index column.
df.to_csv('new_data.csv', index=False)

# Converting object-typed columns to numeric

In [None]:
# Modelling copy of the data, without the four percentile-rank columns.
final_copy = df.copy()
percentile_columns = [
    'SUMDATAREVCD1_30_percentile',
    'SUMDATAUSGCD1_30_percentile',
    'SUMVCEREVCD1_30_percentile',
    'SUMVOICEUSGCD1_30_percentile',
]
final_copy.drop(columns=percentile_columns, inplace=True)
# Changing the column containing Y & N to 0 & 1


In [None]:
# Flag columns stored as 'Y'/'N' text, to be encoded as 1/0.
columns_having_yn=['ISDEVICE3GENABLED','VASSUBSCRIBERFLAG','ISDEVICEDATAENABLED','SMARTPHONEFLAG']
mapping = {'Y': 1, 'N': 0}

# Replace each column by its mapped values; with a dict, Series.map yields
# NaN for any value that is not one of the dict keys.
for col in columns_having_yn:
    final_copy[col] = final_copy[col].map(mapping)

In [None]:
# USAGE_RATIO is not used as a model feature.
final_copy.drop(columns='USAGE_RATIO',inplace=True)

In [None]:
# AGEONNETWORK_group is encoded as '0-6 months'->1, '6-12 months'->2,
# '1-2 years'->3, '2+ years'->4; the raw AGEONNETWORK column it was derived
# from is dropped here.
final_copy.drop(columns='AGEONNETWORK',inplace=True)

In [None]:
# Ordinal encoding of the tenure bands, from shortest (1) to longest (4).
mapping={'0-6 months':1, '6-12 months':2, '1-2 years':3, '2+ years':4}
final_copy['AGEONNETWORK_group']=final_copy['AGEONNETWORK_group'].map(mapping)

In [None]:
# Ordinal encoding of the customer segment.
# Recent pandas versions treat the literal text 'None' as a missing value
# when reading a CSV, so the 'None' segment can arrive as NaN, never match
# the 'None' key and stay NaN after the mapping. The label is restored first
# so that it encodes to 0; labels outside the mapping still become NaN.
# NOTE(review): assumes a missing CUSTSEGMENT means the 'None' segment — confirm.
mapping={'None':0, 'Basic':1,   'Silver':2,'Gold':3, 'Platinum':4,'Signature':5}
final_copy['CUSTSEGMENT']=final_copy['CUSTSEGMENT'].fillna('None').map(mapping)

In [None]:
# Integer encoding of the device type; values outside the mapping become NaN.
mapping={'Feature +':2,'Smartphone':1,'Voice Centric':0}
final_copy['DEVICETYPE']=final_copy['DEVICETYPE'].map(mapping)

In [None]:
# Check the dtypes and non-null counts after encoding.
final_copy.info()

In [None]:
# Independent copy of the encoded data used to build the model inputs.
df_x=final_copy.copy()

In [None]:
# Feature matrix: every column except the two churn columns.
target_columns = ['CNTCHURN_FLAG', 'CHURN_PREDICTION']
X = df_x.drop(columns=target_columns)

# Dependent variables.
y1 = df_x['CHURN_PREDICTION']
y2 = df_x['CNTCHURN_FLAG']

# Verify that the features and targets were selected correctly.
print(X.columns)
print(y1.name)
print(y2.name)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing, with CHURN_PREDICTION (y1) as target.
# random_state=0 makes the shuffled split reproducible.
# NOTE(review): the split is not stratified — consider stratify=y1 if the
# churn classes turn out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.25, random_state=0, shuffle=True)

# Trying Multiple Models
### Logistic Regression
### Random Forest
### Gradient Boosting

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
def create_models(seed=1):
    """Return the candidate classifiers as a list of (name, estimator) pairs.

    Every estimator is constructed with ``random_state=seed``. The list
    holds, in order, a logistic regression, a random forest and a gradient
    boosting classifier.
    """
    return [
        ('logistic_regression', LogisticRegression(random_state=seed)),
        ('random_forest', RandomForestClassifier(random_state=seed)),
        ('gradient_boosting', GradientBoostingClassifier(random_state=seed)),
    ]
# Instantiate the candidate models with the default seed.
models = create_models()

## Testing the result of each model 

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of every candidate model on the held-out test set.
# 'results' and 'names' are filled in the same order as 'models'.
results = []
names = []
for name, model in models:
    # Fit on the training data. The extra predict() that used to be chained
    # onto fit() was discarded, so the prediction is made only once below.
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # Share of test rows whose predicted class matches the true class.
    accuracy = accuracy_score(y_test, predictions)
    results.append(accuracy)
    names.append(name)
    # The stray closing parenthesis was removed from the printed message.
    print(f'Classifier: {name}, Accuracy: {accuracy}')