<a href="https://colab.research.google.com/github/Steven-Macas/Red-Neuronal/blob/main/ejercicio_churn_red_neuronal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#dataset
#https://www.kaggle.com/datasets/blastchar/telco-customer-churn

In [1]:
# first neural network with keras tutorial
from numpy import loadtxt
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Preparación de los datos

In [57]:
data = pd.read_csv('Churn.csv')
print(len(data))
data.head().T

7043


Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [58]:
data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [59]:
total_charges = pd.to_numeric(data.TotalCharges, errors='coerce')
data.TotalCharges = pd.to_numeric(data.TotalCharges, errors='coerce')
data.TotalCharges = data.TotalCharges.fillna(0)
data[total_charges.isnull()][['customerID', 'TotalCharges']]

Unnamed: 0,customerID,TotalCharges
488,4472-LVYGI,0.0
753,3115-CZMZD,0.0
936,5709-LVOEQ,0.0
1082,4367-NUYAO,0.0
1340,1371-DWPAZ,0.0
3331,7644-OMVMY,0.0
3826,3213-VVOLG,0.0
4380,2520-SGTTA,0.0
5218,2923-ARZLG,0.0
6670,4075-WKNIU,0.0


In [60]:
replacer = lambda str: str.lower().str.replace(' ', '_')
data.columns = replacer(data.columns.str)
for col in list(data.dtypes[data.dtypes == 'object'].index):
    data[col] = replacer(data[col].str)
data.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [61]:
data.churn = (data.churn == 'yes').astype(int)
data.churn.head()

0    0
1    0
2    1
3    0
4    1
Name: churn, dtype: int64

In [62]:
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
                'phoneservice', 'multiplelines', 'internetservice',
                'onlinesecurity', 'onlinebackup', 'deviceprotection',
                'techsupport', 'streamingtv', 'streamingmovies',
                'contract', 'paperlessbilling', 'paymentmethod']
numerical = ['tenure', 'monthlycharges', 'totalcharges']
data[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

In [63]:
data.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                 int64
dtype: object

In [64]:
from sklearn.model_selection import train_test_split
data_train_full, data_test = train_test_split(data, test_size=0.2, random_state=1)

data_train, data_val = train_test_split(data_train_full, test_size=0.33, random_state=1)
y_train = data_train.churn.values
y_val = data_val.churn.values

del data_train['churn']
del data_val['churn']

data_train.head().T

Unnamed: 0,4204,7034,5146,5184,1310
customerid,4395-pzmsn,0639-tsiqw,3797-fkogq,7570-welny,6393-wryze
gender,male,female,male,female,female
seniorcitizen,1,0,0,0,0
partner,no,no,no,yes,yes
dependents,no,no,yes,no,no
tenure,5,67,11,68,34
phoneservice,yes,yes,yes,yes,yes
multiplelines,no,yes,yes,yes,yes
internetservice,fiber_optic,fiber_optic,fiber_optic,fiber_optic,fiber_optic
onlinesecurity,no,yes,no,yes,no


# Análisis de la importancia de las propiedades

In [65]:
global_mean = data_train_full.churn.mean()
round(global_mean, 3)

0.27

In [66]:
female_mean = data_train_full[data_train_full.gender == 'female'].churn.mean()
print(round(female_mean, 3))
male_mean = data_train_full[data_train_full.gender == 'male'].churn.mean()
print(round(male_mean, 3))

0.277
0.263


In [67]:
partner_yes = data_train_full[data_train_full.partner == 'yes'].churn.mean()
print(round(partner_yes, 3))
partner_no = data_train_full[data_train_full.partner == 'no'].churn.mean()
print(round(partner_no, 3))

0.205
0.33


In [68]:
from sklearn.metrics import mutual_info_score

calculate_mi = lambda col: mutual_info_score(col, data_train_full.churn)

data_mi = data_train_full[categorical].apply(calculate_mi)
data_mi = data_mi.sort_values(ascending=False).to_frame(name='MI')
data_mi

Unnamed: 0,MI
contract,0.09832
onlinesecurity,0.063085
techsupport,0.061032
internetservice,0.055868
onlinebackup,0.046923
deviceprotection,0.043453
paymentmethod,0.04321
streamingtv,0.031853
streamingmovies,0.031581
paperlessbilling,0.017589


In [69]:
print(data_train_full[numerical].corrwith(data_train_full.churn))

print(round(data_train_full[data_train_full.tenure <= 2].churn.mean(), 3))
print(round(data_train_full[(data_train_full.tenure > 3) & 
                          (data_train_full.tenure <= 12)].churn.mean(), 3))
print(round(data_train_full[data_train_full.tenure > 12].churn.mean(), 3))

print(round(data_train_full[data_train_full.monthlycharges < 20].churn.mean(), 3))
print(round(data_train_full[(data_train_full.monthlycharges > 21) & 
                          (data_train_full.monthlycharges <= 50)].churn.mean(), 3))
print(round(data_train_full[data_train_full.monthlycharges > 50].churn.mean(), 3))

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64
0.595
0.391
0.176
0.088
0.223
0.325


# Ingeniería de propiedades

In [70]:
train_dict = data_train[categorical + numerical].to_dict(orient='records')
dict(sorted(train_dict[0].items()))

{'contract': 'month-to-month',
 'dependents': 'no',
 'deviceprotection': 'no',
 'gender': 'male',
 'internetservice': 'fiber_optic',
 'monthlycharges': 85.55,
 'multiplelines': 'no',
 'onlinebackup': 'yes',
 'onlinesecurity': 'no',
 'paperlessbilling': 'yes',
 'partner': 'no',
 'paymentmethod': 'electronic_check',
 'phoneservice': 'yes',
 'seniorcitizen': 1,
 'streamingmovies': 'yes',
 'streamingtv': 'no',
 'techsupport': 'no',
 'tenure': 5,
 'totalcharges': 408.5}

In [71]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

DictVectorizer(sparse=False)

In [72]:
X_train = dv.transform(train_dict)
X_train[0]

array([  1.  ,   0.  ,   0.  ,   1.  ,   0.  ,   1.  ,   0.  ,   0.  ,
         0.  ,   1.  ,   0.  ,   1.  ,   0.  ,  85.55,   1.  ,   0.  ,
         0.  ,   0.  ,   0.  ,   1.  ,   1.  ,   0.  ,   0.  ,   0.  ,
         1.  ,   1.  ,   0.  ,   0.  ,   0.  ,   1.  ,   0.  ,   0.  ,
         1.  ,   1.  ,   0.  ,   0.  ,   1.  ,   1.  ,   0.  ,   0.  ,
         1.  ,   0.  ,   0.  ,   5.  , 408.5 ])

In [73]:
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'dependents=no', 'dependents=yes',
       'deviceprotection=no', 'deviceprotection=no_internet_service',
       'deviceprotection=yes', 'gender=female', 'gender=male',
       'internetservice=dsl', 'internetservice=fiber_optic',
       'internetservice=no', 'monthlycharges', 'multiplelines=no',
       'multiplelines=no_phone_service', 'multiplelines=yes',
       'onlinebackup=no', 'onlinebackup=no_internet_service',
       'onlinebackup=yes', 'onlinesecurity=no',
       'onlinesecurity=no_internet_service', 'onlinesecurity=yes',
       'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no',
       'partner=yes', 'paymentmethod=bank_transfer_(automatic)',
       'paymentmethod=credit_card_(automatic)',
       'paymentmethod=electronic_check', 'paymentmethod=mailed_check',
       'phoneservice=no', 'phoneservice=yes', 'seniorcitizen',
       'streamingmovies=no', 'streamingmovies=no_internet_service',

# Entrenamiento del modelo

In [74]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)

LogisticRegression(solver='liblinear')

In [75]:
val_dict = data_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)
y_pred = model.predict_proba(X_val)
y_pred

array([[0.99142714, 0.00857286],
       [0.79028712, 0.20971288],
       [0.78364454, 0.21635546],
       ...,
       [0.35664296, 0.64335704],
       [0.81055916, 0.18944084],
       [0.87261929, 0.12738071]])

In [76]:
y_pred = model.predict_proba(X_val)[:, 1]
y_pred

array([0.00857286, 0.20971288, 0.21635546, ..., 0.64335704, 0.18944084,
       0.12738071])

In [77]:
churn = y_pred >= 0.5
churn

array([False, False, False, ...,  True, False, False])

In [78]:
# precisión del modelo con los datos de entrenamiento
round((y_val == churn).mean(), 3) 

0.805

# Definimos el Modelo Keras

In [79]:
model = Sequential()
model.add(Dense(45, input_shape=(45,), activation='relu'))
model.add(Dense(45, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [80]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy'])

In [81]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 45)                2070      
                                                                 
 dense_13 (Dense)            (None, 45)                2070      
                                                                 
 dense_14 (Dense)            (None, 1)                 46        
                                                                 
Total params: 4,186
Trainable params: 4,186
Non-trainable params: 0
_________________________________________________________________


# Entrenamos el modelo de KERAS

In [82]:
X_train = X_train.astype('float32')

In [83]:
# fit the keras model on the dataset
model.fit(X_train, y_train, epochs=500, batch_size=300)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.callbacks.History at 0x7f531ed9a190>

# Evaluar Keras

In [84]:
# evaluate the keras model
_, accuracy = model.evaluate(X_train, y_train)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 81.72


In [85]:
data.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,0
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,0
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,1
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,...,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,0
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.7,151.65,1


# Crear Red Neuronal con dos salidas

In [86]:
data2 = data

In [87]:
churn2 = pd.get_dummies(data2['churn'],prefix='churn')
data2 = pd.concat([data2,churn2],axis = 1)
data2.drop('churn', axis=1, inplace=True)

In [88]:
data2.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn_0,churn_1
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,1,0
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,no,no,no,one_year,no,mailed_check,56.95,1889.5,1,0
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,0,1
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,...,yes,no,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,1,0
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,...,no,no,no,month-to-month,yes,electronic_check,70.7,151.65,0,1


In [110]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

In [120]:
df_train_full, df_test = train_test_split(data2, test_size=0.2, random_state=1)

df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=1)
y_train = df_train
train_dict = data2[categorical + numerical].to_dict(orient='records')
dict(sorted(train_dict[0].items()))

dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

X_train = dv.transform(train_dict)
X_train[0]

dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'dependents=no', 'dependents=yes',
       'deviceprotection=no', 'deviceprotection=no_internet_service',
       'deviceprotection=yes', 'gender=female', 'gender=male',
       'internetservice=dsl', 'internetservice=fiber_optic',
       'internetservice=no', 'monthlycharges', 'multiplelines=no',
       'multiplelines=no_phone_service', 'multiplelines=yes',
       'onlinebackup=no', 'onlinebackup=no_internet_service',
       'onlinebackup=yes', 'onlinesecurity=no',
       'onlinesecurity=no_internet_service', 'onlinesecurity=yes',
       'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no',
       'partner=yes', 'paymentmethod=bank_transfer_(automatic)',
       'paymentmethod=credit_card_(automatic)',
       'paymentmethod=electronic_check', 'paymentmethod=mailed_check',
       'phoneservice=no', 'phoneservice=yes', 'seniorcitizen',
       'streamingmovies=no', 'streamingmovies=no_internet_service',

In [96]:
model2 = Sequential()
model2.add(Dense(45, input_shape=(45,), activation='relu'))
model2.add(Dense(45, activation='relu'))
model2.add(Dense(2, activation='sigmoid'))

In [97]:
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy'])

In [116]:
# Separa las variables de entrada (X) de las variables de salida (y)
X = data2.drop(['churn_0', 'churn_1'], axis=1)
y = data2[['churn_0', 'churn_1']]

In [117]:
# Divide los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=1)

In [124]:
# fit the keras model on the dataset
model2.fit(X_train, y_train, epochs=500, batch_size=300)

ValueError: ignored

In [126]:
X_train

array([[1.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        2.9850e+01],
       [0.0000e+00, 1.0000e+00, 0.0000e+00, ..., 0.0000e+00, 3.4000e+01,
        1.8895e+03],
       [1.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.0000e+00,
        1.0815e+02],
       ...,
       [1.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.1000e+01,
        3.4645e+02],
       [1.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.0000e+00,
        3.0660e+02],
       [0.0000e+00, 0.0000e+00, 1.0000e+00, ..., 1.0000e+00, 6.6000e+01,
        6.8445e+03]])

In [None]:
# evaluate the keras model
_, accuracy = model.evaluate(X_train, y_train)
print('Accuracy: %.2f' % (accuracy*100))