In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import modulos


In [47]:
# Load the raw churn dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data.csv")
df.head(n=5)

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,LA,117,408,No,No,0,184.5,97,31.37,351.6,80,29.89,215.8,90,9.71,8.7,4,2.35,1,FALSO
1,IN,65,415,No,No,0,129.1,137,21.95,228.5,83,19.42,208.8,111,9.4,12.7,6,3.43,4,VERDADERO
2,NY,161,415,No,No,0,332.9,67,56.59,317.8,97,27.01,160.6,128,7.23,5.4,9,1.46,4,VERDADERO
3,SC,111,415,No,No,0,110.4,103,18.77,137.3,102,11.67,189.6,105,8.53,7.7,6,2.08,2,FALSO
4,HI,49,510,No,No,0,119.3,117,20.28,215.1,109,18.28,178.7,90,8.04,11.1,1,3.0,1,FALSO


In [48]:

# Classify the columns of `df` into three lists of column names using the
# project-local `modulos` helper: categorical, discrete and continuous.
column_groups = modulos.getColumnsDataTypes(df=df)
categoric_vars, discrete_vars, continues_vars = column_groups

## 2) Feature Engineering

### 2.1) Configuración de Variables:

In [50]:
# Name of the target column.
# NOTE: a later cell rebinds `y` to the target Series itself.
y = 'Churn'

# Exclude columns from the feature lists by rebuilding them rather than calling
# list.remove() in place: the cell can then be re-run without raising
# ValueError once the names have already been removed.
continues_vars = [col for col in continues_vars if col != 'Number vmail messages']
categoric_vars = [col for col in categoric_vars if col not in ('State', y)]

### 2.2) Imputación de Variables

##### 2.2.1) Variables Continuas

In [51]:
# Fraction of missing values in each continuous column (0.0 = nothing to impute).
df[continues_vars].isna().mean()

Account length         0.0
Total day minutes      0.0
Total day calls        0.0
Total day charge       0.0
Total eve minutes      0.0
Total eve calls        0.0
Total eve charge       0.0
Total night minutes    0.0
Total night calls      0.0
Total night charge     0.0
Total intl minutes     0.0
Total intl charge      0.0
dtype: float64

##### 2.2.2) Variables Discretas

In [52]:
# Fraction of missing values in each discrete column (0.0 = nothing to impute).
df[discrete_vars].isna().mean()

Area code                 0.0
Total intl calls          0.0
Customer service calls    0.0
dtype: float64

##### 2.2.3) Variables Categóricas

In [53]:
# Fraction of missing values in each categorical column (0.0 = nothing to impute).
df[categoric_vars].isna().mean()

International plan    0.0
Voice mail plan       0.0
dtype: float64

### 2.3) Codificación de Variables Categóricas

In [54]:
# Frequency table for 'International plan': category label -> number of rows.
international_counts = df['International plan'].value_counts()
international_map = international_counts.to_dict()
international_map

{'No': 3010, 'Yes': 323}

In [55]:
# Frequency table for 'Voice mail plan': category label -> number of rows.
voice_mail_counts = df['Voice mail plan'].value_counts()
voice_mail_map = voice_mail_counts.to_dict()
voice_mail_map

{'No': 2411, 'Yes': 922}

In [56]:
# Frequency-encode both plan columns: every category label is replaced by the
# number of rows that carry it.
# Caution: the columns of `df` are overwritten. Re-running only this cell maps
# the already-encoded numbers through the label-keyed dicts and yields NaN.
for plan_col, freq_map in (
    ('International plan', international_map),
    ('Voice mail plan', voice_mail_map),
):
    df[plan_col] = df[plan_col].map(freq_map)


Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,LA,117,408,3010,2411,0,184.5,97,31.37,351.6,80,29.89,215.8,90,9.71,8.7,4,2.35,1,FALSO
1,IN,65,415,3010,2411,0,129.1,137,21.95,228.5,83,19.42,208.8,111,9.4,12.7,6,3.43,4,VERDADERO
2,NY,161,415,3010,2411,0,332.9,67,56.59,317.8,97,27.01,160.6,128,7.23,5.4,9,1.46,4,VERDADERO
3,SC,111,415,3010,2411,0,110.4,103,18.77,137.3,102,11.67,189.6,105,8.53,7.7,6,2.08,2,FALSO
4,HI,49,510,3010,2411,0,119.3,117,20.28,215.1,109,18.28,178.7,90,8.04,11.1,1,3.0,1,FALSO


In [57]:
from sklearn.pipeline import Pipeline  # only performs the categorical transformation
import preprocessors as pp  # see the file with this name

def instanciatePipeline(df, y):
    """Build the (unfitted) feature-engineering pipeline.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified by ``modulos.getColumnsDataTypes``
        to find the categorical variables.
    y : str
        Name of the target column; it is excluded from the variables handed
        to the encoder.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with a single ``'categorical-encoder'`` step that applies
        ``pp.categoricalEncoderOperator`` to the categorical feature columns.
    """
    # Only the categorical group is needed; the other two groups are discarded.
    categoric_vars, _, _ = modulos.getColumnsDataTypes(df=df)

    # Filter instead of list.remove(): no ValueError when the target is not
    # classified as categorical, and the helper's list is left untouched.
    encoded_vars = [col for col in categoric_vars if col != y]

    return Pipeline(steps=[
        ('categorical-encoder',
            pp.categoricalEncoderOperator(varNames=encoded_vars)),
    ])

#### Partición de datos

In [58]:
from sklearn.model_selection import train_test_split

# Reload the raw file: the `df` used earlier had its plan columns overwritten
# by the exploratory frequency encoding.
df = pd.read_csv("data.csv")

# Features exclude the target and 'Number vmail messages'.
X = df.drop(columns=['Churn', 'Number vmail messages'])
y = df['Churn']

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=2022
)

# Fit the encoder on the training rows only. The fitted pipeline is kept in a
# variable so the same fitted encoding can be reused (e.g. on X_test).
feature_pipeline = instanciatePipeline(df, 'Churn')
dfSalida = feature_pipeline.fit_transform(X_train, y_train)

# Binary target: 1 for 'VERDADERO', 0 otherwise. Built from y_train as an
# explicit integer Series; pd.get_dummies returns a DataFrame whose dtype is
# bool on pandas >= 2.0, which would write True/False instead of 0/1.
dfSalida['Churn'] = (y_train == 'VERDADERO').astype(int)




In [59]:
# Display the encoded training frame (features plus the binary 'Churn' column).
dfSalida

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
617,47,107,408,2116,659,96.3,83,16.37,179.6,91,15.27,166.3,121,7.48,10.3,2,2.78,1,0
1425,55,91,510,2116,659,232.4,97,39.51,186.0,88,15.81,190.5,128,8.57,12.3,3,3.32,3,0
215,43,131,408,2116,1674,94.4,80,16.05,215.1,101,18.28,179.7,108,8.09,13.1,9,3.54,2,0
1113,47,74,408,2116,1674,174.1,96,29.60,251.1,94,21.34,257.6,123,11.59,8.3,5,2.24,2,1
2364,51,125,408,2116,1674,106.1,95,18.04,157.6,113,13.40,192.5,69,8.66,8.1,3,2.19,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1713,47,185,408,2116,659,151.1,121,25.69,244.7,88,20.80,154.4,91,6.95,13.8,2,3.73,2,0
624,43,27,415,2116,1674,227.4,67,38.66,248.0,115,21.08,61.4,109,2.76,7.8,6,2.11,1,0
173,45,153,408,2116,1674,185.3,127,31.50,208.0,73,17.68,206.1,124,9.27,15.1,3,4.08,1,0
1244,53,68,415,2116,1674,162.1,86,27.56,155.0,86,13.18,189.7,87,8.54,11.0,9,2.97,5,1


In [60]:
# Persist the encoded training set. The DataFrame index is written as the
# first, unnamed CSV column.
dfSalida.to_csv(path_or_buf="FE_DF_Salida.csv", index=True)