In [1849]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso
from sklearn.metrics import accuracy_score, classification_report

In [1712]:
dfChurn = pd.read_csv("../data/raw/Churn.csv")
dfCalls = pd.read_csv("../data/raw/megaline_calls.csv")
dfInternet = pd.read_csv("../data/raw/megaline_internet.csv")
dfMessages = pd.read_csv("../data/raw/megaline_messages.csv")
dfPlans = pd.read_csv("../data/raw/megaline_plans.csv")
dfUsers = pd.read_csv("../data/raw/megaline_users.csv")

In [1713]:
#funcion para obtener las metricas que yo quiero
def agg_std(df): 
    agg_functions = ['mean', 'std', 'min', 'max', 'var', 'median']
    
    agg_stats = {}
    
    for column in df.columns:
        column_stats = df[column].agg(agg_functions).to_dict()
        
        # Skewness and Kurtosis
        column_stats['skew'] = df[column].skew()
        column_stats['kurt'] = df[column].kurt()
        
        # Mode Handling
        mode_values = df[column].mode()
        column_stats['mode'] = mode_values.iloc[0] if not mode_values.empty else None
        
        # Quantiles
        quantiles = df[column].quantile([0.25, 0.5, 0.75]).to_dict()
        column_stats.update({f'quantile_{int(q*100)}': v for q, v in quantiles.items()})
        
        agg_stats[column] = column_stats
    
    agg_stats_df = pd.DataFrame(agg_stats)
    
    return agg_stats_df


In [1714]:
#Revisar si es que todos los tipos de los datos en cada columna son iguales
def check_type(df):
    for column in df.columns:
        same_type = df[column].apply(type).nunique() == 1
        print(f"All elements in the column '{column}' have the same data type: {same_type}")

In [1715]:
#Revisar si hay duplicados en cada columna del df
def check_duplicate(df):
    for column in df.columns:
        duplicate_count = df[column].duplicated().sum()
        print(f"There are {duplicate_count} duplicates in the {column}")

In [1716]:
#Crear un histogrma mas rapido
def hist(df,i,binz,title,xlabel,ylabel):
    plt.hist(df.iloc[:,i], bins=binz, edgecolor='black')

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    plt.show()

In [1717]:
#Funciones para crear gráficos más fácil
def box(df, column, title, ylabel):
    plt.figure(figsize=(6, 4))
    sns.boxplot(y=df.iloc[:, column], color="lightblue", width=0.5, fliersize=3)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.grid(True, linestyle="--", alpha=0.7)
    plt.show()


In [1718]:
#Crear un violin plot mas rapido
def violin(df,i,title,ylabel):
    plt.figure(figsize=(6, 4))
    sns.violinplot(data=df.iloc[:,i], inner="box", color="lightblue")  # Ajustar suavidad
    plt.title(title)
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.ylim(df.iloc[:,i].min(), df.iloc[:,i].max())
    plt.show()

In [1719]:
#una funcion donde se grafican dos lineas para comparar de una manera mas evidente los histogramas. Usa porcentajes
def dob(df, i, df2, i2, n, title, xlabel, ylabel,dia):
    freq1 = df.iloc[:, i].value_counts(normalize=True).sort_index() * 100
    freq2 = df2.iloc[:, i2].value_counts(normalize=True).sort_index() * 100
    
    plt.figure(figsize=(8, 5))

    plt.plot(freq1.index, freq1.values, marker='o', linestyle='-', color='#A19FA9', label="Todos los Días")

    plt.plot(freq2.index, freq2.values, marker='o', linestyle='-', color='lightblue', label=f"{dia}")

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend()
    
    plt.grid(True, linestyle="--", alpha=0.6)
    
    plt.show()


In [1720]:
#funcion ipara comparar lineas exclusivamente para una donde se usa usuario
def dob_user(df, i, df2, i2, n, title, xlabel, ylabel):
    freq1 = df.iloc[:, i].value_counts().sort_index()
    freq2 = df2.iloc[:, i2].value_counts().sort_index()

    
    plt.figure(figsize=(8, 5))

    plt.plot(freq1.index, freq1.values, marker='o', linestyle='-', color='#A19FA9', label="Con order_number")

    plt.plot(freq2.index, freq2.values, marker='o', linestyle='-', color='lightblue', label="Sin order_number")

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend()
    
    plt.grid(True, linestyle="--", alpha=0.6)
    
    plt.show()


In [1721]:
#funcion para crear un diccionario que contiene de manera ordenada un df y el dia de la semana, empezando por 
# domingo
def seg(df, i, r):
    dfs = {}
    for value in range(r):
        dfs[f"df_{df.columns[i]}_{value}"] = df[df.iloc[:, i] == value]
        
    return dfs

In [1722]:
#grafico de barras para usar con los dias de la semana ya que aqui colocamos el label dependiendo del valor
def bar_dow(df, i, title, xlabel, ylabel, coloreo=(None, None)):

    freq = df.iloc[:,i].value_counts()
    
    plt.bar(freq.index, freq.values, edgecolor='black')
    
    start, end = coloreo

    colors = ['#E4F5FD' if start <= idx <= end else '#A19FA9' for idx in range(len(list(freq.index)))]
    
    plt.bar(freq.index, freq.values, color=colors, edgecolor='black')
    
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    
    plt.xticks(ticks=freq.index, labels=['Domingo', 'Lunes', 'Martes', 'Miércoles', 'Jueves', 'Viernes', 'Sábado'],rotation=45)
    
    plt.show()

In [1723]:
#funcion para obtener el color de la zona resaltada
def get_color(value,start,end):
    if start is not None and end is not None:
        return '#E4F5FD' if start <= value <= end else '#A19FA9'
    return '#A19FA9'

In [1724]:
#crear un grafico de barras que colorea la parte importante 
def bar(df, i, title, xlabel, ylabel, coloreo=(None, None)):
    freq = df.iloc[:, i].value_counts()
    
    start, end = coloreo

    colors = [get_color(val,start,end) for val in freq.index]
    
    plt.bar(freq.index, freq.values, color=colors, edgecolor='black')
    
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    
    plt.show()

In [1725]:
#crea un grafico de barras colorinche usando porcentajes
def bar_perc(df, i, title, xlabel, ylabel, coloreo=(None, None)):
    if i < 0 or i >= df.shape[1]:  
        raise ValueError("Column index is out of range.")
    
    freq = df.iloc[:, i].value_counts(normalize=True) * 100
    indices = list(freq.index)
    
    start, end = coloreo

    colors = [get_color(idx,start,end) for idx in indices]

    plt.bar(indices, freq.values, color=colors, edgecolor='black')

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    plt.show()

In [1726]:
def get_billing_month(reg_date, call_date):
    months_since_reg = (call_date.year - reg_date.year) * 12 + (call_date.month - reg_date.month)
    billing_month_start = reg_date + pd.DateOffset(months=months_since_reg)
    return billing_month_start

In [1727]:
#Ajuste para que todos los resultados estadisticos tengan 2 decimales
pd.options.display.float_format = '{:.2f}'.format


# Preview

In [1728]:
#Viendo el shape
print(dfChurn.shape)

(10000, 14)


Podemos ver que dfChurn consiste de 14 columnas y 10 000 registros

In [1729]:
#viendo las columnas de todos los df para entender como se relacionan
print('Churn: \n',dfChurn.columns,'\n\n\n\n')
print('Calls: \n',dfCalls.columns,'\n\n\n\n')
print('Internet: \n',dfInternet.columns,'\n\n\n\n')
print('Messages: \n',dfMessages.columns,'\n\n\n\n')
print('Users: \n',dfUsers.columns,'\n\n\n\n')

Churn: 
 Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object') 




Calls: 
 Index(['id', 'user_id', 'call_date', 'duration'], dtype='object') 




Internet: 
 Index(['id', 'user_id', 'session_date', 'mb_used'], dtype='object') 




Messages: 
 Index(['id', 'user_id', 'message_date'], dtype='object') 




Users: 
 Index(['user_id', 'first_name', 'last_name', 'age', 'city', 'reg_date', 'plan',
       'churn_date'],
      dtype='object') 






Visualicemos los datos

In [1730]:
#visualizando el header de los df para ver un poco como son los datos
print('Churn: \n',dfChurn.head(),'\n\n\n\n')
print('Calls: \n',dfCalls.head(),'\n\n\n\n')
print('Internet: \n',dfInternet.head(),'\n\n\n\n')
print('Messages: \n',dfMessages.head(),'\n\n\n\n')
print('Users: \n',dfUsers.head(),'\n\n\n\n') 

Churn: 
    RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure   Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0    2.00      0.00              1          1               1   
1    1.00  83807.86              1          0               1   
2    8.00 159660.80              3          1               0   
3    1.00      0.00              2          0               0   
4    2.00 125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         

Hacer un one big table no parece ser mala idea. Esto se utilizaría para ML o algo similar. Atributos a crear podrían ser: cantidad de llamadas, veces usadas el internet, total de internet usado, cantidad de mensajes, tiempo de uso (cuanto tiempo llevan siendo miembros, o sea la fecha final de uso registrado restado la inicial), frecuencia de uso (conteo de total de uso dividido para cantitidad de uso). No nos interesa ni el primer nombre ni el apellido. 

Quizá hacer dummies con el estado. 

Dummies con el plan

Surmane no me sirve

Lo demás debería de servirnos bien para ML, normalizando y/o estandarizando.

# dfChurn

## **RowNumber**

Antes de hacer drop, tenemos que ver si hay valores ausentes, duplicados o atípicos.

Podemos ver que el df_aisles contiene 134 filas y 2 columnas

No tiene mucho sentido hacer describe ya que una columna es el id y la otra una string con el nombre del aisle

### Valores nulos

In [1731]:
#viendo si hay valores nulos
print(dfChurn.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           9091 non-null   float64
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB
None


El único atributo con valores nulos es Tenure, veamos si hay algún motivo.

In [1732]:
#haciendo un df con los nulos
dfNull = dfChurn[dfChurn.iloc[:,7].isna()]

In [1733]:
for col in dfNull.columns:
    print(col, ": ", dfNull[col].nunique())

RowNumber :  909
CustomerId :  909
Surname :  665
CreditScore :  349
Geography :  3
Gender :  2
Age :  58
Tenure :  0
Balance :  576
NumOfProducts :  4
HasCrCard :  2
IsActiveMember :  2
EstimatedSalary :  909
Exited :  2


In [1734]:
for col in dfChurn.columns:
    print(col, ": ", dfChurn[col].nunique())

RowNumber :  10000
CustomerId :  10000
Surname :  2932
CreditScore :  460
Geography :  3
Gender :  2
Age :  70
Tenure :  11
Balance :  6382
NumOfProducts :  4
HasCrCard :  2
IsActiveMember :  2
EstimatedSalary :  9999
Exited :  2


No parece haber una explicación directa a por que pasan estos valores nulos. No parece haber algo notorio con respecto a los datos totales. Usemos el user_id para comparar con las otras tablas

In [1735]:
dfChurnCalls = dfCalls[dfCalls.iloc[:,1].isin(dfNull.iloc[:,1])]
dfChurnInternet = dfInternet[dfInternet.iloc[:,1].isin(dfNull.iloc[:,1])]
dfChurnMessages = dfMessages[dfMessages.iloc[:,1].isin(dfNull.iloc[:,1])]
dfChurnPlans = dfPlans[dfPlans.iloc[:,1].isin(dfNull.iloc[:,1])]
dfChurnUsers = dfUsers[dfUsers.iloc[:,0].isin(dfNull.iloc[:,1])]

In [1736]:
dfChurnNull = dfChurn[dfChurn.iloc[:,[]].isin(dfNull.iloc[:,1])]

In [1737]:
dfNull

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
30,31,15589475,Azikiwe,591,Spain,Female,39,,0.00,3,1,0,140469.38,1
48,49,15766205,Yin,550,Germany,Male,38,,103391.38,1,0,1,90878.13,0
51,52,15768193,Trevisani,585,Germany,Male,36,,146050.97,2,0,0,86424.57,0
53,54,15702298,Parkhill,655,Germany,Male,41,,125561.97,1,0,0,164040.94,1
60,61,15651280,Hunter,742,Germany,Male,35,,136857.00,1,0,0,84509.57,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9944,9945,15703923,Cameron,744,Germany,Male,41,,190409.34,2,1,1,138361.48,0
9956,9957,15707861,Nucci,520,France,Female,46,,85216.61,1,1,0,117369.52,1
9964,9965,15642785,Douglas,479,France,Male,34,,117593.48,2,0,0,113308.29,0
9985,9986,15586914,Nepean,659,France,Male,36,,123841.49,2,1,0,96833.00,0


In [1738]:
dfChurn[dfChurn.iloc[:,[2,4,5,6]].duplicated(keep=False)].sort_values(by='Surname')

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
3466,3467,15631339,Adams,791,France,Male,28,4.00,0.00,1,1,0,174435.48,0
8926,8927,15709136,Adams,620,France,Male,28,8.00,0.00,2,1,1,199909.32,0
8825,8826,15797081,Ajuluchukwu,611,Germany,Female,49,9.00,115488.52,2,1,1,138656.81,1
9783,9784,15680430,Ajuluchukwu,601,Germany,Female,49,4.00,96252.98,2,1,0,104263.82,0
3843,3844,15677804,Aliyeva,783,Spain,Male,38,1.00,0.00,3,1,1,80178.54,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9570,9571,15624936,Yen,631,France,Male,35,8.00,129205.49,1,1,1,79146.36,0
8308,8309,15719898,Young,556,France,Male,36,7.00,154872.08,2,1,1,32044.64,0
5298,5299,15794187,Young,695,France,Male,36,6.00,114007.50,2,1,0,118120.88,0
3277,3278,15749381,Yu,790,France,Female,41,2.00,126619.27,1,1,0,198224.38,0


Ya que estamos apurados, vamos a hacer drop a los nulos

In [1739]:
check_duplicate(dfChurn)

There are 0 duplicates in the RowNumber
There are 0 duplicates in the CustomerId
There are 7068 duplicates in the Surname
There are 9540 duplicates in the CreditScore
There are 9997 duplicates in the Geography
There are 9998 duplicates in the Gender
There are 9930 duplicates in the Age
There are 9988 duplicates in the Tenure
There are 3618 duplicates in the Balance
There are 9996 duplicates in the NumOfProducts
There are 9998 duplicates in the HasCrCard
There are 9998 duplicates in the IsActiveMember
There are 1 duplicates in the EstimatedSalary
There are 9998 duplicates in the Exited


No entiendo como relacionar esto con las otras tablas. Estoy bajo la impresion de que son 2 grupos commpletamente distintos así que los voy a tratar asi.

## Data Wrangling

In [1740]:
#Por falta de tiempo
dfChurn.dropna(inplace=True)

In [1741]:
print(dfChurn.info())

<class 'pandas.core.frame.DataFrame'>
Index: 9091 entries, 0 to 9998
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        9091 non-null   int64  
 1   CustomerId       9091 non-null   int64  
 2   Surname          9091 non-null   object 
 3   CreditScore      9091 non-null   int64  
 4   Geography        9091 non-null   object 
 5   Gender           9091 non-null   object 
 6   Age              9091 non-null   int64  
 7   Tenure           9091 non-null   float64
 8   Balance          9091 non-null   float64
 9   NumOfProducts    9091 non-null   int64  
 10  HasCrCard        9091 non-null   int64  
 11  IsActiveMember   9091 non-null   int64  
 12  EstimatedSalary  9091 non-null   float64
 13  Exited           9091 non-null   int64  
dtypes: float64(3), int64(8), object(3)
memory usage: 1.0+ MB
None


In [1742]:
print(dfChurn)

      RowNumber  CustomerId    Surname  CreditScore Geography  Gender  Age  \
0             1    15634602   Hargrave          619    France  Female   42   
1             2    15647311       Hill          608     Spain  Female   41   
2             3    15619304       Onio          502    France  Female   42   
3             4    15701354       Boni          699    France  Female   39   
4             5    15737888   Mitchell          850     Spain  Female   43   
...         ...         ...        ...          ...       ...     ...  ...   
9994       9995    15719294       Wood          800    France  Female   29   
9995       9996    15606229   Obijiaku          771    France    Male   39   
9996       9997    15569892  Johnstone          516    France    Male   35   
9997       9998    15584532        Liu          709    France  Female   36   
9998       9999    15682355  Sabbatini          772   Germany    Male   42   

      Tenure   Balance  NumOfProducts  HasCrCard  IsActiveMembe

No nos interesa el RowNumber, el ID ni el Surname. Tienes que hacer dummies con Geography y Gender 

In [1743]:
dfChurn.drop(['RowNumber','CustomerId','Surname'], axis = 1, inplace = True)

## Feature Engineering

In [1744]:
dfChurn['Gender'] = dfChurn['Gender'].map({'Male': 0, 'Female': 1})

In [1745]:
dfChurn = pd.get_dummies(dfChurn, columns=['Geography'])

In [1746]:
print((dfChurn['Exited']==1).sum())

print((dfChurn['Exited']==0).sum())

print((dfChurn['Exited']==1).sum()/((dfChurn['Exited'] == 0).sum() + (dfChurn['Exited']==1).sum())*100)

print((1-(dfChurn['Exited']==1).sum()/((dfChurn['Exited'] == 0).sum() + (dfChurn['Exited']==1).sum()))*100)


1854
7237
20.393796062039378
79.60620393796061


Las clases están claramente desbalanceadas. Un poco mas del 20% pertenecen los que si se salieron.
Nuestro modelo tiene que tener un accuracy minimo de 79.61%

In [1747]:
dfChurn.shape

(9091, 13)

In [1748]:
X = dfChurn.drop(columns=["Exited"]) 
y = dfChurn["Exited"]

scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

X_normalized = pd.DataFrame(X_normalized, columns=X.columns)

print(X_normalized)

      CreditScore  Gender  Age  Tenure  Balance  NumOfProducts  HasCrCard  \
0            0.54    1.00 0.32    0.20     0.00           0.00       1.00   
1            0.52    1.00 0.31    0.10     0.33           0.00       0.00   
2            0.30    1.00 0.32    0.80     0.64           0.67       1.00   
3            0.70    1.00 0.28    0.10     0.00           0.33       0.00   
4            1.00    1.00 0.34    0.20     0.50           0.00       1.00   
...           ...     ...  ...     ...      ...            ...        ...   
9086         0.90    1.00 0.15    0.20     0.00           0.33       0.00   
9087         0.84    0.00 0.28    0.50     0.00           0.33       1.00   
9088         0.33    0.00 0.23    1.00     0.23           0.00       1.00   
9089         0.72    1.00 0.24    0.70     0.00           0.00       0.00   
9090         0.84    0.00 0.32    0.30     0.30           0.33       1.00   

      IsActiveMember  EstimatedSalary  Geography_France  Geography_Germany 

In [1749]:
#implementacion propia de regresion
import numpy as np

X_b = np.c_[np.ones((X_normalized.shape[0], 1)), X_normalized]  

theta = np.linalg.inv(X_b.T @ X_b) @ X_b.T @ y

print("Coeficientes:", theta)

Coeficientes: [ 0.93835449 -0.03702474  0.06588786  0.73468744 -0.02098514 -0.25132008
 -0.16851267 -0.00317221 -0.13045233  0.0212724  -0.05346507  0.56689453
  0.31420898]


In [1750]:
# Define features and target
X = dfChurn.drop(columns=['Exited'])  # Assuming 'Exited' is the target
y = dfChurn['Exited']

# Normalize data
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

# Split data (80% train, 10% validation, 10% test)
X_train, X_temp, y_train, y_temp = train_test_split(X_normalized, y, test_size=0.2, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

print(f'Training set: {X_train.shape}, Validation set: {X_val.shape}, Test set: {X_test.shape}')

# Convert X_train and X_test to DataFrame to ensure feature selection compatibility
X_train_df = pd.DataFrame(X_train, columns=X.columns)
X_test_df = pd.DataFrame(X_test, columns=X.columns)

# 1. Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log))

# 2. Ridge Classifier (L2 Regularization)
ridge_clf = RidgeClassifier()
ridge_clf.fit(X_train, y_train)
y_pred_ridge = ridge_clf.predict(X_test)
print("Ridge Classifier Accuracy:", accuracy_score(y_test, y_pred_ridge))
print(classification_report(y_test, y_pred_ridge))

# 3. Lasso for Feature Selection + Logistic Regression
lasso = Lasso(alpha=0.001)
lasso.fit(X_train_df, y_train)
selected_features = np.where(lasso.coef_ != 0)[0]
X_train_selected = X_train_df.iloc[:, selected_features]
X_test_selected = X_test_df.iloc[:, selected_features]

log_reg_lasso = LogisticRegression()
log_reg_lasso.fit(X_train_selected, y_train)
y_pred_lasso = log_reg_lasso.predict(X_test_selected)
print("Logistic Regression (Lasso-selected features) Accuracy:", accuracy_score(y_test, y_pred_lasso))
print(classification_report(y_test, y_pred_lasso))

Training set: (7272, 12), Validation set: (909, 12), Test set: (910, 12)
Logistic Regression Accuracy: 0.8076923076923077
              precision    recall  f1-score   support

           0       0.83      0.96      0.89       724
           1       0.58      0.21      0.31       186

    accuracy                           0.81       910
   macro avg       0.70      0.59      0.60       910
weighted avg       0.78      0.81      0.77       910

Ridge Classifier Accuracy: 0.8065934065934066
              precision    recall  f1-score   support

           0       0.81      0.98      0.89       724
           1       0.63      0.13      0.21       186

    accuracy                           0.81       910
   macro avg       0.72      0.55      0.55       910
weighted avg       0.78      0.81      0.75       910

Logistic Regression (Lasso-selected features) Accuracy: 0.8065934065934066
              precision    recall  f1-score   support

           0       0.83      0.96      0.89     

Vamos a por el segundo df, veamos que tenemos que hacer aca

In [1751]:
print(dfCalls)
print(dfInternet)
print(dfMessages)
print(dfPlans)
print(dfUsers)

              id  user_id   call_date  duration
0        1000_93     1000  2018-12-27      8.52
1       1000_145     1000  2018-12-27     13.66
2       1000_247     1000  2018-12-27     14.48
3       1000_309     1000  2018-12-28      5.76
4       1000_380     1000  2018-12-30      4.22
...          ...      ...         ...       ...
137730  1499_199     1499  2018-11-21      8.72
137731  1499_200     1499  2018-10-20     10.89
137732  1499_201     1499  2018-09-21      8.12
137733  1499_202     1499  2018-10-10      0.37
137734  1499_203     1499  2018-12-29     13.86

[137735 rows x 4 columns]
              id  user_id session_date  mb_used
0        1000_13     1000   2018-12-29    89.86
1       1000_204     1000   2018-12-31     0.00
2       1000_379     1000   2018-12-28   660.40
3       1000_413     1000   2018-12-26   270.99
4       1000_442     1000   2018-12-27   880.22
...          ...      ...          ...      ...
104820  1499_215     1499   2018-10-20   218.06
104821  1499_

## dfCalls

In [1752]:
print(dfCalls.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137735 entries, 0 to 137734
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   id         137735 non-null  object 
 1   user_id    137735 non-null  int64  
 2   call_date  137735 non-null  object 
 3   duration   137735 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 4.2+ MB
None


No hay nulos

In [1753]:
check_duplicate(dfCalls)

There are 0 duplicates in the id
There are 137254 duplicates in the user_id
There are 137384 duplicates in the call_date
There are 134933 duplicates in the duration


Todo normal aqui

## dfInternet

In [1754]:
print(dfInternet.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104825 entries, 0 to 104824
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            104825 non-null  object 
 1   user_id       104825 non-null  int64  
 2   session_date  104825 non-null  object 
 3   mb_used       104825 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 3.2+ MB
None


Todo bien

In [1755]:
check_duplicate(dfInternet)

There are 0 duplicates in the id
There are 104336 duplicates in the user_id
There are 104474 duplicates in the session_date
There are 47201 duplicates in the mb_used


Obvio toca ver como combinaciones de las 3 que no son el id para ver duplicados pero por el apuro diremos que todo bien

## dfMessages

In [1756]:
print(dfMessages.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76051 entries, 0 to 76050
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            76051 non-null  object
 1   user_id       76051 non-null  int64 
 2   message_date  76051 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.7+ MB
None


No hay nulos, chevere

In [1757]:
check_duplicate(dfMessages)

There are 0 duplicates in the id
There are 75649 duplicates in the user_id
There are 75700 duplicates in the message_date


Complicado ver si hay duplicados considerando que es date y no datetime

df plans
df users


# dfPlans

In [1758]:
print(dfPlans.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   messages_included      2 non-null      int64  
 1   mb_per_month_included  2 non-null      int64  
 2   minutes_included       2 non-null      int64  
 3   usd_monthly_pay        2 non-null      int64  
 4   usd_per_gb             2 non-null      int64  
 5   usd_per_message        2 non-null      float64
 6   usd_per_minute         2 non-null      float64
 7   plan_name              2 non-null      object 
dtypes: float64(2), int64(5), object(1)
memory usage: 256.0+ bytes
None


In [1759]:
print(dfPlans)

   messages_included  mb_per_month_included  minutes_included  \
0                 50                  15360               500   
1               1000                  30720              3000   

   usd_monthly_pay  usd_per_gb  usd_per_message  usd_per_minute plan_name  
0               20          10             0.03            0.03      surf  
1               70           7             0.01            0.01  ultimate  


Normal que no tenga nulos

In [1760]:
check_duplicate(dfPlans)

There are 0 duplicates in the messages_included
There are 0 duplicates in the mb_per_month_included
There are 0 duplicates in the minutes_included
There are 0 duplicates in the usd_monthly_pay
There are 0 duplicates in the usd_per_gb
There are 0 duplicates in the usd_per_message
There are 0 duplicates in the usd_per_minute
There are 0 duplicates in the plan_name


XD

# dfUsers

In [1761]:
print(dfUsers.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_id     500 non-null    int64 
 1   first_name  500 non-null    object
 2   last_name   500 non-null    object
 3   age         500 non-null    int64 
 4   city        500 non-null    object
 5   reg_date    500 non-null    object
 6   plan        500 non-null    object
 7   churn_date  34 non-null     object
dtypes: int64(2), object(6)
memory usage: 31.4+ KB
None


Nulos solo en el churn date, esto no nos ayuda en ML y los nulos deben de ser aquellos que no se han ido.

In [1762]:
check_duplicate(dfUsers)

There are 0 duplicates in the user_id
There are 42 duplicates in the first_name
There are 101 duplicates in the last_name
There are 442 duplicates in the age
There are 427 duplicates in the city
There are 234 duplicates in the reg_date
There are 498 duplicates in the plan
There are 470 duplicates in the churn_date


Para ver verdaderos duplicados veriamos la combinacion de nombres, edad y ciudad. Aun asi no habria nada certero.

Creemos una variable donde si churn_date sea = null, esta nueva variable churn sea 0 cuando es nulo, casi contrario que sea 1. De ahi hacemos unas variables que sean conteo de llamadas mensajes e internet por usuario. De ahi unimos todo esto y hacemos drop a lo que no nos sivrve. Tenemos que hacer dummies de la ciudad. Ver la cantidad de plata que ha pagado multiplicando totales por lo que dice cada plan

In [1763]:
dfUsers['churn'] = dfUsers['churn_date'].notna().astype(int)

In [1764]:
dfUsers.iloc[:,-2].unique()


array([nan, '2018-12-18', '2018-11-16', '2018-09-07', '2018-12-30',
       '2018-10-07', '2018-12-31', '2018-11-24', '2018-11-11',
       '2018-12-12', '2018-11-14', '2018-12-27', '2018-11-29',
       '2018-12-22', '2018-11-30', '2018-10-13', '2018-07-31',
       '2018-12-15', '2018-12-19', '2018-10-03', '2018-10-22',
       '2018-08-16', '2018-12-26', '2018-09-01', '2018-11-21',
       '2018-08-19', '2018-12-10', '2018-09-17', '2018-11-18',
       '2018-09-18'], dtype=object)

In [1765]:
print(dfUsers[(~dfUsers.iloc[:,-2].isna()) & (dfUsers.iloc[:,-1])==0])

     user_id first_name   last_name  age  \
0       1000   Anamaria       Bauer   45   
1       1001     Mickey   Wilkerson   28   
2       1002     Carlee     Hoffman   36   
3       1003   Reynaldo     Jenkins   52   
4       1004    Leonila    Thompson   40   
..       ...        ...         ...  ...   
495     1495      Fidel      Sharpe   67   
496     1496      Ariel    Shepherd   49   
497     1497      Donte     Barrera   49   
498     1498       Scot  Williamson   51   
499     1499      Shena     Dickson   37   

                                          city    reg_date      plan  \
0        Atlanta-Sandy Springs-Roswell, GA MSA  2018-12-24  ultimate   
1              Seattle-Tacoma-Bellevue, WA MSA  2018-08-13      surf   
2         Las Vegas-Henderson-Paradise, NV MSA  2018-10-21      surf   
3                                Tulsa, OK MSA  2018-01-28      surf   
4              Seattle-Tacoma-Bellevue, WA MSA  2018-05-23      surf   
..                                     

In [1766]:
print(dfUsers.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_id     500 non-null    int64 
 1   first_name  500 non-null    object
 2   last_name   500 non-null    object
 3   age         500 non-null    int64 
 4   city        500 non-null    object
 5   reg_date    500 non-null    object
 6   plan        500 non-null    object
 7   churn_date  34 non-null     object
 8   churn       500 non-null    int64 
dtypes: int64(3), object(6)
memory usage: 35.3+ KB
None


Listo, ahora el conteo en los otros.

In [1767]:
print(dfCalls)
print(dfInternet)
print(dfMessages)
print(dfPlans)

              id  user_id   call_date  duration
0        1000_93     1000  2018-12-27      8.52
1       1000_145     1000  2018-12-27     13.66
2       1000_247     1000  2018-12-27     14.48
3       1000_309     1000  2018-12-28      5.76
4       1000_380     1000  2018-12-30      4.22
...          ...      ...         ...       ...
137730  1499_199     1499  2018-11-21      8.72
137731  1499_200     1499  2018-10-20     10.89
137732  1499_201     1499  2018-09-21      8.12
137733  1499_202     1499  2018-10-10      0.37
137734  1499_203     1499  2018-12-29     13.86

[137735 rows x 4 columns]
              id  user_id session_date  mb_used
0        1000_13     1000   2018-12-29    89.86
1       1000_204     1000   2018-12-31     0.00
2       1000_379     1000   2018-12-28   660.40
3       1000_413     1000   2018-12-26   270.99
4       1000_442     1000   2018-12-27   880.22
...          ...      ...          ...      ...
104820  1499_215     1499   2018-10-20   218.06
104821  1499_

In [1768]:
dfCalls.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137735 entries, 0 to 137734
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   id         137735 non-null  object 
 1   user_id    137735 non-null  int64  
 2   call_date  137735 non-null  object 
 3   duration   137735 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 4.2+ MB


In [1769]:
dfCalls['call_date'] = pd.to_datetime(dfCalls['call_date'])

In [1770]:
dfUsers['plan_name'] = dfUsers['plan']
dfUsers["reg_date"] = pd.to_datetime(dfUsers["reg_date"])

In [1771]:
dfPlans = dfPlans.merge(dfUsers[['user_id', 'plan_name','reg_date']], on='plan_name', how='inner')

In [1772]:
dfCalls = dfCalls.merge(dfPlans[['user_id','minutes_included','usd_per_minute','reg_date']], on='user_id', how='inner')

In [1773]:
dfCalls["billing_month"] = dfCalls.apply(lambda row: get_billing_month(row["reg_date"], row["call_date"]), axis=1)

df_agg = dfCalls.groupby(["user_id", "billing_month"]).agg(
    reg_date=("reg_date", "first"),
    total_minutes=("duration", "sum"),
    minutes_included=("minutes_included", "first"),
    usd_per_minute=("usd_per_minute", "first")
).reset_index()

df_agg["extra_minutes"] = np.maximum(df_agg["total_minutes"] - df_agg["minutes_included"], 0)
df_agg["extra_charges_minutes"] = df_agg["extra_minutes"] * df_agg["usd_per_minute"]

In [1774]:
df_agg[df_agg.loc[:,'extra_charges_minutes'] > 0]

Unnamed: 0,user_id,billing_month,reg_date,total_minutes,minutes_included,usd_per_minute,extra_minutes,extra_charges_minutes
9,1003,2018-12-28,2018-01-28,1041.00,500,0.03,541.00,16.23
23,1007,2018-10-11,2018-07-11,610.65,500,0.03,110.65,3.32
25,1007,2018-12-11,2018-07-11,579.43,500,0.03,79.43,2.38
30,1009,2018-06-22,2018-04-22,775.52,500,0.03,275.52,8.27
31,1009,2018-07-22,2018-04-22,826.74,500,0.03,326.74,9.80
...,...,...,...,...,...,...,...,...
2197,1488,2018-11-15,2018-05-15,751.66,500,0.03,251.66,7.55
2198,1488,2018-12-15,2018-05-15,678.14,500,0.03,178.14,5.34
2215,1491,2018-10-27,2018-04-27,551.89,500,0.03,51.89,1.56
2230,1494,2018-10-19,2018-04-19,585.82,500,0.03,85.82,2.57


Bueno, ahora es hora de obtener un total haciendo group por user_id y ya estamos

In [1775]:
df_group = df_agg.groupby('user_id').agg(
    extra_minutes=('extra_minutes', 'sum'),
    extra_charges_minutes =('extra_charges_minutes', 'sum')
).reset_index()

df_group

Unnamed: 0,user_id,extra_minutes,extra_charges_minutes
0,1000,0.00,0.00
1,1001,0.00,0.00
2,1002,0.00,0.00
3,1003,541.00,16.23
4,1004,0.00,0.00
...,...,...,...
476,1495,78.03,2.34
477,1496,0.00,0.00
478,1497,0.00,0.00
479,1498,0.00,0.00


In [1776]:
df_calls_total = dfCalls.groupby('user_id').agg(
    latest = ('call_date','max'),
    total_calls=('user_id', 'size'),        
    total_minutes=('duration', 'sum')
).reset_index()

df_calls_total['promedio_minutes_por_ocasion'] = df_calls_total['total_minutes'] / df_calls_total['total_calls']

df_calls_total = df_calls_total.merge(df_group,on = 'user_id',how='inner')

Agreguemos la cantidad de meses

In [1777]:
df_calls_total

Unnamed: 0,user_id,latest,total_calls,total_minutes,promedio_minutes_por_ocasion,extra_minutes,extra_charges_minutes
0,1000,2018-12-31,16,116.83,7.30,0.00,0.00
1,1001,2018-12-31,261,1640.46,6.29,0.00,0.00
2,1002,2018-12-31,113,777.13,6.88,0.00,0.00
3,1003,2018-12-31,149,1041.00,6.99,541.00,16.23
4,1004,2018-12-30,370,2618.95,7.08,0.00,0.00
...,...,...,...,...,...,...,...
476,1495,2018-12-31,253,1666.41,6.59,78.03,2.34
477,1496,2018-12-31,195,1376.21,7.06,0.00,0.00
478,1497,2018-12-31,54,276.53,5.12,0.00,0.00
479,1498,2018-12-31,451,3029.97,6.72,0.00,0.00


In [1778]:
dfCalls.nunique()
#vemos que solo hay 481 user_id, para los que no esten ponemos 0

id                  137735
user_id                481
call_date              351
duration              2802
minutes_included         2
usd_per_minute           2
reg_date               259
billing_month          307
dtype: int64

In [1779]:
df_calls_total

Unnamed: 0,user_id,latest,total_calls,total_minutes,promedio_minutes_por_ocasion,extra_minutes,extra_charges_minutes
0,1000,2018-12-31,16,116.83,7.30,0.00,0.00
1,1001,2018-12-31,261,1640.46,6.29,0.00,0.00
2,1002,2018-12-31,113,777.13,6.88,0.00,0.00
3,1003,2018-12-31,149,1041.00,6.99,541.00,16.23
4,1004,2018-12-30,370,2618.95,7.08,0.00,0.00
...,...,...,...,...,...,...,...
476,1495,2018-12-31,253,1666.41,6.59,78.03,2.34
477,1496,2018-12-31,195,1376.21,7.06,0.00,0.00
478,1497,2018-12-31,54,276.53,5.12,0.00,0.00
479,1498,2018-12-31,451,3029.97,6.72,0.00,0.00


In [1780]:
dfInternet.nunique()

id              104825
user_id            489
session_date       351
mb_used          57624
dtype: int64

In [1781]:
dfInternet['session_date'] = pd.to_datetime(dfInternet['session_date'])

In [1782]:
dfPlans['usd_per_mb'] = dfPlans['usd_per_gb'].astype(float) / 1000

In [1783]:
dfPlans

Unnamed: 0,messages_included,mb_per_month_included,minutes_included,usd_monthly_pay,usd_per_gb,usd_per_message,usd_per_minute,plan_name,user_id,reg_date,usd_per_mb
0,50,15360,500,20,10,0.03,0.03,surf,1001,2018-08-13,0.01
1,50,15360,500,20,10,0.03,0.03,surf,1002,2018-10-21,0.01
2,50,15360,500,20,10,0.03,0.03,surf,1003,2018-01-28,0.01
3,50,15360,500,20,10,0.03,0.03,surf,1004,2018-05-23,0.01
4,50,15360,500,20,10,0.03,0.03,surf,1005,2018-11-29,0.01
...,...,...,...,...,...,...,...,...,...,...,...
495,1000,30720,3000,70,7,0.01,0.01,ultimate,1482,2018-01-01,0.01
496,1000,30720,3000,70,7,0.01,0.01,ultimate,1487,2018-08-16,0.01
497,1000,30720,3000,70,7,0.01,0.01,ultimate,1490,2018-01-20,0.01
498,1000,30720,3000,70,7,0.01,0.01,ultimate,1493,2018-03-06,0.01


In [1784]:
dfInternet

Unnamed: 0,id,user_id,session_date,mb_used
0,1000_13,1000,2018-12-29,89.86
1,1000_204,1000,2018-12-31,0.00
2,1000_379,1000,2018-12-28,660.40
3,1000_413,1000,2018-12-26,270.99
4,1000_442,1000,2018-12-27,880.22
...,...,...,...,...
104820,1499_215,1499,2018-10-20,218.06
104821,1499_216,1499,2018-12-30,304.72
104822,1499_217,1499,2018-09-22,292.75
104823,1499_218,1499,2018-12-07,0.00


In [1785]:
dfInternet = dfInternet.merge(dfPlans[['user_id','mb_per_month_included','usd_per_gb','reg_date']], on='user_id', how='inner')

In [1786]:
dfInternet

Unnamed: 0,id,user_id,session_date,mb_used,mb_per_month_included,usd_per_gb,reg_date
0,1000_13,1000,2018-12-29,89.86,30720,7,2018-12-24
1,1000_204,1000,2018-12-31,0.00,30720,7,2018-12-24
2,1000_379,1000,2018-12-28,660.40,30720,7,2018-12-24
3,1000_413,1000,2018-12-26,270.99,30720,7,2018-12-24
4,1000_442,1000,2018-12-27,880.22,30720,7,2018-12-24
...,...,...,...,...,...,...,...
104820,1499_215,1499,2018-10-20,218.06,15360,10,2018-05-06
104821,1499_216,1499,2018-12-30,304.72,15360,10,2018-05-06
104822,1499_217,1499,2018-09-22,292.75,15360,10,2018-05-06
104823,1499_218,1499,2018-12-07,0.00,15360,10,2018-05-06


In [1787]:
dfInternet["billing_month"] = dfInternet.apply(lambda row: get_billing_month(row["reg_date"], row["session_date"]), axis=1)

df_agg = dfInternet.groupby(["user_id", "billing_month"]).agg(
    reg_date=("reg_date", "first"),
    total_mb=("mb_used", "sum"),
    mb_included=("mb_per_month_included", "first"),
    usd_per_gb=("usd_per_gb", "first")
).reset_index()

df_agg["extra_mb"] = np.maximum(df_agg["total_mb"] - df_agg["mb_included"], 0)

df_agg["extra_charges_gb"] = ((df_agg["extra_mb"] // 1000) * df_agg["usd_per_gb"])

In [1788]:
df_agg[df_agg.loc[:,'extra_charges_gb'] > 0]

Unnamed: 0,user_id,billing_month,reg_date,total_mb,mb_included,usd_per_gb,extra_mb,extra_charges_gb
3,1001,2018-10-13,2018-08-13,22330.49,15360,10,6970.49,60.00
4,1001,2018-11-13,2018-08-13,18504.30,15360,10,3144.30,30.00
5,1001,2018-12-13,2018-08-13,19369.18,15360,10,4009.18,40.00
7,1002,2018-11-21,2018-10-21,19345.08,15360,10,3985.08,30.00
9,1003,2018-12-28,2018-01-28,27044.14,15360,10,11684.14,110.00
...,...,...,...,...,...,...,...,...
2271,1498,2018-11-04,2018-02-04,19168.55,15360,10,3808.55,30.00
2272,1498,2018-12-04,2018-02-04,23137.69,15360,10,7777.69,70.00
2274,1499,2018-10-06,2018-05-06,19492.43,15360,10,4132.43,40.00
2275,1499,2018-11-06,2018-05-06,16813.83,15360,10,1453.83,10.00


Bueno, ahora es hora de obtener un total haciendo group por user_id y ya estamos

In [1789]:
df_group = df_agg.groupby('user_id').agg(
    extra_mb=('extra_mb', 'sum'),
    extra_charges_gb =('extra_charges_gb', 'sum')
).reset_index()

df_group

Unnamed: 0,user_id,extra_mb,extra_charges_gb
0,1000,0.00,0.00
1,1001,14123.97,130.00
2,1002,3985.08,30.00
3,1003,11684.14,110.00
4,1004,43103.97,410.00
...,...,...,...
484,1495,37450.96,350.00
485,1496,2108.26,20.00
486,1497,0.00,0.00
487,1498,59117.95,540.00


In [1790]:
df_internet_total = dfInternet.groupby('user_id').agg(
    total_internet=('user_id', 'size'),        
    total_mb=('mb_used', 'sum'),
    latest = ('session_date','max')
).reset_index()

df_internet_total['promedio_mb_por_ocasion'] = df_internet_total['total_mb'] / df_internet_total['total_internet']

df_internet_total = df_internet_total.merge(df_group,on = 'user_id',how='inner')

In [1791]:
df_internet_total

Unnamed: 0,user_id,total_internet,total_mb,latest,promedio_mb_por_ocasion,extra_mb,extra_charges_gb
0,1000,5,1901.47,2018-12-31,380.29,0.00,0.00
1,1001,245,80437.94,2018-12-31,328.32,14123.97,130.00
2,1002,124,40293.33,2018-12-31,324.95,3985.08,30.00
3,1003,52,27044.14,2018-12-31,520.08,11684.14,110.00
4,1004,460,156352.81,2018-12-31,339.90,43103.97,410.00
...,...,...,...,...,...,...,...
484,1495,290,98890.96,2018-12-31,341.00,37450.96,350.00
485,1496,225,64268.64,2018-12-31,285.64,2108.26,20.00
486,1497,31,11106.55,2018-12-30,358.28,0.00,0.00
487,1498,657,227525.13,2018-12-31,346.31,59117.95,540.00


In [1792]:
dfMessages.nunique()

id              76051
user_id           402
message_date      351
dtype: int64

In [1793]:
dfMessages['message_date'] = pd.to_datetime(dfMessages['message_date'])

In [1794]:
dfPlans

Unnamed: 0,messages_included,mb_per_month_included,minutes_included,usd_monthly_pay,usd_per_gb,usd_per_message,usd_per_minute,plan_name,user_id,reg_date,usd_per_mb
0,50,15360,500,20,10,0.03,0.03,surf,1001,2018-08-13,0.01
1,50,15360,500,20,10,0.03,0.03,surf,1002,2018-10-21,0.01
2,50,15360,500,20,10,0.03,0.03,surf,1003,2018-01-28,0.01
3,50,15360,500,20,10,0.03,0.03,surf,1004,2018-05-23,0.01
4,50,15360,500,20,10,0.03,0.03,surf,1005,2018-11-29,0.01
...,...,...,...,...,...,...,...,...,...,...,...
495,1000,30720,3000,70,7,0.01,0.01,ultimate,1482,2018-01-01,0.01
496,1000,30720,3000,70,7,0.01,0.01,ultimate,1487,2018-08-16,0.01
497,1000,30720,3000,70,7,0.01,0.01,ultimate,1490,2018-01-20,0.01
498,1000,30720,3000,70,7,0.01,0.01,ultimate,1493,2018-03-06,0.01


In [1795]:
dfMessages

Unnamed: 0,id,user_id,message_date
0,1000_125,1000,2018-12-27
1,1000_160,1000,2018-12-31
2,1000_223,1000,2018-12-31
3,1000_251,1000,2018-12-27
4,1000_255,1000,2018-12-26
...,...,...,...
76046,1497_526,1497,2018-12-24
76047,1497_536,1497,2018-12-24
76048,1497_547,1497,2018-12-31
76049,1497_558,1497,2018-12-24


In [1796]:
dfMessages = dfMessages.merge(dfPlans[['user_id','messages_included','usd_per_message','reg_date']], on='user_id', how='inner')

In [1797]:
dfMessages["billing_month"] = dfMessages.apply(lambda row: get_billing_month(row["reg_date"], row["message_date"]), axis=1)

df_agg = dfMessages.groupby(["user_id", "billing_month"]).agg(
    reg_date=("reg_date", "first"),
    total_messages=("message_date", "count"),
    messages_included=("messages_included", "first"),
    usd_per_message=("usd_per_message", "first")
).reset_index()

df_agg["extra_messages"] = np.maximum(df_agg["total_messages"] - df_agg["messages_included"], 0)
df_agg["extra_charges_messages"] = df_agg["extra_messages"] * df_agg["usd_per_message"]

In [1798]:
df_agg[df_agg.loc[:,'extra_charges_messages'] > 0]

Unnamed: 0,user_id,billing_month,reg_date,total_messages,messages_included,usd_per_message,extra_messages,extra_charges_messages
3,1001,2018-10-13,2018-08-13,53,50,0.03,3,0.09
21,1007,2018-08-11,2018-07-11,51,50,0.03,1,0.03
23,1007,2018-10-11,2018-07-11,59,50,0.03,9,0.27
38,1014,2018-12-25,2018-11-25,64,50,0.03,14,0.42
39,1015,2018-12-05,2018-12-05,68,50,0.03,18,0.54
...,...,...,...,...,...,...,...,...
1779,1489,2018-11-24,2018-05-24,91,50,0.03,41,1.23
1780,1489,2018-12-24,2018-05-24,86,50,0.03,36,1.08
1783,1491,2018-06-27,2018-04-27,54,50,0.03,4,0.12
1784,1491,2018-07-27,2018-04-27,64,50,0.03,14,0.42


Bueno, ahora es hora de obtener un total haciendo group por user_id y ya estamos

In [1799]:
df_group = df_agg.groupby('user_id').agg(
    extra_messages=('extra_messages', 'sum'),
    extra_charges_messages =('extra_charges_messages', 'sum')
).reset_index()

df_group

Unnamed: 0,user_id,extra_messages,extra_charges_messages
0,1000,0,0.00
1,1001,3,0.09
2,1002,0,0.00
3,1003,0,0.00
4,1004,0,0.00
...,...,...,...
397,1491,19,0.57
398,1492,0,0.00
399,1494,0,0.00
400,1496,0,0.00


In [1800]:
dfMessages

Unnamed: 0,id,user_id,message_date,messages_included,usd_per_message,reg_date,billing_month
0,1000_125,1000,2018-12-27,1000,0.01,2018-12-24,2018-12-24
1,1000_160,1000,2018-12-31,1000,0.01,2018-12-24,2018-12-24
2,1000_223,1000,2018-12-31,1000,0.01,2018-12-24,2018-12-24
3,1000_251,1000,2018-12-27,1000,0.01,2018-12-24,2018-12-24
4,1000_255,1000,2018-12-26,1000,0.01,2018-12-24,2018-12-24
...,...,...,...,...,...,...,...
76046,1497_526,1497,2018-12-24,1000,0.01,2018-12-10,2018-12-10
76047,1497_536,1497,2018-12-24,1000,0.01,2018-12-10,2018-12-10
76048,1497_547,1497,2018-12-31,1000,0.01,2018-12-10,2018-12-10
76049,1497_558,1497,2018-12-24,1000,0.01,2018-12-10,2018-12-10


In [1801]:
df_messages_total = dfMessages.groupby('user_id').agg(
    total_messages=('message_date', 'count'),
    latest = ('message_date','max')
).reset_index()

df_messages_total = df_messages_total.merge(df_group,on = 'user_id',how='inner')

In [1802]:
df_messages_total

Unnamed: 0,user_id,total_messages,latest,extra_messages,extra_charges_messages
0,1000,11,2018-12-31,0,0.00
1,1001,207,2018-12-30,3,0.09
2,1002,88,2018-12-31,0,0.00
3,1003,50,2018-12-31,0,0.00
4,1004,177,2018-12-30,0,0.00
...,...,...,...,...,...
397,1491,409,2018-12-31,19,0.57
398,1492,108,2018-12-31,0,0.00
399,1494,174,2018-12-31,0,0.00
400,1496,65,2018-12-30,0,0.00


In [1803]:
df_ci = df_calls_total.merge(df_internet_total,how='outer',on = 'user_id')

df_ci['latest'] = df_ci[['latest_x', 'latest_y']].max(axis=1)

df_ci = df_ci.drop(columns=['latest_x', 'latest_y'])

df_cim = df_ci.merge(df_messages_total,how='outer',on = 'user_id')

df_cim['latest'] = df_cim[['latest_x', 'latest_y']].max(axis=1)

df_cim = df_cim.drop(columns=['latest_x', 'latest_y'])


In [1804]:
df_cim

Unnamed: 0,user_id,total_calls,total_minutes,promedio_minutes_por_ocasion,extra_minutes,extra_charges_minutes,total_internet,total_mb,promedio_mb_por_ocasion,extra_mb,extra_charges_gb,total_messages,extra_messages,extra_charges_messages,latest
0,1000,16.00,116.83,7.30,0.00,0.00,5.00,1901.47,380.29,0.00,0.00,11.00,0.00,0.00,2018-12-31
1,1001,261.00,1640.46,6.29,0.00,0.00,245.00,80437.94,328.32,14123.97,130.00,207.00,3.00,0.09,2018-12-31
2,1002,113.00,777.13,6.88,0.00,0.00,124.00,40293.33,324.95,3985.08,30.00,88.00,0.00,0.00,2018-12-31
3,1003,149.00,1041.00,6.99,541.00,16.23,52.00,27044.14,520.08,11684.14,110.00,50.00,0.00,0.00,2018-12-31
4,1004,370.00,2618.95,7.08,0.00,0.00,460.00,156352.81,339.90,43103.97,410.00,177.00,0.00,0.00,2018-12-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
485,1495,253.00,1666.41,6.59,78.03,2.34,290.00,98890.96,341.00,37450.96,350.00,,,,2018-12-31
486,1496,195.00,1376.21,7.06,0.00,0.00,225.00,64268.64,285.64,2108.26,20.00,65.00,0.00,0.00,2018-12-31
487,1497,54.00,276.53,5.12,0.00,0.00,31.00,11106.55,358.28,0.00,0.00,50.00,0.00,0.00,2018-12-31
488,1498,451.00,3029.97,6.72,0.00,0.00,657.00,227525.13,346.31,59117.95,540.00,,,,2018-12-31


In [1805]:
#primero hallar cual de los 3 latest es el mayor y de ahi hacer el fill_na
df_cim.fillna(0,inplace=True) 

In [1806]:
df_cim.shape

(490, 15)

In [1807]:
df_cim

Unnamed: 0,user_id,total_calls,total_minutes,promedio_minutes_por_ocasion,extra_minutes,extra_charges_minutes,total_internet,total_mb,promedio_mb_por_ocasion,extra_mb,extra_charges_gb,total_messages,extra_messages,extra_charges_messages,latest
0,1000,16.00,116.83,7.30,0.00,0.00,5.00,1901.47,380.29,0.00,0.00,11.00,0.00,0.00,2018-12-31
1,1001,261.00,1640.46,6.29,0.00,0.00,245.00,80437.94,328.32,14123.97,130.00,207.00,3.00,0.09,2018-12-31
2,1002,113.00,777.13,6.88,0.00,0.00,124.00,40293.33,324.95,3985.08,30.00,88.00,0.00,0.00,2018-12-31
3,1003,149.00,1041.00,6.99,541.00,16.23,52.00,27044.14,520.08,11684.14,110.00,50.00,0.00,0.00,2018-12-31
4,1004,370.00,2618.95,7.08,0.00,0.00,460.00,156352.81,339.90,43103.97,410.00,177.00,0.00,0.00,2018-12-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
485,1495,253.00,1666.41,6.59,78.03,2.34,290.00,98890.96,341.00,37450.96,350.00,0.00,0.00,0.00,2018-12-31
486,1496,195.00,1376.21,7.06,0.00,0.00,225.00,64268.64,285.64,2108.26,20.00,65.00,0.00,0.00,2018-12-31
487,1497,54.00,276.53,5.12,0.00,0.00,31.00,11106.55,358.28,0.00,0.00,50.00,0.00,0.00,2018-12-31
488,1498,451.00,3029.97,6.72,0.00,0.00,657.00,227525.13,346.31,59117.95,540.00,0.00,0.00,0.00,2018-12-31


Ahora tenemos que agregar el costo por cada uno y el costo total. Para esto unamosle ya a una tabla grande y vamos botando columnas

In [1813]:
df = df_cim.merge(dfUsers,how='outer',on = 'user_id')

In [1814]:
df.drop(['first_name', 'last_name', 'city','churn_date','plan'], axis=1, inplace=True, errors='ignore')

In [1815]:
df.shape

(500, 19)

Agregamos months, days y hacemos dummies. De ahi estamos listos para ml


In [1816]:
df["reg_date"] = pd.to_datetime(df["reg_date"], errors='coerce')
df["latest"] = pd.to_datetime(df["latest"], errors='coerce')

df["days_since_reg"] = (df["latest"] - df["reg_date"]).dt.days

df["months_since_reg"] = (df["latest"].dt.year - df["reg_date"].dt.year) * 12 + (df["latest"].dt.month - df["reg_date"].dt.month)

In [1822]:
df

Unnamed: 0,user_id,total_calls,total_minutes,promedio_minutes_por_ocasion,extra_minutes,extra_charges_minutes,total_internet,total_mb,promedio_mb_por_ocasion,extra_mb,extra_charges_gb,total_messages,extra_messages,extra_charges_messages,age,churn,plan_name,days_since_reg,months_since_reg
0,1000,16.00,116.83,7.30,0.00,0.00,5.00,1901.47,380.29,0.00,0.00,11.00,0.00,0.00,45,0,ultimate,7.00,0.00
1,1001,261.00,1640.46,6.29,0.00,0.00,245.00,80437.94,328.32,14123.97,130.00,207.00,3.00,0.09,28,0,surf,140.00,4.00
2,1002,113.00,777.13,6.88,0.00,0.00,124.00,40293.33,324.95,3985.08,30.00,88.00,0.00,0.00,36,0,surf,71.00,2.00
3,1003,149.00,1041.00,6.99,541.00,16.23,52.00,27044.14,520.08,11684.14,110.00,50.00,0.00,0.00,52,0,surf,337.00,11.00
4,1004,370.00,2618.95,7.08,0.00,0.00,460.00,156352.81,339.90,43103.97,410.00,177.00,0.00,0.00,40,0,surf,222.00,7.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,1495,253.00,1666.41,6.59,78.03,2.34,290.00,98890.96,341.00,37450.96,350.00,0.00,0.00,0.00,67,0,surf,118.00,3.00
496,1496,195.00,1376.21,7.06,0.00,0.00,225.00,64268.64,285.64,2108.26,20.00,65.00,0.00,0.00,49,0,surf,314.00,10.00
497,1497,54.00,276.53,5.12,0.00,0.00,31.00,11106.55,358.28,0.00,0.00,50.00,0.00,0.00,49,0,ultimate,21.00,0.00
498,1498,451.00,3029.97,6.72,0.00,0.00,657.00,227525.13,346.31,59117.95,540.00,0.00,0.00,0.00,51,0,surf,330.00,10.00


In [1820]:
df.drop(['latest','reg_date'],inplace=True, axis =1)

In [1823]:
df = pd.get_dummies(df, columns=['plan_name'])

In [1826]:
df.fillna(0,inplace=True)

In [1828]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 20 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   user_id                       500 non-null    int64  
 1   total_calls                   500 non-null    float64
 2   total_minutes                 500 non-null    float64
 3   promedio_minutes_por_ocasion  500 non-null    float64
 4   extra_minutes                 500 non-null    float64
 5   extra_charges_minutes         500 non-null    float64
 6   total_internet                500 non-null    float64
 7   total_mb                      500 non-null    float64
 8   promedio_mb_por_ocasion       500 non-null    float64
 9   extra_mb                      500 non-null    float64
 10  extra_charges_gb              500 non-null    float64
 11  total_messages                500 non-null    float64
 12  extra_messages                500 non-null    float64
 13  extra

Tenemos que vencer a churn/total

In [1831]:
print((df["churn"]==1).sum())

print((df["churn"]==0).sum())

print((df["churn"]==1).sum()/((df["churn"] == 0).sum() + (df["churn"]==1).sum())*100)

print((1-(df["churn"]==1).sum()/((df["churn"] == 0).sum() + (df["churn"]==1).sum()))*100)


34
466
6.800000000000001
93.19999999999999


tenemos que ser mejor que 93.19%

In [1842]:
df_hp = dfUsers.copy()

In [1843]:
df_hp.drop(['user_id','first_name','last_name','city','churn_date','reg_date','plan'],inplace=True, axis = 1)

In [1844]:
df_hp = pd.get_dummies(df_hp, columns=['plan_name'])

In [1845]:
df_hp

Unnamed: 0,age,churn,plan_name_surf,plan_name_ultimate
0,45,0,False,True
1,28,0,True,False
2,36,0,True,False
3,52,0,True,False
4,40,0,True,False
...,...,...,...,...
495,67,0,True,False
496,49,0,True,False
497,49,0,False,True
498,51,0,True,False


In [1846]:
X = df.drop(columns=["churn"]) 
y = df["churn"]

# Normalize data
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

# Split data (80% train, 10% validation, 10% test)
X_train, X_temp, y_train, y_temp = train_test_split(X_normalized, y, test_size=0.2, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

print(f'Training set: {X_train.shape}, Validation set: {X_val.shape}, Test set: {X_test.shape}')

# Convert X_train and X_test to DataFrame to ensure feature selection compatibility
X_train_df = pd.DataFrame(X_train, columns=X.columns)
X_test_df = pd.DataFrame(X_test, columns=X.columns)

# 1. Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log))

# 2. Ridge Classifier (L2 Regularization)
ridge_clf = RidgeClassifier()
ridge_clf.fit(X_train, y_train)
y_pred_ridge = ridge_clf.predict(X_test)
print("Ridge Classifier Accuracy:", accuracy_score(y_test, y_pred_ridge))
print(classification_report(y_test, y_pred_ridge))

# 3. Lasso for Feature Selection + Logistic Regression
lasso = Lasso(alpha=0.001)
lasso.fit(X_train_df, y_train)
selected_features = np.where(lasso.coef_ != 0)[0]
X_train_selected = X_train_df.iloc[:, selected_features]
X_test_selected = X_test_df.iloc[:, selected_features]

log_reg_lasso = LogisticRegression()
log_reg_lasso.fit(X_train_selected, y_train)
y_pred_lasso = log_reg_lasso.predict(X_test_selected)
print("Logistic Regression (Lasso-selected features) Accuracy:", accuracy_score(y_test, y_pred_lasso))
print(classification_report(y_test, y_pred_lasso))

Training set: (400, 3), Validation set: (50, 3), Test set: (50, 3)
Logistic Regression Accuracy: 0.94
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        47
           1       0.00      0.00      0.00         3

    accuracy                           0.94        50
   macro avg       0.47      0.50      0.48        50
weighted avg       0.88      0.94      0.91        50

Ridge Classifier Accuracy: 0.94
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        47
           1       0.00      0.00      0.00         3

    accuracy                           0.94        50
   macro avg       0.47      0.50      0.48        50
weighted avg       0.88      0.94      0.91        50

Logistic Regression (Lasso-selected features) Accuracy: 0.94
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        47
           1       0.00      0.00      0.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Lo único que podría hacer que esto quizá mejore es balancear las clases. VOy a usar el standard en lugar del minmax a ver si hay mejoras

In [1862]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression, RidgeClassifier, LassoCV
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform

# Load data (assuming df is already defined)
X = df.drop(columns=["churn"])
y = df["churn"]

# Count instances in each class
class_counts = y.value_counts()
minority_class = class_counts.idxmin()
majority_class = class_counts.idxmax()

# Get indices of each class
minority_indices = y[y == minority_class].index
majority_indices = y[y == majority_class].index

# Randomly undersample the majority class
np.random.seed(42)  # For reproducibility
undersampled_majority_indices = np.random.choice(majority_indices, size=len(minority_indices), replace=False)

# Combine the undersampled majority class with the minority class
undersampled_indices = np.concatenate([minority_indices, undersampled_majority_indices])
df_resampled = df.loc[undersampled_indices].sample(frac=1, random_state=42)  # Shuffle

# Separate features and labels again
X_resampled = df_resampled.drop(columns=["churn"])
y_resampled = df_resampled["churn"]

# Feature Selection: Remove low-variance features
selector = VarianceThreshold(threshold=0.01)  # Remove features with variance < 0.01
X_resampled_selected = selector.fit_transform(X_resampled)

# Polynomial Features for Non-Linearity (degree=2)
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_resampled_poly = poly.fit_transform(X_resampled_selected)

# Normalize using StandardScaler
scaler = StandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled_poly)

# Split data (80% train, 10% validation, 10% test)
X_train, X_temp, y_train, y_temp = train_test_split(
    X_resampled_scaled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Convert back to DataFrame with feature names
feature_names = poly.get_feature_names_out()
X_train_df = pd.DataFrame(X_train, columns=feature_names)
X_test_df = pd.DataFrame(X_test, columns=feature_names)

# === Logistic Regression with Hyperparameter Tuning ===
log_reg = LogisticRegression(max_iter=1000)
param_grid = {"C": np.logspace(-3, 3, 10)}
grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring="accuracy")
grid_search.fit(X_train, y_train)
best_log_reg = grid_search.best_estimator_

y_pred_log = best_log_reg.predict(X_test)
print("Best Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log))

# === Ridge Classifier with Cross-Validation ===
ridge_clf = RidgeClassifier()
cv_scores = cross_val_score(ridge_clf, X_train, y_train, cv=5, scoring="accuracy")
ridge_clf.fit(X_train, y_train)
y_pred_ridge = ridge_clf.predict(X_test)
print("Ridge Classifier Accuracy (Cross-Val Mean):", np.mean(cv_scores))
print(classification_report(y_test, y_pred_ridge))

# === Lasso for Feature Selection + Logistic Regression ===
lasso = LassoCV(cv=5, max_iter=5000)
lasso.fit(X_train_df, y_train)
selected_features = np.where(lasso.coef_ != 0)[0]

X_train_selected = X_train_df.iloc[:, selected_features]
X_test_selected = X_test_df.iloc[:, selected_features]

log_reg_lasso = LogisticRegression(max_iter=1000)
log_reg_lasso.fit(X_train_selected, y_train)
y_pred_lasso = log_reg_lasso.predict(X_test_selected)
print("Logistic Regression (Lasso-selected features) Accuracy:", accuracy_score(y_test, y_pred_lasso))
print(classification_report(y_test, y_pred_lasso))


Best Logistic Regression Accuracy: 0.5714285714285714
              precision    recall  f1-score   support

           0       0.60      0.75      0.67         4
           1       0.50      0.33      0.40         3

    accuracy                           0.57         7
   macro avg       0.55      0.54      0.53         7
weighted avg       0.56      0.57      0.55         7

Ridge Classifier Accuracy (Cross-Val Mean): 0.5872727272727272
              precision    recall  f1-score   support

           0       0.25      0.25      0.25         4
           1       0.00      0.00      0.00         3

    accuracy                           0.14         7
   macro avg       0.12      0.12      0.12         7
weighted avg       0.14      0.14      0.14         7



  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


ValueError: at least one array or dtype is required