In [18]:
import pandas as pd
import numpy as np

In [19]:
# Load the processed analytical dataset that feeds the feature engineering below.
analytic_dataset_path = '../data/processed/dataset_analitico.csv'
df = pd.read_csv(analytic_dataset_path)

In [20]:
# Show the column index of the loaded frame so the inputs to the
# feature-engineering cell are visible in the notebook output.
df.columns

Index(['user_id', 'year_month', 'total_calls', 'total_duration',
       'avg_duration', 'total_mb', 'sessions_count', 'total_messages',
       'first_name', 'last_name', 'age', 'city', 'reg_date', 'plan',
       'churn_date', 'messages_included', 'mb_per_month_included',
       'minutes_included', 'usd_monthly_pay', 'usd_per_gb', 'usd_per_message',
       'usd_per_minute'],
      dtype='object')

In [67]:
# Feature engineering with help from Claude Sonnet 3.7
#
# Builds the ML-ready frame `df_fe` from the analytical frame `df`
# (presumably one row per user and month -- TODO confirm). `df` itself is
# never mutated, so this cell can be re-run safely.
df_fe = df.copy()

# Convert date columns to datetime. `year_month` gets '-01' appended so it
# parses as the first day of the month (presumably stored as 'YYYY-MM').
df_fe['reg_date'] = pd.to_datetime(df_fe['reg_date'])
df_fe['churn_date'] = pd.to_datetime(df_fe['churn_date'])
df_fe['year_month'] = pd.to_datetime(df_fe['year_month'] + '-01')

# Flag for churned users: a real churn date is anything other than the
# '2099-12-31' sentinel. A missing date (NaT) is treated as "not churned";
# without the notna() guard, NaT != sentinel evaluates to True.
no_churn_sentinel = pd.to_datetime('2099-12-31')
df_fe['is_churned'] = (
    df_fe['churn_date'].notna() & (df_fe['churn_date'] != no_churn_sentinel)
).astype(int)

# Usage beyond plan limits, floored at zero. Kept as local Series because
# they are only needed to price the overage below.
extra_minutes = (df_fe['total_duration'] - df_fe['minutes_included']).clip(lower=0)
extra_messages = (df_fe['total_messages'] - df_fe['messages_included']).clip(lower=0)
extra_mb = (df_fe['total_mb'] - df_fe['mb_per_month_included']).clip(lower=0)

# Usage ratios (fraction of the plan allowance consumed; > 1 means overage)
df_fe['minutes_usage_ratio'] = df_fe['total_duration'] / df_fe['minutes_included']
df_fe['messages_usage_ratio'] = df_fe['total_messages'] / df_fe['messages_included']
df_fe['mb_usage_ratio'] = df_fe['total_mb'] / df_fe['mb_per_month_included']

# Cost of the extra usage
extra_minutes_cost = extra_minutes * df_fe['usd_per_minute']
extra_messages_cost = extra_messages * df_fe['usd_per_message']
extra_mb_cost = (extra_mb / 1024) * df_fe['usd_per_gb']  # Convert MB to GB

# Expected total cost: fixed monthly fee plus all overage charges
df_fe['total_cost'] = (df_fe['usd_monthly_pay'] +
                       extra_minutes_cost +
                       extra_messages_cost +
                       extra_mb_cost)

# Raw dates and plan parameters are no longer needed once the derived
# features exist.
df_fe = df_fe.drop(
    ['reg_date', 'churn_date', 'minutes_included', 'messages_included',
     'mb_per_month_included', 'usd_per_minute', 'usd_per_message', 'usd_per_gb'],
    axis=1,
)

# Region extraction from city is currently disabled; the raw city is dropped.
# df_fe['region'] = df_fe['city'].str.split(',').str[1].str.strip()
# df_fe = pd.get_dummies(df_fe, columns=['region'], prefix='region')
df_fe = df_fe.drop('city', axis=1)

# One-hot encode plan
df_fe = pd.get_dummies(df_fe, columns=['plan'], prefix='plan')

# User behavior patterns: every per-session feature divides by the session
# count, floored at 1 to avoid division by zero. (Previously calls and
# messages were divided by themselves, which always yields 0 or 1.)
sessions_floor = df_fe['sessions_count'].clip(lower=1)
df_fe['calls_per_session'] = df_fe['total_calls'] / sessions_floor
df_fe['mb_per_session'] = df_fe['total_mb'] / sessions_floor
df_fe['messages_per_session'] = df_fe['total_messages'] / sessions_floor
df_fe = df_fe.drop(['total_duration', 'total_messages', 'total_mb'], axis=1)

# Age groups. Left-closed bins make the intervals match their labels:
# [0, 18) -> '<18', [18, 26) -> '18-25', [26, 36) -> '26-35',
# [36, 51) -> '36-50', [51, inf) -> '50+'. With right-closed bins an
# 18-year-old would fall under '<18' and ages above 100 would become NaN.
df_fe['age_group'] = pd.cut(
    df_fe['age'],
    bins=[0, 18, 26, 36, 51, np.inf],
    labels=['<18', '18-25', '26-35', '36-50', '50+'],
    right=False,
)
df_fe = pd.get_dummies(df_fe, columns=['age_group'], prefix='age')
df_fe = df_fe.drop('age', axis=1)

# Previous total cost of the SAME user. The lag is computed on a copy sorted
# by (user_id, year_month) and grouped by user, so a user's first row never
# inherits another user's cost; assignment aligns on the (unique) index, so
# the row order of df_fe is left untouched. Note this is the previous
# observed row per user, which is the previous calendar month only if the
# user has no gaps. First observations have no history and are filled with 0.
df_fe['lag1_total_cost'] = (
    df_fe.sort_values(['user_id', 'year_month'])
         .groupby('user_id')['total_cost']
         .shift(1)
)
df_fe['lag1_total_cost'] = df_fe['lag1_total_cost'].fillna(0)

# Display shape comparison
print(f"Original DataFrame: {df.shape[1]} columns")
print(f"Feature Engineered DataFrame: {df_fe.shape[1]} columns")

Original DataFrame: 22 columns
Feature Engineered DataFrame: 24 columns


In [68]:
# Echo the final feature set, then persist it (without the index) for the
# modelling stage.
# NOTE(review): the printed columns still include first_name / last_name --
# confirm whether these identifiers should be written to the ML dataset.
ml_dataset_path = '../data/ml/dataset_ml.csv'
print(df_fe.columns)
df_fe.to_csv(ml_dataset_path, index=False)

Index(['user_id', 'year_month', 'total_calls', 'avg_duration',
       'sessions_count', 'first_name', 'last_name', 'usd_monthly_pay',
       'is_churned', 'minutes_usage_ratio', 'messages_usage_ratio',
       'mb_usage_ratio', 'total_cost', 'plan_surf', 'plan_ultimate',
       'calls_per_session', 'mb_per_session', 'messages_per_session',
       'age_<18', 'age_18-25', 'age_26-35', 'age_36-50', 'age_50+',
       'lag1_total_cost'],
      dtype='object')
