# Entrenamiento y Evaluacion de Modelos de Regresion

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

In [10]:
# Cargar el dataset
file_path = r'C:\Users\Paul Tandazo\Desktop\DataMining Prueba Practica\data\ml\analytic_df_processed.csv'
df = pd.read_csv(file_path)

# Mostrar las primeras filas del dataset para conocer su estructura
df.head()


Unnamed: 0,user_id,first_name,last_name,age,city,reg_date,plan,churn_date,total_calls,total_call_duration,...,total_sms,total_mb,extra_usage,log_total_calls,log_total_mb_used,log_total_messages,total_calls_scaled,total_mb_scaled,total_messages_scaled,region
0,1000,Anamaria,Bauer,45,"Atlanta-Sandy Springs-Roswell, GA MSA",2018-12-24,ultimate,Activo,15.0,110.98,...,11.0,1901.47,-28818.53,2.772589,7.550908,2.484907,1.0,0.203518,0.733333,"Atlanta-Sandy Springs-Roswell, GA MSA"
1,1001,Mickey,Wilkerson,28,"Seattle-Tacoma-Bellevue, WA MSA",2018-08-13,surf,Activo,4.0,13.46,...,4.0,1724.33,-13635.67,1.609438,7.453174,1.609438,0.266667,0.184558,0.266667,"Seattle-Tacoma-Bellevue, WA MSA"
2,1002,Carlee,Hoffman,36,"Las Vegas-Henderson-Paradise, NV MSA",2018-10-21,surf,Activo,7.0,22.31,...,5.0,1646.11,-13713.89,2.079442,7.406778,1.791759,0.466667,0.176186,0.333333,"Las Vegas-Henderson-Paradise, NV MSA"
3,1003,Reynaldo,Jenkins,52,"Tulsa, OK MSA",2018-01-28,surf,Activo,15.0,84.56,...,15.0,6135.25,-9224.75,2.772589,8.721969,2.772589,1.0,0.656667,1.0,"Tulsa, OK MSA"
4,1004,Leonila,Thompson,40,"Seattle-Tacoma-Bellevue, WA MSA",2018-05-23,surf,Activo,1.0,11.08,...,3.0,497.08,-14862.92,0.693147,6.210761,1.386294,0.066667,0.053203,0.2,"Seattle-Tacoma-Bellevue, WA MSA"


In [11]:
# Preprocesamiento de los datos
X = df[['total_calls', 'total_call_duration', 'total_internet_sessions', 'total_mb_used', 'total_messages']]
y = df['usd_monthly_pay']  # Variable objetivo: el costo mensual del usuario



In [12]:
# Separar los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Entrenar varios modelos de regresion ( Se usó Regresion Lineal, Gradient Descent)

In [13]:
# Modelo 1: Regresión Lineal
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Modelo 2: Regresión por Gradient Descent (SGD)
sgd_model = SGDRegressor(max_iter=1000, tol=1e-3)
sgd_model.fit(X_train, y_train)

y_pred_linear = linear_model.predict(X_test)
y_pred_sgd = sgd_model.predict(X_test)

def evaluate_model(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return rmse, mae, r2

# Evaluación del modelo de Regresión Lineal
rmse_linear, mae_linear, r2_linear = evaluate_model(y_test, y_pred_linear)
# Evaluación del modelo de Regresión por SGD
rmse_sgd, mae_sgd, r2_sgd = evaluate_model(y_test, y_pred_sgd)



In [14]:
print("Modelo de Regresión Lineal:")
print(f"RMSE: {rmse_linear}")
print(f"MAE: {mae_linear}")
print(f"R²: {r2_linear}")



Modelo de Regresión Lineal:
RMSE: 23.257483947726335
MAE: 21.631971557814218
R²: 0.005679118421418994


In [15]:
print("\nModelo de Regresión por Gradient Descent (SGD):")
print(f"RMSE: {rmse_sgd}")
print(f"MAE: {mae_sgd}")
print(f"R²: {r2_sgd}")


Modelo de Regresión por Gradient Descent (SGD):
RMSE: 2389675213572983.5
MAE: 1710199527718797.0
R²: -1.0497330195524054e+28


In [16]:
if r2_linear > r2_sgd:
    print("\nEl mejor modelo es la Regresión Lineal.")
else:
    print("\nEl mejor modelo es el Gradient Descent (SGD).")



El mejor modelo es la Regresión Lineal.
