In [None]:
# Importación de librerías y configuración
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
pd.set_option('display.max_columns', None)

import numpy as np
import json

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, FunctionTransformer
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from joblib import dump, load
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variables globales
global df_traffic, resultados, modelo, modelo_clasificacion


In [None]:
import pandas as pd

# Cargar el DataFrame desde la URL
df = pd.read_csv(
    'https://raw.githubusercontent.com/ElProfeAlejo/Bootcamp_Databases/main/traffic_site.csv',
    dtype={'date': object, 'fullVisitorId': object, 'visitId': object}
)

df.head(3)


Unnamed: 0,channelGrouping,date,device,fullVisitorId,geoNetwork,sessionId,socialEngagementType,totals,trafficSource,visitId,visitNumber,visitStartTime
0,Organic Search,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",9674781571160116268,"{""continent"": ""Asia"", ""subContinent"": ""Southea...",9674781571160116268_1472804607,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1472804607,1,1472804607
1,Organic Search,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",8590648239310839049,"{""continent"": ""Europe"", ""subContinent"": ""Easte...",8590648239310839049_1472835928,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1472835928,1,1472835928
2,Affiliates,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",9772828344252850532,"{""continent"": ""Americas"", ""subContinent"": ""Sou...",9772828344252850532_1472856802,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"",...","{""campaign"": ""Data Share Promo"", ""source"": ""Pa...",1472856802,1,1472856802


In [None]:
def preprocesamiento():

  global df_traffic, df

  df_traffic = df.copy()

  diccionarios = ["device","geoNetwork","trafficSource","totals"]
  for columna in diccionarios:
      df_traffic = df_traffic.join(pd.DataFrame([json.loads(linea) for linea in df_traffic[columna]]))
      df_traffic = df_traffic.drop(columna,axis=1)

  df_traffic = df_traffic.drop("adwordsClickInfo",axis=1)

  columnas = [
      'channelGrouping', 'date', 'fullVisitorId', 'sessionId',
      'socialEngagementType', 'visitId', 'visitNumber', 'visitStartTime',
      'browser', 'browserVersion', 'browserSize', 'operatingSystem',
      'operatingSystemVersion', 'isMobile', 'mobileDeviceBranding',
      'mobileDeviceModel', 'mobileInputSelector', 'mobileDeviceInfo',
      'mobileDeviceMarketingName', 'flashVersion', 'language', 'screenColors',
      'screenResolution', 'deviceCategory', 'continent', 'subContinent',
      'country', 'region', 'metro', 'city', 'cityId', 'networkDomain',
      'latitude', 'longitude', 'networkLocation', 'campaign', 'source',
        'medium', 'keyword', 'isTrueDirect', 'referralPath', 'adContent',
        'campaignCode', 'visits', 'hits', 'pageviews', 'bounces', 'newVisits',
        'transactionRevenue']

  for columna in columnas:
    df_tamaño = len(df_traffic[columna].unique())

  columnas_eliminar = ["socialEngagementType","browserVersion","browserSize",
                      "operatingSystemVersion","mobileDeviceBranding","mobileDeviceModel",
                      "mobileInputSelector","mobileDeviceInfo","mobileDeviceMarketingName",
                      "flashVersion","language","screenColors",
                      "screenResolution","latitude","longitude",
                      "networkLocation","visits"]

  df_traffic = df_traffic.drop(columnas_eliminar,axis=1)

  columnas_mantener = ["channelGrouping","date","fullVisitorId",
                      "sessionId","visitId","referralPath",
                      "city","metro","visitNumber",
                      "visitStartTime","browser","operatingSystem",
                      "isMobile","deviceCategory","campaign",
                      "source","medium","hits",
                      "pageviews","bounces","newVisits","transactionRevenue"]

  df_traffic = df_traffic[columnas_mantener]

  nulos = df_traffic.isnull().sum()

  if nulos.any() > 0:
    df_traffic = df_traffic.fillna(0)
  else:
    df_traffic = df_traffic
  cuant = ['bounces', 'hits','newVisits','pageviews', 'transactionRevenue']
  for columna in cuant:
      df_traffic[columna] = pd.to_numeric(df_traffic[columna])

  df_traffic["transactionRevenue"] = df_traffic["transactionRevenue"]/1000000

  df_traffic['date'] = pd.to_datetime(df_traffic['date'], format='%Y%m%d')

  df_traffic['visitStartTime'] = pd.to_datetime(df_traffic['visitStartTime'], unit='s')

  df_traffic['date'] = pd.to_datetime(df_traffic['date'], format='%Y%m%d')

  df_traffic['visitStartTime'] = pd.to_datetime(df_traffic['visitStartTime'], unit='s')

  # df_traffic['interaction'] = df_traffic['source'] + '_' + df_traffic['deviceCategory'] # Variable extra, que agregue.

  return df_traffic

df_traffic = preprocesamiento()

In [None]:
def feature_engineering():

    # Volvemos a definir las variables globales.

    global df_traffic

    # Usando lambda y .year estaremos extrayendo de la columna date el año y almacenando
    # ese año en la nueva columna llamada year.

    df_traffic['year'] = df_traffic['date'].apply(lambda x:x.year)

    # Usando lambda y .month estaremos extrayendo de la columna date el mes y almacenando
    # ese mes en la nueva columna llamada month.

    df_traffic['month'] = df_traffic['date'].apply(lambda x:x.month)

    # Usando lambda y .quarterMonth estaremos extrayendo de la columna date el cuarto de mes y almacenando
    # ese cuarto de mes en la nueva columna llamada quarterMonth.

    df_traffic['quarterMonth'] = df_traffic['date'].apply(lambda x: 1 if x.day <= 7 else 2 if x.day <= 14 else 3 if x.day <= 21 else 4)

    # Usando lambda y .day estaremos extrayendo de la columna date el día y almacenando
    # ese día en la nueva columna llamada day.

    df_traffic['day'] = df_traffic['date'].apply(lambda x:x.day)

    # Usando lambda y .weekday estaremos extrayendo de la columna date el día de fin de semana y almacenando
    # ese día en la nueva columna llamada weekday.

    df_traffic['weekday'] = df_traffic['date'].apply(lambda x:x.weekday)

    df_traffic['time_range'] = (
        pd.cut(
            df_traffic['visitStartTime'].dt.hour,
            bins=[0, 6, 12, 18, 24],
            labels=['madrugada', 'mañana', 'tarde', 'noche'],
            ordered=False
        )
        .astype('object')
    )

    eliminar_columna = ['date','fullVisitorId','sessionId','visitId','visitStartTime','year']

    df_traffic  = df_traffic.drop(eliminar_columna,axis=1)

    cualitativas = df_traffic.dtypes[df_traffic.dtypes == object].keys()

    for columna in cualitativas:
        lbl = LabelEncoder()
        strings = list(df_traffic[columna].values.astype('str'))
        lbl.fit(strings)
        df_traffic[columna] = lbl.transform(strings)
    df_traffic.reset_index(inplace=True, drop=True)

    #Columnas que no son necesarios y que puede no aportar nada al modelo
    columnas_sin_aporte = ["referralPath"]

    df_traffic.drop(columnas_sin_aporte,axis = 1, inplace=True)

    return df_traffic

df_traffic = feature_engineering()

In [None]:
def crea_modelos():
    global df_traffic, resultados, modelo, modelo_clasificacion
    """
        Como nuestra base tiene muchos 0 y pocos valores mayores a 1. Vamos a aplicar el método de de Gamma Hurdle
        en donde primero convertiremos nuestra variable en binario, donde 0 == 0 y 1 serán los > 0. Esto nos va a
        permitir trabajar nuestros datos dentro de un modelo de clasificación, para que de esta forma podamos
        crear una nueva variable predictora a partir de las predicciones realizadas con nuestro modelo de clasificación
        entrenado. Esto con el fin de generar un variable que nos permita añadir valor a nuestros datos y predicir de
        manera más precisa.

    """

    # Modelo de clasificación

    ## Eliminando nuestra columna transactionRevenue en X para quedarnos solo con las variables predictoras

    X = df_traffic.drop('transactionRevenue',axis=1)

    ## Seleccionando solamente nuestra variable transactionRevenue y creando la copy sobnre la cual trabajremos
    ## guardando los resultados en y, esta varible y hace referencia a la variable respuesta o dependiente, que
    ## es lo que buscamos predecir.

    y = df_traffic.transactionRevenue.copy().apply(lambda x: 0 if x == 0 else 1)

    ## Ahora separamos nuestras variables en un porcentaje para test y otro para entrenamiento
    ## las que sean train, serán con las cuales entrenemos nuestro modelo y las test serán las
    ## que usaremos para entrenar el modelo.

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state = 42
    )

    ## creando nuestro modelo

    model_clasi = RandomForestClassifier(n_estimators=100, random_state=42)
    ## entrenando el modelo
    model_clasi.fit(X_train,y_train)
    modelo_clasificacion = model_clasi

    # Modelo de regresión

    ## Añdiendo una nueva columna a nuestras variables predictoras usando el modelo de clasificación
    ## para determinar las posibles compras que se harán, usando las variables predictoras del modelo
    ## anterior que serían lo mismo para este modelo sin la variable que estamos anexando en estos
    ## momentos

    X = df_traffic.drop('transactionRevenue',axis=1)
    X["clasificacion"] = model_clasi.predict(X)
    y = df_traffic.transactionRevenue.copy()
    train_x, test_x, train_y, test_y = train_test_split(
        X, y, test_size=0.30, random_state = 42
        )


    # Regresión lineal

    model = LinearRegression()
    model.fit(train_x, train_y)

    ## Predicciones

    predictions = model.predict(test_x)
    predictions[predictions < 0] = 0


    r2 = r2_score(test_y, predictions)*100
    mse = mean_squared_error(test_y, predictions)
    rmse = np.sqrt(mse)

    print(f"R-cuadrado (R²): {r2}")
    print(f"Error cuadrático medio (MSE): {mse}")
    print(f"Raíz del error cuadrático medio (RMSE): {rmse}")

    # LGB MODEL

    lgb_params = {
        "objective": "regression",
        "metric": "rmse",
        'subsample': 0.67,
        'reg_lambda': 0.031,
        'reg_alpha': 0.029,
        'num_leaves': 16,
        'min_child_samples': 14,
        'max_depth': 5,
        'learning_rate': 0.042,
        'colsample_bytree': 0.66,
        'verbosity': -1
    }

    lgb_train = lgb.Dataset(train_x, label = train_y)
    lgb_val = lgb.Dataset(test_x, label = test_y)
    lgb_model = lgb.train(
        lgb_params, lgb_train,
        num_boost_round = 700,
        valid_sets = [lgb_val],
        callbacks = [lgb.early_stopping(stopping_rounds = 500)]
    )
    # XGBoost Model
    params = {
        "objective": "reg:squarederror", # para problemas de regresión
        "eval_metric": "rmse", # métrica de evaluación: error cuadrático medio
        'subsample': 0.88,
        'reg_lambda': 0.095,
        'reg_alpha': 0.01,
        'random_state': 42,
        'n_estimators': 230,
        'min_child_weight': 5,
        'max_depth': 6,
        'learning_rate': 0.046,
        'gamma': 0.105,
        'colsample_bytree': 0.9
    }
    dtrain = xgb.DMatrix(train_x, label = train_y)
    dtest = xgb.DMatrix(test_x, label = test_y)
    evals = [(dtrain,"train"),(dtest,"test")]

    xgb_model = xgb.train(
        params,dtrain,num_boost_round = 1000,
        evals = evals, early_stopping_rounds = 10,
        verbose_eval = False
        )

    models = {
                "Regresión Lineal": LinearRegression(),
                "Random Forest": RandomForestRegressor(),
                "LightGBM": lgb_model,
                "XGBoost": xgb_model
            }

    results = {"Model": [], "R2": [], "MSE": [], "RMSE": []}

    for name, model in models.items():
        if name == "LightGBM":
            predictions = model.predict(
                test_x, num_iterations=model.best_iteration
                )
            predictions[predictions < 0] = 0
            modelo = model
        elif name == "Random Forest":
        #Definir el espacio de búsqueda de hiperparámetros
            param_dist = {
                "n_estimators": [
                    int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)
                    ],
                "max_features": ["sqrt"],
                "max_depth": [int(x) for x in np.linspace(10, 110, num = 11)] + [None],
                "min_samples_split": [2, 5, 10],
                "min_samples_leaf": [1, 2, 4],
                "bootstrap": [True, False]
            }
            #Realizar busqueda aleatoria
            random_search = RandomizedSearchCV(
                model, param_distributions=param_dist,
                n_iter = 50, cv = 5, scoring = "r2", random_state = 42
            )
            random_search.fit(train_x,train_y)

            #Obtener el mejor modelo
            predictions = random_search.best_estimator_.predict(test_x)
            predictions[predictions < 0] = 0

        elif name == "XGBoost":
            predictions = model.predict(dtest)
            predictions[predictions < 0] = 0
        else:
            model.fit(train_x, train_y)
            predictions = model.predict(test_x)
            predictions[predictions < 0] = 0

        r2  = r2_score(test_y, predictions) * 100
        mse = mean_squared_error(test_y, predictions)
        rmse = np.sqrt(mse)

        results["Model"].append(name)
        results["R2"].append(r2)
        results["MSE"].append(mse)
        results["RMSE"].append(rmse)

    dump(modelo, "modelo.joblib")
    dump(modelo_clasificacion, "modelo_clasificacion.joblib")
    resultados = results

In [None]:
crea_modelos()
for i, model in enumerate(resultados['Model']):
    print('-------------------------------')
    print(f"Modelo: {model}")
    print(f"R-cuadrado (R²): {resultados['R2'][i]}")
    print(f"Error cuadrático medio (MSE): {resultados['MSE'][i]}")
    print(f"Raíz del error cuadrático medio (RMSE): {resultados['RMSE'][i]}")
    print('-------------------------------')

R-cuadrado (R²): 31.194466651556763
Error cuadrático medio (MSE): 262.6771107898274
Raíz del error cuadrático medio (RMSE): 16.207316582020216
Training until validation scores don't improve for 500 rounds
Did not meet early stopping. Best iteration is:
[507]	valid_0's rmse: 14.0579
-------------------------------
Modelo: Regresión Lineal
R-cuadrado (R²): 31.194466651556763
Error cuadrático medio (MSE): 262.6771107898274
Raíz del error cuadrático medio (RMSE): 16.207316582020216
-------------------------------
-------------------------------
Modelo: Random Forest
R-cuadrado (R²): 39.767269972461115
Error cuadrático medio (MSE): 229.94894056693727
Raíz del error cuadrático medio (RMSE): 15.164067415008985
-------------------------------
-------------------------------
Modelo: LightGBM
R-cuadrado (R²): 50.251718676195445
Error cuadrático medio (MSE): 189.9227310501209
Raíz del error cuadrático medio (RMSE): 13.781245627668092
-------------------------------
-------------------------------

In [None]:
# Cargar la base de datos de prueba
df_traffic = pd.read_csv(
    'https://raw.githubusercontent.com/ElProfeAlejo/Bootcamp_Databases/main/traffic_test.csv',
    dtype={'date':object,'fullVisitorId':object,'visitId':object}
)

# Aplicar preprocesamiento y feature engineering
preprocesamiento()
feature_engineering()

# Cargar el modelo entrenado de regresión
modelo = load('modelo.joblib')

# Cargar el modelo entrenado de clasificación
modelo_clasificacion = load('modelo_clasificacion.joblib')

# Pronosticar con la nueva base
X = df_traffic.drop('transactionRevenue', axis=1)
X['clasificacion'] = modelo_clasificacion.predict(X)
y = df_traffic.transactionRevenue.copy()
predictions = modelo.predict(X)
predictions[predictions < 1] = 0

# Calcular métricas de evaluación de los pronósticos
r2 = r2_score(y, predictions) * 100
mse = mean_squared_error(y, predictions)
rmse = np.sqrt(mse)

# Imprimir métricas de evaluación
print(f"R-cuadrado (R²): {r2}")
print(f"Error cuadrático medio (MSE): {mse}")
print(f"Raíz del error cuadrático medio (RMSE): {rmse}")


R-cuadrado (R²): 79.25451957080296
Error cuadrático medio (MSE): 90.55803414139726
Raíz del error cuadrático medio (RMSE): 9.516198513135235


In [None]:
df_resultados = pd.DataFrame({
    'transactionRevenue': y,
    'predictions': predictions
})
df_resultados[df_resultados.transactionRevenue>0].sample(10)

Unnamed: 0,transactionRevenue,predictions
7189,102.35,160.267459
5900,702.9,347.046936
789,114.65,191.647149
5349,0.09,38.795829
6337,100.78,91.323425
6215,33.59,22.821609
5135,109.57,94.509819
6253,15.19,20.151941
10371,83.96,0.0
2599,37.8,72.000254
