<a href="https://colab.research.google.com/github/Scanner20/ml/blob/main/TF_ML_Grupo3_RF_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns",None)
# The CSV is fetched over HTTP and uses ';' as its field separator.
path = "https://raw.githubusercontent.com/amankharwal/Website-data/master/marketing_campaign.csv"
data = pd.read_csv(path, sep=';')


# Preview the first rows; in a notebook cell this is the displayed value.
data.head()


In [None]:
# Print the DataFrame summary produced by pandas (info() writes it directly).
data.info()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score
)
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Data Loading
# Re-reads the same semicolon-separated CSV over HTTP so this cell does not
# depend on `data` having been defined by an earlier cell.
path = "https://raw.githubusercontent.com/amankharwal/Website-data/master/marketing_campaign.csv"
data = pd.read_csv(path, sep=';')

# 2. Initial Data Exploration
def explore_data(df):
    """Print a quick overview of a DataFrame to standard output.

    Three sections are emitted in order: the ``DataFrame.info()`` summary,
    the count of missing values per column, and the ``DataFrame.describe()``
    statistics.

    Args:
        df: The pandas DataFrame to summarise.

    Returns:
        None. The function only prints.
    """
    print("Dataset Information:")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    df.info()

    print("\nMissing Values:")
    print(df.isnull().sum())

    print("\nDescriptive Statistics:")
    print(df.describe())

# 3. Data Preprocessing
def preprocess_data(df):
    """Build the feature matrix and target used by the wine-spending model.

    Operates on a copy of ``df``: missing ``Income`` values are replaced by
    the column median, ``Age`` is derived as ``2024 - Year_Birth`` and then
    bucketed into ``AgeGroup`` ('Young' below 30, 'Middle' from 30 up to but
    not including 45, 'Senior' otherwise).

    Args:
        df: DataFrame containing the raw columns referenced below.

    Returns:
        A tuple ``(X, y)`` where ``X`` holds the selected feature columns and
        ``y`` is the ``MntWines`` column.
    """
    def age_bucket(years):
        # Guard clauses: each check only runs when the previous one failed.
        if years < 30:
            return 'Young'
        if years < 45:
            return 'Middle'
        return 'Senior'

    prepared = df.copy()

    # Median imputation for the income column.
    income_median = prepared['Income'].median()
    prepared['Income'] = prepared['Income'].fillna(income_median)

    # Age relative to the fixed reference year, then its coarse category.
    prepared['Age'] = 2024 - prepared['Year_Birth']
    prepared['AgeGroup'] = prepared['Age'].apply(age_bucket)

    numeric_columns = [
        'Income', 'Kidhome', 'Teenhome', 'Recency',
        'NumDealsPurchases', 'NumWebPurchases',
        'NumCatalogPurchases', 'NumStorePurchases',
        'NumWebVisitsMonth',
    ]
    categorical_columns = ['Education', 'Marital_Status', 'AgeGroup']
    selected = numeric_columns + categorical_columns

    return prepared[selected], prepared['MntWines']

# 4. Model Preparation and Training
def prepare_and_train_model(X, y):
    """Fit a random-forest regression pipeline, report metrics and plot importances.

    The data is split 80/20 (``random_state=42``). Numeric columns are
    median-imputed and standardised; categorical columns are filled with the
    constant ``'missing'`` and one-hot encoded. Test-set MSE, MAE and R-squared
    are printed, and a bar chart of feature importances is shown.

    Args:
        X: Feature DataFrame containing the numeric and categorical columns
            listed below.
        y: Target values aligned with ``X``.

    Returns:
        The fitted ``Pipeline`` (preprocessor followed by the regressor).
    """
    numeric_features = [
        'Income', 'Kidhome', 'Teenhome', 'Recency',
        'NumDealsPurchases', 'NumWebPurchases',
        'NumCatalogPurchases', 'NumStorePurchases',
        'NumWebVisitsMonth'
    ]
    categorical_features = ['Education', 'Marital_Status', 'AgeGroup']

    # Hold out 20% of the rows for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # One sub-pipeline per column family.
    numeric_steps = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    categorical_steps = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_steps, numeric_features),
            ('cat', categorical_steps, categorical_features)
        ])

    rf_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
    ])
    rf_pipeline.fit(X_train, y_train)

    # Score the held-out rows.
    y_pred = rf_pipeline.predict(X_test)
    scores = [
        ("Mean Squared Error", mean_squared_error(y_test, y_pred)),
        ("Mean Absolute Error", mean_absolute_error(y_test, y_pred)),
        ("R-squared Score", r2_score(y_test, y_pred)),
    ]
    print("\nModel Evaluation:")
    for label, score in scores:
        print(f"{label}: {score}")

    # Column names after preprocessing: numeric names are unchanged, the
    # categorical ones come from the fitted one-hot encoder.
    fitted_preprocessor = rf_pipeline.named_steps['preprocessor']
    fitted_encoder = fitted_preprocessor.named_transformers_['cat'].named_steps['onehot']
    encoded_names = list(fitted_encoder.get_feature_names_out(categorical_features))
    feature_names = numeric_features + encoded_names

    importances = rf_pipeline.named_steps['regressor'].feature_importances_
    ranked = pd.Series(importances, index=feature_names).sort_values(ascending=False)

    # Horizontal bar chart, most important feature first.
    plt.figure(figsize=(10, 6))
    sns.barplot(x=ranked, y=ranked.index)
    plt.title('Feature Importance in Predicting Wine Purchases')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.show()

    return rf_pipeline

# Main Execution
if __name__ == '__main__':
    # Print the overview of the raw data.
    explore_data(data)

    # Derive the model inputs.
    X, y = preprocess_data(data)

    # Drop any row that still has a missing feature, and keep only the
    # matching target entries (selected by index label).
    X = X.dropna(axis=0, how='any')
    y = y.loc[X.index]

    # Fit the pipeline; metrics and the importance chart are produced inside.
    model = prepare_and_train_model(X, y)

In [None]:
#

# COMPARATIVA DE MODELOS

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
import warnings
warnings.filterwarnings('ignore')

# A. Ingeniería de características mejorada
def advanced_feature_engineering(df):
    """Derive engineered features and the ``MntWines`` target from raw data.

    Works on a copy of ``df`` and adds:

    * ``Income_per_Family_Member``: income divided by kids + teens + 1.
    * ``Total_Purchases``: sum of the four purchase-count columns.
    * ``Customer_Age``: ``2024 - Year_Birth``.
    * ``Days_Since_First_Purchase``: days between ``Dt_Customer`` and 2024-01-01.
    * ``Income_Category``: 'Low' / 'Medium' / 'High' split at the 25% and 75%
      income quantiles ('Unknown' for missing income).
    * ``Total_Product_Spending`` and ``Spending_Diversity``: total amount and
      number of categories with non-zero spend across the product columns
      *other than the target*.

    The two product aggregates deliberately exclude ``MntWines``: including
    the target in its own predictors leaks the answer into the features and
    makes every evaluation score meaningless. The column names are kept so
    downstream code that selects them keeps working.

    Args:
        df: DataFrame containing the raw campaign columns referenced below.

    Returns:
        A tuple ``(X, y)``. Rows with a missing value in any selected feature
        or in the target are dropped, so rows with missing income (the
        'Unknown' category) do not appear in the result.
    """
    target = 'MntWines'
    df_processed = df.copy()

    # Interaction feature: income shared across household members.
    household_size = df_processed['Kidhome'] + df_processed['Teenhome'] + 1
    df_processed['Income_per_Family_Member'] = df_processed['Income'] / household_size

    # Purchase counts across all channels.
    df_processed['Total_Purchases'] = (
        df_processed['NumDealsPurchases'] +
        df_processed['NumWebPurchases'] +
        df_processed['NumCatalogPurchases'] +
        df_processed['NumStorePurchases']
    )

    # Time-based features relative to the fixed 2024 reference.
    df_processed['Customer_Age'] = 2024 - df_processed['Year_Birth']
    tenure = pd.to_datetime('2024-01-01') - pd.to_datetime(df_processed['Dt_Customer'])
    df_processed['Days_Since_First_Purchase'] = tenure.dt.days

    # The quantiles are the same for every row, so compute them once rather
    # than on each call made by apply().
    income_q25 = df_processed['Income'].quantile(0.25)
    income_q75 = df_processed['Income'].quantile(0.75)

    def categorize_income(income):
        if pd.isna(income):
            return 'Unknown'
        if income < income_q25:
            return 'Low'
        if income < income_q75:
            return 'Medium'
        return 'High'

    df_processed['Income_Category'] = df_processed['Income'].apply(categorize_income)

    # Product aggregates, computed without the target column (see docstring).
    product_columns = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    other_product_columns = [column for column in product_columns if column != target]
    df_processed['Total_Product_Spending'] = df_processed[other_product_columns].sum(axis=1)
    df_processed['Spending_Diversity'] = (df_processed[other_product_columns] > 0).sum(axis=1)

    features = [
        'Income', 'Kidhome', 'Teenhome', 'Recency',
        'NumDealsPurchases', 'NumWebPurchases',
        'NumCatalogPurchases', 'NumStorePurchases',
        'NumWebVisitsMonth', 'Education', 'Marital_Status',
        'Income_per_Family_Member', 'Total_Purchases',
        'Customer_Age', 'Days_Since_First_Purchase',
        'Income_Category', 'Total_Product_Spending',
        'Spending_Diversity'
    ]

    # Keep only rows that are complete for every feature and the target.
    df_processed_cleaned = df_processed.dropna(subset=features + [target])

    X = df_processed_cleaned[features]
    y = df_processed_cleaned[target]

    return X, y

# B. Simulación de recopilación de más datos (técnica de bootstrapping)
def bootstrap_data(X, y, n_iterations=5, random_state=None):
    """Draw bootstrap resamples (sampling rows with replacement) of ``X`` and ``y``.

    Each resample has the same number of rows as ``X``; the same positional
    indices are applied to ``X`` and ``y`` so the pairs stay aligned.

    Args:
        X: Feature DataFrame.
        y: Target Series with the same length as ``X``.
        n_iterations: Number of resampled datasets to produce.
        random_state: ``None`` (default) draws from NumPy's global random
            state exactly as before; an ``int`` seed or a
            ``numpy.random.RandomState`` instance makes the resamples
            reproducible.

    Returns:
        A list of ``(X_bootstrap, y_bootstrap)`` tuples, one per iteration.
    """
    if random_state is None:
        # Legacy behaviour: use the module-level generator (honours np.random.seed).
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    n_rows = len(X)
    bootstrapped_datasets = []

    for _ in range(n_iterations):
        # Positional row indices, sampled with replacement.
        indices = rng.randint(0, n_rows, n_rows)
        bootstrapped_datasets.append((X.iloc[indices], y.iloc[indices]))

    return bootstrapped_datasets

# C. Comparación de múltiples algoritmos y métodos de ensemble
def compare_models(X, y):
    """Train several regressors on the same split and compare their test metrics.

    The data is split 80/20 (``random_state=42``). Every model is wrapped in
    the same preprocessing: numeric columns (int64/float64) are
    median-imputed and standardised, object columns are filled with
    ``'missing'`` and one-hot encoded. MSE, MAE and R2 on the test split are
    collected and shown as one bar chart per metric.

    Args:
        X: Feature DataFrame.
        y: Target values aligned with ``X``.

    Returns:
        Dict mapping model name to ``{'MSE': ..., 'MAE': ..., 'R2': ...}``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    models = {
        'Random Forest': RandomForestRegressor(random_state=42),
        'Gradient Boosting': GradientBoostingRegressor(random_state=42),
        'HistGradient Boosting': HistGradientBoostingRegressor(random_state=42),
        'Linear Regression': LinearRegression()
    }

    # The column groups do not depend on the model, so resolve them once.
    numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_columns = X.select_dtypes(include=['object']).columns

    def build_pipeline(model):
        # Every model gets the same preprocessing; a fresh preprocessor is
        # built per model so no fitted state is shared between pipelines.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', Pipeline([
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', StandardScaler())
                ]), numeric_columns),
                ('cat', Pipeline([
                    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                    ('onehot', OneHotEncoder(handle_unknown='ignore'))
                ]), categorical_columns)
            ])
        return Pipeline([
            ('preprocessor', preprocessor),
            ('regressor', model)
        ])

    results = {}
    for name, model in models.items():
        pipeline = build_pipeline(model)
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        results[name] = {
            'MSE': mean_squared_error(y_test, y_pred),
            'MAE': mean_absolute_error(y_test, y_pred),
            'R2': r2_score(y_test, y_pred)
        }

    # One subplot per metric: MSE, MAE and R2 live on very different scales,
    # so sharing a single axis would flatten the smaller ones to nothing.
    metrics = ['MSE', 'MAE', 'R2']
    model_names = list(results)
    fig, axes = plt.subplots(1, len(metrics), figsize=(12, 6))
    for ax, metric in zip(axes, metrics):
        ax.bar(model_names, [results[model_name][metric] for model_name in model_names])
        ax.set_title(metric)
        ax.set_ylabel('Valor de Métrica')
        ax.tick_params(axis='x', labelrotation=45)

    fig.suptitle('Comparación de Modelos')
    fig.tight_layout()
    plt.show()

    return results

# Ejecución principal
if __name__ == '__main__':
    # Load the raw campaign data.
    path = "https://raw.githubusercontent.com/amankharwal/Website-data/master/marketing_campaign.csv"
    data = pd.read_csv(path, sep=';')

    # A. Feature engineering.
    X, y = advanced_feature_engineering(data)

    # B. Bootstrap resamples; only how many were produced is reported here.
    bootstrapped_datasets = bootstrap_data(X, y)
    print("Número de conjuntos de datos bootstrap:", len(bootstrapped_datasets))

    # C. Model comparison.
    model_comparison_results = compare_models(X, y)

    # Print every metric of every model.
    for model in model_comparison_results:
        metrics = model_comparison_results[model]
        print(f"\nResultados para {model}:")
        for metric in metrics:
            value = metrics[metric]
            print(f"{metric}: {value}")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import warnings
warnings.filterwarnings('ignore')

def advanced_feature_engineering(df):
    """Derive engineered features and the ``MntWines`` target from raw data.

    Works on a copy of ``df`` and adds:

    * ``Income_per_Family_Member``: income divided by kids + teens + 1.
    * ``Total_Purchases``: sum of the four purchase-count columns.
    * ``Customer_Age``: ``2024 - Year_Birth``.
    * ``Days_Since_First_Purchase``: days between ``Dt_Customer`` and 2024-01-01.
    * ``Income_Category``: 'Low' / 'Medium' / 'High' split at the 25% and 75%
      income quantiles ('Unknown' for missing income).
    * ``Total_Product_Spending`` and ``Spending_Diversity``: total amount and
      number of categories with non-zero spend across the product columns
      *other than the target*.

    The two product aggregates deliberately exclude ``MntWines``: including
    the target in its own predictors leaks the answer into the features and
    makes every evaluation score meaningless. The column names are kept so
    downstream code that selects them keeps working.

    Args:
        df: DataFrame containing the raw campaign columns referenced below.

    Returns:
        A tuple ``(X, y)``. Rows with a missing value in any selected feature
        or in the target are dropped, so rows with missing income (the
        'Unknown' category) do not appear in the result.
    """
    target = 'MntWines'
    df_processed = df.copy()

    # Interaction feature: income shared across household members.
    household_size = df_processed['Kidhome'] + df_processed['Teenhome'] + 1
    df_processed['Income_per_Family_Member'] = df_processed['Income'] / household_size

    # Purchase counts across all channels.
    df_processed['Total_Purchases'] = (
        df_processed['NumDealsPurchases'] +
        df_processed['NumWebPurchases'] +
        df_processed['NumCatalogPurchases'] +
        df_processed['NumStorePurchases']
    )

    # Time-based features relative to the fixed 2024 reference.
    df_processed['Customer_Age'] = 2024 - df_processed['Year_Birth']
    tenure = pd.to_datetime('2024-01-01') - pd.to_datetime(df_processed['Dt_Customer'])
    df_processed['Days_Since_First_Purchase'] = tenure.dt.days

    # The quantiles are the same for every row, so compute them once rather
    # than on each call made by apply().
    income_q25 = df_processed['Income'].quantile(0.25)
    income_q75 = df_processed['Income'].quantile(0.75)

    def categorize_income(income):
        if pd.isna(income):
            return 'Unknown'
        if income < income_q25:
            return 'Low'
        if income < income_q75:
            return 'Medium'
        return 'High'

    df_processed['Income_Category'] = df_processed['Income'].apply(categorize_income)

    # Product aggregates, computed without the target column (see docstring).
    product_columns = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    other_product_columns = [column for column in product_columns if column != target]
    df_processed['Total_Product_Spending'] = df_processed[other_product_columns].sum(axis=1)
    df_processed['Spending_Diversity'] = (df_processed[other_product_columns] > 0).sum(axis=1)

    features = [
        'Income', 'Kidhome', 'Teenhome', 'Recency',
        'NumDealsPurchases', 'NumWebPurchases',
        'NumCatalogPurchases', 'NumStorePurchases',
        'NumWebVisitsMonth', 'Education', 'Marital_Status',
        'Income_per_Family_Member', 'Total_Purchases',
        'Customer_Age', 'Days_Since_First_Purchase',
        'Income_Category', 'Total_Product_Spending',
        'Spending_Diversity'
    ]

    # Keep only rows that are complete for every feature and the target.
    df_processed_cleaned = df_processed.dropna(subset=features + [target])

    X = df_processed_cleaned[features]
    y = df_processed_cleaned[target]

    return X, y

def compare_models(X, y):
    """Train six regressors on the same split and compare their test metrics.

    The data is split 80/20 (``random_state=42``). Every model, including
    the SVR and k-nearest-neighbours regressors, is wrapped in the same
    preprocessing: numeric columns (int64/float64) are median-imputed and
    standardised, object columns are filled with ``'missing'`` and one-hot
    encoded. MSE, MAE and R2 on the test split are collected and drawn as
    one bar chart per metric.

    Args:
        X: Feature DataFrame.
        y: Target values aligned with ``X``.

    Returns:
        Dict mapping model name to ``{'MSE': ..., 'MAE': ..., 'R2': ...}``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    models = {
        'Random Forest': RandomForestRegressor(random_state=42),
        'Gradient Boosting': GradientBoostingRegressor(random_state=42),
        'HistGradient Boosting': HistGradientBoostingRegressor(random_state=42),
        'Linear Regression': LinearRegression(),
        'Support Vector Machine': SVR(kernel='rbf'),
        'K-Nearest Neighbors': KNeighborsRegressor()
    }

    # The column groups do not depend on the model, so resolve them once.
    numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_columns = X.select_dtypes(include=['object']).columns

    def build_pipeline(model):
        # A single code path serves every model: the scaling that SVR and KNN
        # rely on is already part of the shared numeric preprocessing, so no
        # per-model branching is needed.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', Pipeline([
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', StandardScaler())
                ]), numeric_columns),
                ('cat', Pipeline([
                    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                    ('onehot', OneHotEncoder(handle_unknown='ignore'))
                ]), categorical_columns)
            ])
        return Pipeline([
            ('preprocessor', preprocessor),
            ('regressor', model)
        ])

    results = {}
    for name, model in models.items():
        pipeline = build_pipeline(model)
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        results[name] = {
            'MSE': mean_squared_error(y_test, y_pred),
            'MAE': mean_absolute_error(y_test, y_pred),
            'R2': r2_score(y_test, y_pred)
        }

    # One subplot per metric, since the metrics are on different scales.
    plt.figure(figsize=(12, 6))
    metrics = ['MSE', 'MAE', 'R2']
    model_names = list(results.keys())
    metric_values = {metric: [results[model][metric] for model in model_names] for metric in metrics}

    for i, metric in enumerate(metrics, 1):
        plt.subplot(1, 3, i)
        plt.bar(model_names, metric_values[metric])
        plt.title(f'{metric} por Modelo')
        plt.xticks(rotation=45, ha='right')

    # Lay the figure out once, after all subplots exist.
    plt.tight_layout()
    plt.show()

    return results

# Ejecución principal
if __name__ == '__main__':
    # Load the raw campaign data.
    path = "https://raw.githubusercontent.com/amankharwal/Website-data/master/marketing_campaign.csv"
    data = pd.read_csv(path, sep=';')

    # A. Feature engineering.
    X, y = advanced_feature_engineering(data)

    # C. Model comparison (the chart is drawn inside the call).
    model_comparison_results = compare_models(X, y)

    # Print every metric of every model.
    for model in model_comparison_results:
        metrics = model_comparison_results[model]
        print(f"\nResultados para {model}:")
        for metric in metrics:
            value = metrics[metric]
            print(f"{metric}: {value}")