<a href="https://colab.research.google.com/github/Luiscontreras7/Proyecto-IA-1-/blob/main/99_Modelo_solucion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:

import os
!kaggle
!chmod 600 ./kaggle.json
os.environ['KAGGLE_CONFIG_DIR'] = '.'


usage: kaggle [-h] [-v] [-W]
              {competitions,c,datasets,d,kernels,k,models,m,files,f,config}
              ...
kaggle: error: the following arguments are required: command


In [3]:
!kaggle competitions download -c udea-ai-4-eng-20251-pruebas-saber-pro-colombia

Downloading udea-ai-4-eng-20251-pruebas-saber-pro-colombia.zip to /content
  0% 0.00/29.9M [00:00<?, ?B/s]
100% 29.9M/29.9M [00:00<00:00, 1.25GB/s]


In [4]:
!unzip udea-ai-4-eng-20251-pruebas-saber-pro-colombia.zip

Archive:  udea-ai-4-eng-20251-pruebas-saber-pro-colombia.zip
  inflating: submission_example.csv  
  inflating: test.csv                
  inflating: train.csv               


El paradigma a seguir es supervised learning

In [5]:
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.4.0


In [6]:
import pandas as pd
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, StackingClassifier
from sklearn.model_selection import train_test_split


class Preprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, config=None, max_features_text=50, auto_detect=False):
        self.config = config or {'binary': [], 'ordinal': {}, 'onehot': [], 'label': [], 'text': []}
        self.auto_detect = auto_detect
        self.max_features_text = max_features_text
        self.mappings_ = {}

    def fit(self, X, y=None):
        df = X.copy()
        if self.auto_detect:
            binary, onehot, label, text = [], [], [], []
            for col in df.columns:
                vals = df[col].dropna().unique()
                n_unique = len(vals)
                dtype = df[col].dtype
                if set(vals).issubset({'Si','No','S','N'}) and n_unique == 2:
                    binary.append(col)
                elif dtype == object and n_unique > self.max_features_text:
                    text.append(col)
                elif dtype == object and n_unique <= 10:
                    onehot.append(col)
                elif dtype == object:
                    label.append(col)
            self.config = {'binary': binary, 'ordinal': {}, 'onehot': onehot, 'label': label, 'text': text}

        self.mappings_['binary'] = {}
        for col in self.config['binary']:
            vals = df[col].dropna().unique()
            self.mappings_['binary'][col] = {'Si':1,'No':0} if 'Si' in vals else {'S':1,'N':0}

        self.mappings_['ordinal'] = {}
        for col, order in self.config.get('ordinal',{}).items():
            mapping = {v:i for i,v in enumerate(order)} if order else {v:i for i,v in enumerate(sorted(df[col].dropna().unique()))}
            self.mappings_['ordinal'][col] = mapping

        self.ohe_ = {}
        for col in self.config['onehot']:
            ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
            ohe.fit(df[[col]].astype(str))
            self.ohe_[col] = ohe

        self.le_ = {}
        for col in self.config['label']:
            le = LabelEncoder()
            le.fit(df[col].dropna().astype(str))
            self.le_[col] = le

        self.tfidf_ = {}
        for col in self.config['text']:
            tfidf = TfidfVectorizer(max_features=self.max_features_text, lowercase=True,
                                     ngram_range=(1,2), min_df=2, max_df=0.95)
            tfidf.fit(df[col].fillna('').astype(str))
            self.tfidf_[col] = tfidf

        return self

    def transform(self, X):
        df = X.copy()
        for col, m in self.mappings_['binary'].items():
            df[col] = df[col].map(m).fillna(0).astype(int)
        for col, m in self.mappings_['ordinal'].items():
            df[col] = df[col].map(m).fillna(-1).astype(int)
        for col, ohe in self.ohe_.items():
            arr = ohe.transform(df[[col]].astype(str))
            cols = [f"{col}_oh_{cat}" for cat in ohe.categories_[0]]
            df_oh = pd.DataFrame(arr, columns=cols, index=df.index)
            df = pd.concat([df, df_oh], axis=1)
            df.drop(columns=[col], inplace=True)
        for col, le in self.le_.items():
            df[col] = df[col].fillna('').apply(lambda x: le.transform([x])[0] if x in le.classes_ else -1)
        for col, tfidf in self.tfidf_.items():
            arr = tfidf.transform(df[col].fillna('').astype(str)).toarray()
            cols = [f"{col}_tfidf_{i}" for i in range(arr.shape[1])]
            df_t = pd.DataFrame(arr, columns=cols, index=df.index)
            df = pd.concat([df, df_t], axis=1)
            df.drop(columns=[col], inplace=True)
        for col in df.columns:
            if df[col].isnull().any():
                vals = df[col].dropna().values
                df.loc[df[col].isnull(), col] = np.random.choice(vals, size=df[col].isnull().sum())
        return df


def load_data(path):
    df = pd.read_csv(path)
    print(f"Loaded {path} with shape {df.shape}")
    return df


def ensemble_pipeline(train_path, test_path, id_col, target_col,
                      method='gb', auto_detect=False, sample_frac=None,
                      output_submission=None):
    """
    method: 'bag' (RandomForest), 'gb' (HistGradientBoosting), 'stack' (stacking RF + SVM).
    """
    train = load_data(train_path)
    test = load_data(test_path)

    if sample_frac:
        train = train.sample(frac=sample_frac, random_state=42).reset_index(drop=True)
        test = test.sample(frac=sample_frac, random_state=42).reset_index(drop=True)

    cols = [c for c in train.columns if c not in [id_col, target_col]]
    all_df = pd.concat([train[cols], test[cols]], ignore_index=True)
    pre = Preprocessor(auto_detect=auto_detect)
    pre.fit(all_df)
    all_p = pre.transform(all_df)

    n = len(train)
    X_train, y_train = all_p.iloc[:n].values, train[target_col].values
    X_test = all_p.iloc[n:].values

    if method == 'bag':
        # Bagging: Random Forest rápido y robusto
        model = RandomForestClassifier(
            n_estimators=100, max_depth=10, max_features='sqrt',
            n_jobs=-1, random_state=42
        )
    elif method == 'gb':
        # Boosting: HistGradientBoosting eficiente en grandes datos
        model = HistGradientBoostingClassifier(
            learning_rate=0.1, max_iter=200, max_depth=8,
            early_stopping=True, random_state=42
        )
    elif method == 'stack':
        # Stacking RF + SVM con meta-regresor ligero
        base_estimators = [
            ('rf', RandomForestClassifier(n_estimators=50, max_depth=8, random_state=42)),
            ('svm', LinearSVC(C=0.5, max_iter=2000, dual=False))
        ]
        model = StackingClassifier(
            estimators=base_estimators,
            final_estimator=LogisticRegression(max_iter=1000),
            cv=5, n_jobs=-1
        )
    else:
        raise ValueError("Método no soportado: elige 'bag','gb' o 'stack'.")

    model.fit(X_train, y_train)
    print(f"{method} train accuracy:", accuracy_score(y_train, model.predict(X_train)))
    print(classification_report(y_train, model.predict(X_train)))

    preds = model.predict(X_test)
    submission = pd.DataFrame({id_col: test[id_col], target_col: preds})
    if output_submission:
        submission.to_csv(output_submission, index=False)
        print(f"Saved submission to {output_submission}")
    return submission, model


La siguiente celda tarda, aprox 15 minutos por la robustez del modelo

In [10]:
sub_gb, gb_model = ensemble_pipeline(
    'train.csv', 'test.csv',
    id_col='ID', target_col='RENDIMIENTO_GLOBAL',
    method='gb', auto_detect=True, #sample_frac=0.1,
    output_submission='submission_gb.csv'
)

Loaded train.csv with shape (692500, 21)
Loaded test.csv with shape (296786, 20)
gb train accuracy: 0.4485415162454874
              precision    recall  f1-score   support

        alto       0.54      0.64      0.59    175619
        bajo       0.47      0.58      0.52    172987
  medio-alto       0.35      0.28      0.31    171619
  medio-bajo       0.36      0.29      0.32    172275

    accuracy                           0.45    692500
   macro avg       0.43      0.45      0.44    692500
weighted avg       0.43      0.45      0.44    692500

Saved submission to submission_gb.csv


In [11]:
!kaggle competitions submit -c udea-ai-4-eng-20251-pruebas-saber-pro-colombia -f submission_gb.csv -m "Message"

100% 4.03M/4.03M [00:00<00:00, 6.36MB/s]
Successfully submitted to UDEA/ai4eng 20251 - Pruebas Saber Pro Colombia