<a href="https://colab.research.google.com/github/Luiscontreras7/Proyecto-IA-1-/blob/main/04_LinearSVC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import os
!kaggle
!chmod 600 ./kaggle.json
os.environ['KAGGLE_CONFIG_DIR'] = '.'


usage: kaggle [-h] [-v] [-W]
              {competitions,c,datasets,d,kernels,k,models,m,files,f,config}
              ...
kaggle: error: the following arguments are required: command


In [None]:
# Fetch the competition data archive (.zip) with the Kaggle CLI.
!kaggle competitions download -c udea-ai-4-eng-20251-pruebas-saber-pro-colombia

Downloading udea-ai-4-eng-20251-pruebas-saber-pro-colombia.zip to /content
  0% 0.00/29.9M [00:00<?, ?B/s]
100% 29.9M/29.9M [00:00<00:00, 999MB/s]


In [None]:
!unzip udea-ai-4-eng-20251-pruebas-saber-pro-colombia.zip

Archive:  udea-ai-4-eng-20251-pruebas-saber-pro-colombia.zip
  inflating: submission_example.csv  
  inflating: test.csv                
  inflating: train.csv               


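El paradigma a seguir es el aprendizaje supervisado (*supervised learning*): el conjunto de entrenamiento incluye la etiqueta `RENDIMIENTO_GLOBAL`, que es la variable que se quiere predecir.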

In [None]:
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/235.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m153.6/235.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.4.0


In [None]:
import pandas as pd
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import LinearSVC


class Preprocessor(BaseEstimator, TransformerMixin):
    """Encode a mixed-type DataFrame into an all-numeric DataFrame.

    Columns are processed according to ``config`` (given by the caller or
    auto-detected during ``fit``):

    * ``binary``  - columns holding 'Si'/'No' or 'S'/'N', mapped to 1/0
      (any other or missing value becomes 0).
    * ``ordinal`` - dict ``{column: ordered values}``; each value is mapped to
      its position, unknown/missing values to -1. An empty order falls back
      to the sorted unique values seen in ``fit``.
    * ``onehot``  - replaced by ``<col>_oh_<category>`` indicator columns.
    * ``label``   - replaced by LabelEncoder integer codes, unknown/missing
      values become -1.
    * ``text``    - replaced by TF-IDF columns named ``<col>_tfidf_<i>``.

    Any value still missing after encoding is imputed by sampling from the
    observed values of the same column in the frame being transformed.
    """

    def __init__(self, config=None, max_features_text=50, auto_detect=False, random_state=42):
        """
        config: optional dict with keys 'binary', 'ordinal', 'onehot', 'label', 'text'
        max_features_text: max TF-IDF features per text column; also the
            cardinality threshold above which auto-detection treats an object
            column as text
        auto_detect: if True, detects column types automatically in fit
        random_state: seed for the sampling-based imputation in transform;
            pass None for non-reproducible sampling
        """
        self.config = config or {'binary': [], 'ordinal': {}, 'onehot': [], 'label': [], 'text': []}
        self.auto_detect = auto_detect
        self.max_features_text = max_features_text
        self.random_state = random_state
        self.mappings_ = {}

    def fit(self, X, y=None):
        """Learn the per-column encoders from ``X``.

        When ``auto_detect`` is True, ``self.config`` is overwritten with the
        detected column assignment. ``y`` is ignored. Returns ``self``.
        """
        # fit only reads from X, so no defensive copy is needed.
        df = X
        if self.auto_detect:
            binary, onehot, label, text = [], [], [], []
            for col in df.columns:
                vals = df[col].dropna().unique()
                n_unique = len(vals)
                dtype = df[col].dtype
                if set(vals).issubset({'Si','No','S','N'}) and n_unique == 2:
                    binary.append(col)
                elif dtype == object and n_unique > self.max_features_text:
                    text.append(col)
                elif dtype == object and n_unique <= 10:
                    onehot.append(col)
                elif dtype == object:
                    label.append(col)
                # Non-object columns are left untouched (passed through as-is).
            self.config = {'binary': binary, 'ordinal': {}, 'onehot': onehot, 'label': label, 'text': text}

        self.mappings_['binary'] = {}
        for col in self.config.get('binary', []):
            vals = df[col].dropna().unique()
            self.mappings_['binary'][col] = {'Si': 1, 'No': 0} if 'Si' in vals else {'S': 1, 'N': 0}

        self.mappings_['ordinal'] = {}
        for col, order in self.config.get('ordinal', {}).items():
            if order:
                mapping = {v: i for i, v in enumerate(order)}
            else:
                mapping = {v: i for i, v in enumerate(sorted(df[col].dropna().unique()))}
            self.mappings_['ordinal'][col] = mapping

        self.ohe_ = {}
        for col in self.config.get('onehot', []):
            ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
            ohe.fit(df[[col]].astype(str))
            self.ohe_[col] = ohe

        self.le_ = {}
        for col in self.config.get('label', []):
            le = LabelEncoder()
            le.fit(df[col].dropna().astype(str))
            self.le_[col] = le

        self.tfidf_ = {}
        for col in self.config.get('text', []):
            tfidf = TfidfVectorizer(max_features=self.max_features_text, lowercase=True, ngram_range=(1,2), min_df=2, max_df=0.95)
            tfidf.fit(df[col].fillna('').astype(str))
            self.tfidf_[col] = tfidf

        return self

    def transform(self, X):
        """Return an encoded copy of ``X`` using the encoders learned in ``fit``.

        The output keeps the untouched/in-place-encoded columns in their
        original order, followed by the one-hot blocks and then the TF-IDF
        blocks. ``X`` itself is not modified.
        """
        df = X.copy()
        for col, m in self.mappings_.get('binary', {}).items():
            df[col] = df[col].map(m).fillna(0).astype(int)
        for col, m in self.mappings_.get('ordinal', {}).items():
            df[col] = df[col].map(m).fillna(-1).astype(int)

        # Expanded feature blocks are collected and concatenated once at the
        # end instead of growing the frame inside the loops.
        new_blocks = []
        for col, ohe in self.ohe_.items():
            arr = ohe.transform(df[[col]].astype(str))
            cols = [f"{col}_oh_{cat}" for cat in ohe.categories_[0]]
            new_blocks.append(pd.DataFrame(arr, columns=cols, index=df.index))

        for col, le in self.le_.items():
            # Dict lookup replaces a per-row LabelEncoder.transform call.
            # Values are cast to str because the encoder was fitted on str.
            lookup = {cls: code for code, cls in enumerate(le.classes_)}
            present = df[col].notna()
            codes = df[col].astype(str).map(lookup).where(present)
            df[col] = codes.fillna(-1).astype(int)

        for col, tfidf in self.tfidf_.items():
            arr = tfidf.transform(df[col].fillna('').astype(str)).toarray()
            cols = [f"{col}_tfidf_{i}" for i in range(arr.shape[1])]
            new_blocks.append(pd.DataFrame(arr, columns=cols, index=df.index))

        replaced = list(self.ohe_) + list(self.tfidf_)
        df = pd.concat([df.drop(columns=replaced)] + new_blocks, axis=1)

        # Impute what is still missing by sampling observed values of the same
        # column. A seeded generator makes repeated runs give the same result.
        rng = np.random.default_rng(self.random_state)
        for col in df.columns:
            missing = df[col].isnull()
            if not missing.any():
                continue
            observed = df.loc[~missing, col].to_numpy()
            if observed.size == 0:
                # Nothing to sample from: fall back to a constant instead of
                # letting the sampler raise on an empty population.
                df[col] = 0
                continue
            df.loc[missing, col] = rng.choice(observed, size=int(missing.sum()))
        return df


def load_data(path):
    """Read the CSV at ``path`` into a DataFrame, print its shape, and return it."""
    df = pd.read_csv(path)
    summary = f"Loaded {path} with shape {df.shape}"
    print(summary)
    return df


def _prepare_features(train_path, test_path, id_col, target_col, config=None,
                      auto_detect=False, sample_frac=None):
    """Load train/test CSVs and encode them into model-ready arrays.

    Returns ``(X_train, y_train, X_test, test)`` where ``test`` is the loaded
    test DataFrame (needed later for its id column).
    """
    train = load_data(train_path)
    test = load_data(test_path)

    if sample_frac:
        # Only the training rows are subsampled. The test set is kept whole so
        # the submission contains a prediction for every test id.
        train = train.sample(frac=sample_frac, random_state=42).reset_index(drop=True)

    source = [c for c in train.columns if c not in (id_col, target_col)]

    # Fit the encoders on training data only, then apply them to both sets,
    # so nothing about the test set influences the preprocessing.
    pre = Preprocessor(config=config, auto_detect=auto_detect)
    pre.fit(train[source])
    X_train = pre.transform(train[source]).values
    X_test = pre.transform(test[source]).values
    y_train = train[target_col].values
    return X_train, y_train, X_test, test


def _build_submission(test, preds, id_col, target_col, output_submission=None):
    """Pair test ids with predictions and optionally write them to CSV."""
    sub = pd.DataFrame({id_col: test[id_col], target_col: preds})
    if output_submission:
        sub.to_csv(output_submission, index=False)
    return sub


def full_pipeline(train_path, test_path, id_col, target_col, config=None,
                  auto_detect=False, sample_frac=None, output_submission=None):
    """
    Train a Logistic Regression model and predict the test set.

    train_path, test_path: CSV files to load
    id_col, target_col: id column (kept for the submission) and label column
    config, auto_detect: forwarded to Preprocessor
    sample_frac: optional fraction of the training rows to train on
    output_submission: optional CSV path for the submission

    Prints the accuracy measured on the training rows themselves (not a
    held-out set). Returns (submission DataFrame, fitted model).
    """
    X_train, y_train, X_test, test = _prepare_features(
        train_path, test_path, id_col, target_col,
        config=config, auto_detect=auto_detect, sample_frac=sample_frac,
    )

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    print("Logistic Regression accuracy:", accuracy_score(y_train, model.predict(X_train)))

    preds = model.predict(X_test)
    sub = _build_submission(test, preds, id_col, target_col, output_submission)
    return sub, model


def full_pipeline_svm(train_path, test_path, id_col, target_col, config=None,
                      auto_detect=False, sample_frac=None, output_submission=None):
    """
    Train a LinearSVC model (fast for large datasets) and predict the test set.
    Hyperparameters are tuned for speed and performance.

    Parameters are the same as for full_pipeline. Prints the accuracy and the
    classification report measured on the training rows themselves (not a
    held-out set). Returns (submission DataFrame, fitted model).
    """
    X_train, y_train, X_test, test = _prepare_features(
        train_path, test_path, id_col, target_col,
        config=config, auto_detect=auto_detect, sample_frac=sample_frac,
    )

    svm = LinearSVC(C=0.5, max_iter=2000, dual=False)
    svm.fit(X_train, y_train)
    train_pred = svm.predict(X_train)
    print("LinearSVC train accuracy:", accuracy_score(y_train, train_pred))
    print(classification_report(y_train, train_pred))

    preds = svm.predict(X_test)
    sub = _build_submission(test, preds, id_col, target_col, output_submission)
    return sub, svm


In [None]:

# Run the LinearSVC pipeline with automatic column-type detection on a 10%
# subsample and save the predictions to submission_svm.csv.
sub_svm, model_svm = full_pipeline_svm(
    train_path='train.csv',
    test_path='test.csv',
    id_col='ID',
    target_col='RENDIMIENTO_GLOBAL',
    auto_detect=True,
    sample_frac=0.1,
    output_submission='submission_svm.csv',
)


Loaded train.csv with shape (692500, 21)
Loaded test.csv with shape (296786, 20)
LinearSVC train accuracy: 0.2927797833935018
              precision    recall  f1-score   support

        alto       0.32      0.54      0.40     17645
        bajo       0.28      0.24      0.26     17174
  medio-alto       0.10      0.00      0.00     17198
  medio-bajo       0.27      0.38      0.32     17233

    accuracy                           0.29     69250
   macro avg       0.24      0.29      0.24     69250
weighted avg       0.24      0.29      0.24     69250

