In [1]:
#Sube un archivo de Kaggle a la Máquina Virtual de Colab en un .zip
#Requiere tener el archivo 'kaggle.json' en Drive
#Requiere aceptar las reglas de la competencia en Kaggle

import json
from google.colab import drive

!mkdir ~/.kaggle
!touch ~/.kaggle/kaggle.json

drive.mount('/content/drive', force_remount=True)
with open("/content/drive/My Drive/kaggle.json", 'r') as f:
    api_token= json.load(f)

with open('/root/.kaggle/kaggle.json', 'w') as file:
    json.dump(api_token, file)

!chmod 600 ~/.kaggle/kaggle.json

!kaggle competitions download -c tp2-predictivo-2023q2

Mounted at /content/drive
Downloading tp2-predictivo-2023q2.zip to /content
100% 40.0M/40.2M [00:15<00:00, 4.22MB/s]
100% 40.2M/40.2M [00:15<00:00, 2.78MB/s]


In [2]:
# Extract every .zip archive found in the current working directory
# (this includes the competition archive that was just downloaded).
import zipfile
import os

for file in os.listdir():
    if file.endswith('.zip'):
        # The context manager closes the archive even if extraction raises.
        with zipfile.ZipFile(file, 'r') as zip_ref:
            zip_ref.extractall()

In [3]:
_=!pip install feature_engine

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#exportar pipes
import pickle
import joblib

#crear directorio caché
from tempfile import mkdtemp
from shutil import rmtree

# from tqdm import tqdm
# from sklearn import set_config
# from sklearn.base import clone

#pipeline y search
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, ParameterGrid, train_test_split, cross_val_score, KFold
from sklearn.feature_selection import SelectPercentile, chi2

#modelos
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer

#transformaciones
from feature_engine.encoding import RareLabelEncoder, OrdinalEncoder, OneHotEncoder, CountFrequencyEncoder
from feature_engine.imputation import AddMissingIndicator, MeanMedianImputer, CategoricalImputer, EndTailImputer, RandomSampleImputer
from feature_engine.transformation import PowerTransformer
from feature_engine.creation import RelativeFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

#distribuciones
import scipy as sp
from scipy.stats import randint, uniform

# from sklearn.compose import ColumnTransformer, make_column_transformer
# from sklearn.impute import KNNImputer, SimpleImputer
# from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer

In [4]:
# Load the training data from disk.
train_csv_path = '/content/origen.csv'
train = pd.read_csv(train_csv_path)

In [5]:
# Remove the id column. errors='ignore' keeps the cell safe to re-run: on a
# second execution the column is already gone and a plain drop would raise.
train = train.drop(columns='Unnamed: 0', errors='ignore')

# Train corrections
# isAdult contains the stray values 2014 and 2020 (they look like misplaced
# years); both are mapped to 0.
train['isAdult'] = train['isAdult'].replace(2014, 0).replace(2020, 0)

# Two specific negative runtimes are set to 0.
train['runtimeMinutes'] = train['runtimeMinutes'].replace(-22336, 0).replace(-14116, 0)

# Encode the True/False flags as 1/0.
for flag_col in ('video', 'adult'):
    train[flag_col] = train[flag_col].replace(False, 0).replace(True, 1)

In [6]:
# In these columns a zero acts as a placeholder (numeric 0 for numeric columns,
# the string '0' for text columns). Each placeholder is replaced with NaN.
zero_sentinels = [
    ('startYear', 0),
    ('endYear', 0),
    ('runtimeMinutes', 0),
    ('language', '0'),
    ('attributes', '0'),
    ('budget', 0),
    ('genres_x', '0'),
    ('genres_y', '0'),
    ('production_companies', '0'),
    ('production_countries', '0'),
    ('revenue', 0),
    ('runtime', 0),
    ('directors', '0'),
    ('writers', '0'),
]

for sentinel_col, sentinel_value in zero_sentinels:
    train.loc[train[sentinel_col] == sentinel_value, sentinel_col] = float("NaN")

In [7]:
# Predictors: every column except the target and three columns left out of
# the model. Target: averageRating.
X = train.drop(columns=['averageRating','tagline','production_companies','production_countries'])
y = train['averageRating']

# Placeholder written into list-like text columns when the value is missing.
MISSING_LABEL = 'unk'


def _count_entries(value):
    """Return the number of comma-separated entries in `value`.

    Returns NaN when `value` is exactly the missing-value placeholder.
    """
    # Exact comparison on purpose: a substring test ('unk' in value) would also
    # discard real values that merely contain the letters "unk".
    if value == MISSING_LABEL:
        return np.nan
    return value.count(',') + 1


# For each comma-separated column: fill missing values with the placeholder
# and add a `cant_<column>` feature holding the number of listed entries.
for list_col in ('writers', 'directors', 'genres_x'):
    X[list_col] = X[list_col].replace(np.nan, MISSING_LABEL)
    X[f'cant_{list_col}'] = X[list_col].apply(_count_entries)

# Group the columns by kind; these lists are consumed by the pipeline cell.
txt = ['tagline'] if 'tagline' in X.columns else []
# NOTE: this name shadows the builtin `bool`; it is kept because the pipeline
# cell refers to it. Columns with at most two distinct values are treated as
# boolean.
bool = [c for c in X.columns if X[c].nunique() <= 2]
cat = [c for c in X.columns if X[c].dtype == 'O' and c not in bool]
num = [c for c in X.columns if X[c].dtype != 'O' and c not in bool]

print(f"Texto: {txt} \n Booleanas: {bool} \n Categóricas: {cat} \n Numéricas: {num} \n")
print("Total variables:",len(txt)+len(bool)+len(cat)+len(num))

Texto: [] 
 Booleanas: ['isAdult', 'isOriginalTitle', 'adult', 'video'] 
 Categóricas: ['titleType', 'genres_x', 'directors', 'writers', 'language', 'attributes', 'genres_y', 'original_language', 'status'] 
 Numéricas: ['numVotes', 'startYear', 'endYear', 'runtimeMinutes', 'seasonNumber', 'episodeNumber', 'ordering', 'budget', 'popularity', 'revenue', 'runtime', 'cant_writers', 'cant_directors', 'cant_genres_x'] 

Total variables: 27


In [8]:
# Preprocessing pipeline, built step by step and then assembled in order.

# Missing values ---------------------------------------------------------------
# Categorical columns: fill missing entries with the "unk" label.
cat_imputer = CategoricalImputer(
    fill_value="unk",
    return_object=True,
    ignore_format=True,
    variables=cat,
)
# Numeric columns: add missing-value indicator columns (missing_only=True).
num_missing_ind = AddMissingIndicator(missing_only=True, variables=num)
# Numeric columns: impute with a right-tail value (IQR method, fold=3).
num_imputer = EndTailImputer(
    imputation_method='iqr',
    tail='right',
    fold=3,
    variables=num,
)
# Boolean columns: impute with the most frequent value.
bool_imputer = SklearnTransformerWrapper(
    transformer=SimpleImputer(strategy='most_frequent'),
    variables=bool,
)

# Categorical encoding ---------------------------------------------------------
# Replace each category with its frequency.
cat_encoder = CountFrequencyEncoder(encoding_method='frequency', variables=cat)

pipe = Pipeline(steps=[
    ('cat_imputer', cat_imputer),
    ('num_missing_ind', num_missing_ind),
    ('num_imputer', num_imputer),
    ('bool_imputer', bool_imputer),
    ('cat_encoder', cat_encoder),
])

In [9]:
# Fit the preprocessing pipeline and apply it to the predictors.
# NOTE: X is overwritten with its transformed version, so this cell expects the
# untransformed X produced by the feature-engineering cell.
X = pipe.fit_transform(X)

# Model hyperparameters, collected in one place.
xgb_params = dict(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=15,
    reg_lambda=4,
    random_state=22,
)

# Train the model on the transformed predictors.
modelo = XGBRegressor(**xgb_params)

modelo.fit(X, y)

In [10]:
# Save the fitted pipeline and the trained model to disk so they can be loaded
# later to predict on new data directly, without training again.
pipe_path = 'pipe.joblib'
modelo_path = 'modelo.joblib'

joblib.dump(pipe, pipe_path)
joblib.dump(modelo, modelo_path)

['modelo.joblib']