# 1. Preprocesamiento de los datos

A continuación se definen las funciones que permitirán usar el conjunto de datos 'Salary_data', con el que se pretende predecir cuál será el salario de una persona dependiendo de su nivel académico, sus años de experiencia y el cargo al que aspira.

In [12]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import OneHotEncoder

class pre:
    """Stateless helpers for loading and cleaning a tabular dataset.

    The methods do not read or write any instance attributes, so a single
    instance can be reused for every call.
    """

    def creation_data(self, path_csv):
        """Read the CSV file at ``path_csv`` and return it as a DataFrame."""
        return pd.read_csv(path_csv)

    def declaration_var(self, data, target):
        """Split ``data`` into target and features, keeping only complete rows.

        A row is kept only when neither the target nor any feature column is
        null. ``data`` itself is left untouched; the returned objects are
        independent copies.

        Parameters
        ----------
        data : pandas.DataFrame
            Full dataset, including the target column.
        target : label
            Name of the column to predict.

        Returns
        -------
        target_1 : pandas.Series
            Target values of the rows that were kept.
        features : pandas.DataFrame
            Every column except ``target`` for the same rows.
        names_f : dict
            Maps each feature column name to the set of distinct values found
            in the returned ``features``.
        """
        features = data.drop(target, axis=1)
        # One boolean mask for "row is fully populated" replaces the repeated
        # in-place drops and keeps features and target aligned by construction.
        complete_rows = features.notna().all(axis=1) & data[target].notna()
        features = features.loc[complete_rows].copy()
        target_1 = data.loc[complete_rows, target].copy()
        # Collected after all filtering so the sets describe exactly the rows
        # that are returned.
        names_f = {name: set(features[name]) for name in features}
        return target_1, features, names_f

    def replace(self, variable, var_1, var_2, features):
        """Rewrite every ``var_2`` in column ``variable`` to ``var_1``.

        The column is reassigned on ``features`` itself, which is also
        returned so the call can be chained or rebound by the caller.
        """
        column = features[variable]
        features[variable] = column.mask(column == var_2, var_1)
        return features

Importación de las librerías que se usarán

In [13]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_squared_error

# Instance of the preprocessing helper class defined above.
# NOTE(review): the cells visible here call pre() directly rather than using
# this name — confirm whether it is still needed.
processing = pre()

Definición del conjunto de datos del que se hará uso y la variable objetivo, junto a las demás variables

In [14]:
from sklearn.compose import ColumnTransformer

# Absolute path of the salary CSV that is loaded below.
x = r'C:\Users\santi\OneDrive - Universidad Pedagogica Nacional\El_bosque\tercer_semestre\machine_learning_ll\proyecto\Salary_Data.csv'

# The helper class keeps no state, so one instance serves both calls.
loader = pre()
data = loader.creation_data(x)
# 'Salary' is the column to predict; every other column becomes a feature.
target, features, names_f = loader.declaration_var(data, 'Salary')

Luego de definir el target y las features, se determina que las variables "Age" y "Years of Experience" pueden almacenarse con un tipo de dato diferente, con el que se consuman menos recursos computacionales.

También se encontró que la variable Education Level poseía inconsistencias tipográficas: varias de sus categorías estaban nombradas de diferente forma, a pesar de representar el mismo objeto.

Por ejemplo, Bachelor's Degree representa una persona con un título de bachiller y Bachelor's es la forma en la que se puede resumir a una persona con el mismo título. Por lo tanto, ambos elementos comparten las mismas características y deben tomarse como objeto único.


In [15]:

# Change the storage type of the numeric columns.
# NOTE(review): casting to an integer type discards any fractional part —
# confirm that 'Years of Experience' never holds values such as 0.5.
for column, dtype in {'Age': 'int16', 'Years of Experience': 'int64'}.items():
    features[column] = features[column].astype(dtype)

# Merge labels that are spelled differently but name the same education level:
# each (canonical, alias) pair rewrites the alias to the canonical spelling.
for canonical, alias in (
    ("Bachelor's", "Bachelor's Degree"),
    ("Master's", "Master's Degree"),
    ("PhD", "phD"),
):
    features = pre().replace('Education Level', canonical, alias, features)

# Column groups consumed by the preprocessing pipelines.
num_features = ['Age', 'Years of Experience']
cat_features = ['Gender', 'Education Level', 'Job Title']

Se definen los pipelines para las variables categóricas y numéricas; además, estas se transforman al tipo de dato correcto para que se puedan entrenar los diferentes modelos de regresión.

In [16]:

def pipe(categorical_features, features):
    """Build the numeric and categorical preprocessing pipelines.

    Both parameters are accepted but not used by the body.

    Returns
    -------
    tuple
        ``(num_transformer, cat_transformer)``: a pipeline that standardises
        its input and a pipeline that one-hot encodes it, ignoring categories
        not seen during fitting.
    """
    scaling_step = ('scaler', StandardScaler())
    encoding_step = ('one_hot', OneHotEncoder(handle_unknown='ignore'))
    return Pipeline(steps=[scaling_step]), Pipeline(steps=[encoding_step])

def column_trans(num_transformer, num_features, cat_transformer, cat_features):
    """Combine both transformers into a single ``ColumnTransformer``.

    ``num_transformer`` is applied to the columns listed in ``num_features``
    and ``cat_transformer`` to those listed in ``cat_features``.
    """
    numeric_branch = ('numerical', num_transformer, num_features)
    categorical_branch = ('categorical', cat_transformer, cat_features)
    return ColumnTransformer(transformers=[numeric_branch, categorical_branch])

# Create the two per-type pipelines, then route each one to its column group.
num_transformer, cat_transformer = pipe(
    categorical_features=cat_features,
    features=features,
)
preprocessor = column_trans(
    num_transformer=num_transformer,
    num_features=num_features,
    cat_transformer=cat_transformer,
    cat_features=cat_features,
)

## Models

Con esta función se crean diferentes modelos de regresión como Lineal, RandomForest, Ridge

In [17]:
def gen_pipe(preprocessor, random_state=None):
    """Build one preprocessing + regression pipeline per model.

    Parameters
    ----------
    preprocessor : estimator
        Column transformer placed as the first step of every pipeline. Each
        pipeline receives its own clone, so fitting one pipeline never
        overwrites the fitted state held by another.
    random_state : int, RandomState instance or None, default=None
        Seed forwarded to the random forest. ``None`` keeps the forest
        unseeded.

    Returns
    -------
    list
        Three pipelines, in this order: linear regression, ridge regression
        (``alpha=0.2``) and random forest (200 trees).
    """
    # Local import keeps this function self-contained within its cell.
    from sklearn.base import clone

    regressors = [
        ('linear_model', LinearRegression()),
        ('model_ridge', Ridge(alpha=0.2)),
        ('random_forest',
         RandomForestRegressor(n_estimators=200, random_state=random_state)),
    ]
    return [
        Pipeline(steps=[('preprocessor_column', clone(preprocessor)), step])
        for step in regressors
    ]

División de los datos de entrenamiento y de prueba

In [18]:
from sklearn.model_selection import train_test_split 
from sklearn.utils import resample

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)
# One pipeline per regression model, all built on the same preprocessing.
models = gen_pipe(preprocessor)

In [37]:
# Number of bootstrap resamples drawn per model (constant, so set once).
n_bootstraps = 1000
# Seeded generator so the bootstrap interval is the same on every run.
bootstrap_rng = np.random.RandomState(42)

# Maps the position of each model in `models` to [r2, mse, rmse, int_conf].
models_metrics = {}

for i, pipeline in enumerate(models):
    model = pipeline.fit(X_train, y_train)
    prediction = model.predict(X_test)

    # Bootstrap the mean of the test-set predictions.
    bootstraps_predict = [
        np.mean(resample(prediction, random_state=bootstrap_rng))
        for _ in range(n_bootstraps)
    ]
    # 5th-95th percentile interval of the bootstrapped mean prediction,
    # rounded to two decimals. It describes the mean prediction, not the
    # error metrics computed below.
    int_conf = np.round(np.percentile(bootstraps_predict, [5, 95]), 2)

    r2 = r2_score(y_test, prediction)
    mse = mean_squared_error(y_test, prediction)
    rmse = np.sqrt(mse)  # reuse the MSE instead of computing it twice
    models_metrics[i] = [r2, mse, rmse, int_conf]
    
    
        


In [38]:
# Column labels, in the same order in which the models were evaluated.
names = ['LinearRegression', 'Ridge', 'RandomForestRegressor']
# Row labels, in the same order in which the metrics were stored per model.
index_final = ['R2', 'mse', 'rmse', 'intervalo de confianza']

# TODO: add more performance metrics (f1, recall, confidence intervals,
# cross-validation).
# TODO: add confusion matrices, false positives and so on.

# One column per model, one row per metric.
results_table = pd.DataFrame(models_metrics, index=index_final)
results_table.columns = names
models_metrics = results_table
models_metrics

Unnamed: 0,LinearRegression,Ridge,RandomForestRegressor
R2,0.840184,0.840205,0.974792
mse,455709421.883616,455648120.237456,71880515.531615
rmse,21347.351636,21345.915774,8478.237761
intervalo de confianza,"[112686.2, 117158.74]","[112764.67, 117218.91]","[112577.93, 117341.32]"


In [21]:
from sklearn.utils import resample 

