# Proyecto de Regresion Lineal Regularizada


IMPORTS

In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pickle import dump, load
from sklearn.feature_selection import chi2 , SelectKBest, mutual_info_regression
from sklearn.feature_selection import f_classif, f_regression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score, make_scorer, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import metrics

In [2]:
total_data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/regularized-linear-regression-project-tutorial/main/demographic_health_data.csv")
total_data.to_csv("../data/raw/diabetes_data.csv", index=False)

total_data.head()

Unnamed: 0,fips,TOT_POP,0-9,0-9 y/o % of total pop,19-Oct,10-19 y/o % of total pop,20-29,20-29 y/o % of total pop,30-39,30-39 y/o % of total pop,...,COPD_number,diabetes_prevalence,diabetes_Lower 95% CI,diabetes_Upper 95% CI,diabetes_number,CKD_prevalence,CKD_Lower 95% CI,CKD_Upper 95% CI,CKD_number,Urban_rural_code
0,1001,55601,6787,12.206615,7637,13.735364,6878,12.370281,7089,12.749771,...,3644,12.9,11.9,13.8,5462,3.1,2.9,3.3,1326,3
1,1003,218022,24757,11.355276,26913,12.344167,23579,10.814964,25213,11.564429,...,14692,12.0,11.0,13.1,20520,3.2,3.0,3.5,5479,4
2,1005,24881,2732,10.980266,2960,11.896628,3268,13.13452,3201,12.865239,...,2373,19.7,18.6,20.6,3870,4.5,4.2,4.8,887,6
3,1007,22400,2456,10.964286,2596,11.589286,3029,13.522321,3113,13.897321,...,1789,14.1,13.2,14.9,2511,3.3,3.1,3.6,595,2
4,1009,57840,7095,12.266598,7570,13.087828,6742,11.656293,6884,11.901798,...,4661,13.5,12.6,14.5,6017,3.4,3.2,3.7,1507,2


# EDA - EXPLORATORY DATA ANALISYS

In [3]:
total_data.shape

(3140, 108)

In [4]:
total_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3140 entries, 0 to 3139
Columns: 108 entries, fips to Urban_rural_code
dtypes: float64(61), int64(45), object(2)
memory usage: 2.6+ MB


Viendo estos datos podemos comprobar un total de 3140 filas y 108 columnas, ahora comprobamos si existen datos duplicados.

In [5]:
total_data.duplicated().sum()

0

Existen demasiadas columnas, reducimos el analisis a solo diabeticos.

In [6]:
total_data.to_csv("/workspaces/Proyecto-de-Regresion-Lineal-Regularizada/data/raw/diabetes_data.csv", index= False)

FEATURE ENGINEERING

In [8]:
total_data = pd.read_csv("/workspaces/Proyecto-de-Regresion-Lineal-Regularizada/data/raw/diabetes_data.csv")


FEATURE SCALING

In [9]:
data_types = total_data.dtypes
numeric_columns = [c for c in list(data_types[data_types != "object"].index) if c != "Heart disease_number"]

scaler = StandardScaler()
norm_features = scaler.fit_transform(total_data[numeric_columns])

total_data_scal = pd.DataFrame(norm_features, index = total_data.index, columns = numeric_columns)
total_data_scal["Heart disease_number"] = total_data["Heart disease_number"]
total_data_scal.head()
total_data_scal.to_csv("../data/interim/scaled_features.csv", index=False)

FEATURE SELECTION

Ya hemos preparado el conjunto de datos ahora procememos a dividirlos:

In [12]:
# division entre train y test
X = total_data_scal.drop("Heart disease_number", axis = 1)
y = total_data_scal["Heart disease_number"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
train_indices = list(X_train.index)
test_indices = list(X_test.index)

# Seleccionamos el 30% de las mejores opciones
k = int(len(X_train.columns) * 0.3)
selection_model = SelectKBest(score_func = f_regression, k = k)
selection_model.fit(X_train, y_train)
ix = selection_model.get_support()

X_train_sel = pd.DataFrame(selection_model.transform(X_train), columns = X_train.columns.values[ix])
X_test_sel = pd.DataFrame(selection_model.transform(X_test), columns = X_test.columns.values[ix])

X_train_sel.head()

Unnamed: 0,TOT_POP,0-9,19-Oct,20-29,30-39,40-49,50-59,60-69,70-79,80+,...,Family Medicine/General Practice Primary Care (2019),Total Specialist Physicians (2019),Total Population,Population Aged 60+,county_pop2018_18 and older,anycondition_number,Obesity_number,COPD_number,diabetes_number,CKD_number
0,-0.232556,-0.227731,-0.234284,-0.232951,-0.226353,-0.231316,-0.229599,-0.233425,-0.23468,-0.23442,...,-0.212643,-0.20859,-0.231195,-0.229737,-0.233171,-0.23437,-0.232975,-0.223516,-0.218609,-0.219329
1,-0.158676,-0.178665,-0.180166,-0.188266,-0.17507,-0.161168,-0.134688,-0.105618,-0.11927,-0.091822,...,-0.11668,-0.11085,-0.150293,-0.098866,-0.152859,-0.142645,-0.155304,-0.11008,-0.131449,-0.130962
2,-0.199114,-0.211128,-0.195138,-0.166782,-0.195036,-0.194045,-0.199725,-0.219256,-0.222207,-0.205154,...,-0.192263,-0.217668,-0.197005,-0.216056,-0.195125,-0.193205,-0.201976,-0.193106,-0.189197,-0.206391
3,-0.036595,-0.037734,-0.017077,-0.057986,-0.052252,-0.033158,-0.020228,-0.032603,-0.023876,-0.046224,...,0.062458,-0.107888,-0.03694,-0.030034,-0.039882,-0.003321,0.006163,-0.007077,-0.047515,-0.045054
4,0.090839,0.09468,0.101662,0.056721,0.042392,0.068095,0.101699,0.144664,0.140685,0.166099,...,0.274818,0.194913,0.097767,0.161314,0.088485,0.165555,0.18274,0.265603,0.12304,0.132454


Guardamos los procesos de train y test

In [13]:
X_train_sel["Heart disease_number"] = list(y_train)
X_test_sel["Heart disease_number"] = list(y_test)

X_train_sel.to_csv("../data/processed/clean_train.csv", index = False)
X_test_sel.to_csv("../data/processed/clean_test.csv", index = False)

MODEL TRAINING

In [14]:
train_data = pd.read_csv("../data/processed/clean_train.csv")
test_data = pd.read_csv("../data/processed/clean_test.csv")

X_train = train_data.drop(["Heart disease_number"], axis = 1)
y_train = train_data["Heart disease_number"]
X_test = test_data.drop(["Heart disease_number"], axis = 1)
y_test = test_data["Heart disease_number"]

In [15]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
y_pred = model.predict(X_test)

print(f"Error Cuadrático Medio: {np.sqrt(mean_squared_error(y_test, y_pred))}")
print(f"R2 Score: {r2_score(y_test, y_pred)}")

Error Cuadrático Medio: 4755.442059695504
R2 Score: 0.7416268958253391


Estos datos no son muy precisos vamos a probar con los modelos Lasso y Ridge:

In [17]:
alpha = 1.0
lasso_model = Lasso(alpha = alpha)
ridge_model = Ridge(alpha = alpha)

lasso_model.fit(X_train, y_train)
ridge_model.fit(X_train, y_train)

lasso_y_pred = lasso_model.predict(X_test)
ridge_y_pred = ridge_model.predict(X_test)

print(f"Error cuadrático medio de raiz de Lasso: {np.sqrt(mean_squared_error(y_test, lasso_y_pred))}")
print(f"R2 Score: {r2_score(y_test, lasso_y_pred)}")

print(f"Error cuadrático medio de raiz de Ridge: {np.sqrt(mean_squared_error(y_test, ridge_y_pred))}")
print(f"R2 Score: {r2_score(y_test, ridge_y_pred)}")

Error cuadrático medio de raiz de Lasso: 429.62231187142436
R2 Score: 0.9978911816625889
Error cuadrático medio de raiz de Ridge: 408.9039633176806
R2 Score: 0.9980896710652084


  model = cd_fast.enet_coordinate_descent(


Los datos ahora son mejores, guardamos.

In [19]:
from pickle import dump

with open("../models/lasso-1.0.pkl", "wb") as f:
    dump(lasso_model, f)

with open("../models/ridge-1.0.pkl", "wb") as f:
    dump(ridge_model, f)