Importar librerías necesarias

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler

import warnings
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below -- consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

## 1. Recolección de la data

In [None]:
# Read the raw CSV (path relative to the working directory) into `df`, the
# frame every later cell works on.
df = pd.read_csv("raw_dataset.csv")
# Last expression of the cell: shows the first rows as a quick sanity check.
df.head()

## 2. Preparación / preprocesamiento de la data

##### a. Eliminación de características redundantes o innecesarias


In [None]:
# Remove rows that are exact duplicates of an earlier row (the first
# occurrence is kept).
df = df.drop_duplicates()

##### b. Limpieza de filas nulas, vacías o con error

In [None]:
# Turn the textual placeholders for "no value" into real missing values so
# that dropna() can detect them.
df = df.replace(["", " ", "?", "None", "N/A", "na"], pd.NA)

# Drop every row with at least one missing value and renumber the index.
df_cleaned = df.dropna().reset_index(drop=True)

# Continue the pipeline with the cleaned rows. Previously the cleaned frame
# was built but never used, so incomplete rows still reached the encoding,
# scaling and training steps through `df`. A copy is used so later in-place
# edits of `df` do not alter `df_cleaned`.
df = df_cleaned.copy()

##### c. Encoder o codificador a las características no numéricas

In [None]:
label_encoder = LabelEncoder()

# Binary categories -> 0/1 flags.
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: that chained in-place form acts on a
# temporary Series under pandas Copy-on-Write and leaves `df` untouched.
# Values not present in the mapping become NaN.
df["smoker"] = df["smoker"].map({"yes": 1, "no": 0})
df["sex"] = df["sex"].map({"male": 1, "female": 0})

# Region has more than two categories: encode each label as an integer code.
df["region"] = label_encoder.fit_transform(df["region"])

##### d. Normalizar y estandarizar la data con un escalador de datos

In [None]:
# Select the numeric columns; any non-numeric column is left out from here on.
num_data = df.select_dtypes(include="number")

# Standardised copy of the numeric columns (per-column zero mean and unit
# variance), kept under its own name.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(num_data)
df_scaled = pd.DataFrame(scaled_data, columns=num_data.columns)

# Keep `df` in its original units. Overwriting it with the standardised frame
# broke every later step that compares against raw values (smoker == 1,
# age < 25, bmi >= 30, children >= 3 do not match z-scores as intended), and
# it also standardised the target column `charges`.
# The index is reset so `df` and `df_scaled` share the same 0..n-1 row labels.
# NOTE(review): the scaler is fitted on all rows; for scale-sensitive models
# fit it on the training split only to avoid leaking test information.
df = num_data.reset_index(drop=True)

## 3. Análisis descriptivo de la data (EDA)

##### a. Análisis de la data con gráficas

In [None]:
# Histogram of the `age` column with a KDE curve on top.
plt.figure(figsize=(12, 6))
age_axes = sns.histplot(data=df, x="age", kde=True, color="skyblue")
age_axes.set_title('Distribución de la edad de los contratistas', fontsize=16)
age_axes.set_xlabel('Age', fontsize=12)
age_axes.set_ylabel('Frequency', fontsize=12)
plt.show()

In [None]:
# Side-by-side distribution of `charges` for rows with smoker == 1 and
# smoker == 0.
f = plt.figure(figsize=(12, 6))

# Left panel: smoker == 1.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot and
# matches the histplot calls used elsewhere in this notebook.
ax = f.add_subplot(121)
sns.histplot(
    df.loc[df["smoker"] == 1, "charges"],
    kde=True, stat="density", kde_kws=dict(cut=3), color='b', ax=ax,
)
ax.set_title('Distribución de cargos para fumadores')

# Right panel: smoker == 0. The title used to repeat the smokers' title.
ax = f.add_subplot(122)
sns.histplot(
    df.loc[df["smoker"] == 0, "charges"],
    kde=True, stat="density", kde_kws=dict(cut=3), color='r', ax=ax,
)
ax.set_title('Distribución de cargos para no fumadores')
plt.show()

In [None]:
# Line plot of `children` (y) against `age` (x).
# The title and y-label were copied from the age histogram and described a
# different chart; they now describe what is actually drawn.
plt.figure(figsize=(12, 5))
sns.lineplot(data=df, x="age", y="children", color='skyblue')
plt.title('Número de hijos según la edad de los contratistas', fontsize=16)
plt.xlabel('Age', fontsize=12)
plt.ylabel('Children', fontsize=12)
plt.show()

##### b. Interpretación de las estadísticas de los datos

In [None]:
# `data` is an alias of `df` used by the plotting cells below.
data = df

# Count of rows per smoker value, split by sex.
# catplot is figure-level and creates its own figure, so a preceding
# plt.figure(figsize=(6, 6)) only produced an extra empty figure; the size is
# requested through height/aspect instead.
sns.catplot(
    x="smoker", kind="count", hue="sex", palette="pink", data=data,
    height=6, aspect=1,
)
plt.show()

In [None]:
# Violin plot of charges by smoker value, restricted to rows with sex == 1.
# `x` is given as a column name so it is read from the same filtered frame as
# `y`; passing the unfiltered data["smoker"] Series mixed two row sets.
sns.violinplot(
    data=data[data["sex"] == 1], x="smoker", y="charges", palette="magma"
)

##### c. Interpretación de patrones de los datos con consultas y métodos de visualización.

In [None]:
# Total of the rounded charges for every (sex, smoker, region) combination.
# The rounding is applied to a temporary copy: writing it back into
# df["charges"] altered the column that is later used as the training target.
(
    df.assign(charges=df["charges"].round())
    .groupby(["sex", "smoker", "region"])["charges"]
    .sum()
    .reset_index()
)

In [None]:
# Display the rows whose age value is below 25.
df.loc[df["age"] < 25]

In [None]:
# Charges against age for rows with age < 25 and sex == 0, drawn once for
# smoker == 1 and once for smoker == 0 on the same axes.
under_25_sex0 = (df["age"] < 25) & (df["sex"] == 0)
sns.lineplot(data=df[under_25_sex0 & (df["smoker"] == 1)], x="age", y="charges")
sns.lineplot(data=df[under_25_sex0 & (df["smoker"] == 0)], x="age", y="charges")

In [None]:
smokers = data[data["smoker"] == 1]
non_smokers = data[data["smoker"] == 0]

# Joint distribution of age and charges for smoker == 1. jointplot is
# figure-level and builds its own figure.
sns.jointplot(data=smokers, x="age", y="charges", color="m")

# Comparison of both groups on a separate figure. Without a new figure the
# scatter points were drawn onto the axes left active by jointplot, on top of
# the joint plot itself.
plt.figure()
sns.scatterplot(data=smokers, x="age", y="charges", color="r")
sns.scatterplot(data=non_smokers, x="age", y="charges", color="b")
plt.show()

In [None]:
# Regression fit of charges on age, one line per smoker value.
# lmplot returns a FacetGrid; the title is set on that grid's own axes. The
# previous `ax.set_title(...)` used an `ax` left over from an earlier cell,
# so the title landed on an unrelated plot.
smoker_fit = sns.lmplot(
    x="age", y="charges", hue="smoker", data=data, palette='inferno_r'
)
smoker_fit.ax.set_title('Smokers and non-smokers')
plt.show()

In [None]:
# Box plot of charges for each region code.
sns.boxplot(data=data, x="region", y="charges")
plt.show()

In [None]:
# Distribution of the bmi column (density histogram with a KDE curve).
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
plt.figure(figsize=(12, 5))
plt.title("Distribution of bmi")
ax = sns.histplot(
    data["bmi"], kde=True, stat="density", kde_kws=dict(cut=3), color='c'
)

In [None]:
# Scatter of charges against bmi, points coloured by smoker value.
sns.scatterplot(data=data, x="bmi", y="charges", hue="smoker", color="c")

In [None]:
# Side-by-side distribution of charges for bmi >= 30 and bmi < 30.
f = plt.figure(figsize=(12, 6))

# Left panel: bmi >= 30. The filter used to read `bmi >= 3`, which did not
# match the panel title.
ax = f.add_subplot(121)
sns.histplot(
    df.loc[df["bmi"] >= 30, "charges"],
    kde=True, stat="density", kde_kws=dict(cut=3), color='b', ax=ax,
)
ax.set_title('Distribution of charges for Bmi >= 30')

# Right panel: the complementary group. Strict `<` keeps the two groups
# disjoint, and the title no longer repeats the left panel's.
ax = f.add_subplot(122)
sns.histplot(
    df.loc[df["bmi"] < 30, "charges"],
    kde=True, stat="density", kde_kws=dict(cut=3), color='c', ax=ax,
)
ax.set_title('Distribution of charges for Bmi < 30')
plt.show()

In [None]:
# Count of rows per smoker value, split by sex, for rows with children > 0.
# catplot returns a FacetGrid; the title is set on that grid's axes instead of
# on an `ax` left over from an earlier cell.
children_grid = sns.catplot(
    x="smoker", kind="count", palette="rainbow", hue="sex",
    data=data[data["children"] > 0],
)
children_grid.ax.set_title('Smokers and non-smokers who have childrens')

In [None]:
# Histogram (with KDE) of charges for rows with children >= 3.
sns.histplot(df.loc[df["children"] >= 3, "charges"], kde=True, color="b")

In [None]:
# Histogram (with KDE) of charges for rows with children < 3.
sns.histplot(df.loc[df["children"] < 3, "charges"], kde=True, color="c")

## 4. Entrenamiento del modelo

##### Ordinary Least Squares Regression

In [None]:
# Target is the `charges` column; every other column of `df` is a predictor.
y = df["charges"]
X = df.drop(columns="charges")

# Hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit ordinary least squares on the training split.
lr_model = LinearRegression().fit(X_train, y_train)

# Predictions for both splits.
y_train_pred = lr_model.predict(X_train)
y_test_pred = lr_model.predict(X_test)

# Test-set metrics: the estimator's own score plus MSE and R² computed from
# the predictions.
test_score = lr_model.score(X_test, y_test)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

for metric_label, metric_value in (
    ("Score (R²):", test_score),
    ("Error Cuadrático Medio (MSE):", mse_test),
    ("Coeficiente de Determinación (R²):", r2_test),
):
    print(metric_label, metric_value)

##### Ridge Regression

##### Bayesian Regression

##### Lasso Regression

##### Nearest Neighbors Regression

##### Random Forest Regression

In [None]:
# Random forest with 100 trees, a fixed seed and n_jobs=-1, trained on the
# split created in the linear-regression cell.
rf_settings = {"n_estimators": 100, "random_state": 1, "n_jobs": -1}
rf_model = RandomForestRegressor(**rf_settings)
rf_model.fit(X_train, y_train)

# Predictions for both splits.
y_train_pred_rf = rf_model.predict(X_train)
y_test_pred_rf = rf_model.predict(X_test)

# Test-set metrics: the estimator's own score plus MSE and R² computed from
# the predictions.
test_score_rf = rf_model.score(X_test, y_test)
mse_test_rf = mean_squared_error(y_test, y_test_pred_rf)
r2_test_rf = r2_score(y_test, y_test_pred_rf)

for rf_metric_label, rf_metric_value in (
    ("Score (R²):", test_score_rf),
    ("Error Cuadrático Medio (MSE):", mse_test_rf),
    ("Coeficiente de Determinación (R²):", r2_test_rf),
):
    print(rf_metric_label, rf_metric_value)

##### SVM (Support Vector Machine) Regression

##### Neural Network MLP Regression

## 5. Validación y testeo del modelo

## 6. Despliegue del modelo y comprobación con data recién creada