# APRENDIZAJE SUPERVISADO

## Importación de librerías

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    plot_confusion_matrix,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

## Lectura del dataset

In [None]:
# Remote CSV locations of the train and test splits.
URL_TRAIN_DATA = "https://www.famaf.unc.edu.ar/~nocampo043/travel_insurance_prediction_train.csv"
URL_TEST_DATA = "https://www.famaf.unc.edu.ar/~nocampo043/travel_insurance_prediction_test.csv"

# Read both splits (train first, then test) into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_url) for csv_url in (URL_TRAIN_DATA, URL_TEST_DATA)
)

In [None]:
# Print column names, non-null counts and dtypes of the training split.
df_train.info()

In [None]:
# Print column names, non-null counts and dtypes of the test split.
df_test.info()

In [None]:
# Preview only the first rows instead of dumping the whole training frame.
df_train.head()

In [None]:
# Preview only the first rows instead of dumping the whole test frame.
df_test.head()

## Análisis descriptivo

In [None]:
# Summary statistics of the numeric columns, rounded to whole numbers.
summary_stats = df_train.describe()
summary_stats.round(0)

### Variable objetivo o target: Travel Insurance

In [None]:
# Number of rows per value of the target column (class balance).
df_train["TravelInsurance"].value_counts()

In [None]:
# Bar chart of the target class counts, drawn on an explicit 5x5 figure/axes.
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(x="TravelInsurance", data=df_train, ax=ax)

### Correlación variables

In [None]:
# Correlation of the numeric explanatory variables with the target.
# The target column must be part of the frame passed to .corr(); otherwise
# selecting it from the correlation matrix raises KeyError.
target_col = "TravelInsurance"
corr = (
    df_train[["AnnualIncome", "Age", target_col]]
    .corr()
    .round(2)
    .loc[:, [target_col]]
    # Drop the target's correlation with itself (always 1.0).
    .drop(index=target_col)
    # .assign returns a new frame, avoiding SettingWithCopyWarning.
    .assign(abs_corr=lambda frame: frame[target_col].abs())
    .sort_values(by="abs_corr", ascending=False)
)
corr

In [None]:
# Heatmap of pairwise correlations between the numeric columns.
plt.figure(figsize=(10, 10))
# Restrict to numeric columns explicitly: calling .corr() on a frame that
# still contains object (string) columns raises on pandas >= 2.0.
corr = df_train.select_dtypes(include="number").corr()
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    annot=True,
    cmap='coolwarm',
)
plt.show()

### Variable explicativa: Age

En primer lugar, observamos la distribución de la variable Age en el df_train.

In [None]:
# Count of customers per age, split by the target class.
sns.countplot(
    x="Age",
    hue="TravelInsurance",
    data=df_train,
)

In [None]:
# Descriptive statistics of Age within each target class.
age_by_target = df_train.groupby("TravelInsurance")[["Age"]]
age_by_target.describe()

Podemos observar que la cantidad de clientes que contratan (o no) el seguro fluctúa con la edad.

### Variable explicativa: AnnualIncome

En primer lugar, observamos la distribución de la variable Annual Income en el df_train.

In [None]:
# Distribution of AnnualIncome over the whole training split.
fig, ax = plt.subplots(figsize=(8, 8))
sns.boxenplot(x="AnnualIncome", data=df_train, ax=ax)
# Show plain numbers on the income axis instead of scientific notation.
ax.ticklabel_format(style='plain', axis='x')

In [None]:
# Distribution of AnnualIncome conditioned on the target class.
fig, ax = plt.subplots(figsize=(8, 8))
sns.boxenplot(x="TravelInsurance", y="AnnualIncome", data=df_train, ax=ax)
# Show plain numbers on the income axis instead of scientific notation.
ax.ticklabel_format(style='plain', axis='y')

In [None]:
# Descriptive statistics of AnnualIncome within each target class.
income_by_target = df_train.groupby("TravelInsurance")[["AnnualIncome"]]
income_by_target.describe()

Podemos observar que la distribución de la variable Annual Income se ve afectada en gran medida al condicionarla por las distintas clases de la variable target. Si bien los valores mínimos y máximos son similares, la media y la mediana difieren considerablemente, como así también el rango intercuartil.
Es una variable que se considera importante para explicar el comportamiento de Y.

### Variable explicativa: Employment Type

In [None]:
# A bare expression is only rendered when it is the last line of a cell, so
# the value counts are shown explicitly with display().
display(df_train["Employment Type"].value_counts())
# Cross-tabulation of the target against GraduateOrNot (kept from the
# original cell; rendered as the cell's output).
pd.crosstab(df_train["TravelInsurance"], df_train["GraduateOrNot"])

In [None]:
# Target class counts split by employment type, followed by the same
# information as a contingency table.
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(x="TravelInsurance", hue="Employment Type", data=df_train, ax=ax)
pd.crosstab(index=df_train["TravelInsurance"],
            columns=df_train["Employment Type"])

### Variable explicativa: Graduate Or Not

In [None]:
# Number of rows per value of the GraduateOrNot column.
df_train["GraduateOrNot"].value_counts()

In [None]:
# Target class counts split by whether the customer is a graduate.
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(x="TravelInsurance", hue="GraduateOrNot", data=df_train, ax=ax)

In [None]:
# Preview only the first rows instead of dumping the whole training frame.
df_train.head()

### Variable explicativa: Frequent Flyer

In [None]:
# Number of rows per value of the FrequentFlyer column.
df_train["FrequentFlyer"].value_counts()

### Variable explicativa: Ever Travelled Abroad

In [None]:
# Number of rows per value of the EverTravelledAbroad column.
df_train["EverTravelledAbroad"].value_counts()

### Variable explicativa: Employment Type

In [None]:
# Bar chart with the number of customers per employment type.
sns.countplot(
    x="Employment Type",
    data=df_train,
)

Para el entrenamiento de los modelos no utilizaremos como variables explicativas las columnas Customer y TravelInsurance (esta última es el target).

In [None]:
# Target vector: the TravelInsurance column.
Y_train_total = df_train["TravelInsurance"]
# Feature frame: every column except the Customer column and the target.
X_train_total = df_train.drop(columns=["Customer", "TravelInsurance"])

### Variable explicativa: Family Members

In [None]:
# List the column labels of the training split.
df_train.columns

In [None]:
# Stacked histogram of FamilyMembers, coloured by the target class.
plt.figure(figsize=(10, 8))
sns.histplot(
    x="FamilyMembers",
    hue='TravelInsurance',
    multiple="stack",
    data=df_train,
)

### Variable explicativa: Chronic Diseases

In [None]:
# Count of customers with / without chronic diseases, split by target class.
plt.figure(figsize=(10, 8))
ax = sns.countplot(data=df_train, x="ChronicDiseases", hue="TravelInsurance")
# get_legend_handles_labels() returns (handles, labels); the handles are kept
# and the default labels are replaced by readable ones.
legend_handles, _ = ax.get_legend_handles_labels()
# NOTE(review): assumes the hue levels are ordered 0 (not bought), 1 (bought)
# -- confirm against the values of TravelInsurance.
ax.legend(legend_handles, ['Not bought', 'Bought'],
          title_fontsize=18,
          fontsize=15,
          bbox_to_anchor=(1, 1),
          title='Travel Insurance')

### Variable explicativa: FrequentFlyer

In [None]:
# Target class counts split by frequent-flyer status, followed by the same
# information as a contingency table.
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(x="TravelInsurance", hue="FrequentFlyer", data=df_train, ax=ax)

pd.crosstab(index=df_train["TravelInsurance"],
            columns=df_train["FrequentFlyer"])

Se puede observar en este gráfico que, entre quienes no son viajeros
frecuentes, la cantidad de clientes que contratan un seguro es baja; en cambio,
entre los viajeros frecuentes, las cantidades de quienes contratan y no contratan son parejas.

### Variable explicativa: EverTravelledAbroad

In [None]:
# Target class counts split by EverTravelledAbroad, followed by the same
# information as a contingency table.
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(x="TravelInsurance", hue="EverTravelledAbroad", data=df_train, ax=ax)

pd.crosstab(index=df_train["TravelInsurance"],
            columns=df_train["EverTravelledAbroad"])

## Encoding variables

In [None]:
# Labels of the feature columns stored as 64-bit integers or floats.
numeric_frame = X_train_total.select_dtypes(include=["int64", "float64"])
numerical_cols = numeric_frame.columns

In [None]:
# Show which feature columns were detected as numerical.
numerical_cols

In [None]:
# Labels of the feature columns stored with object dtype.
object_frame = X_train_total.select_dtypes(include="object")
categorical_cols = object_frame.columns

In [None]:
# Show which feature columns were detected as categorical.
categorical_cols

In [None]:
# Fit a one-hot encoder on the categorical feature columns.
# OneHotEncoder is imported in the notebook's top import cell.
# `sparse=False` makes transform() return a dense array.
# NOTE(review): scikit-learn >= 1.2 renames this argument to `sparse_output`;
# update it together with the other imports if the environment is upgraded.
encoder = OneHotEncoder(sparse=False)
encoder.fit(X_train_total[categorical_cols])
# Inspect the categories found by the encoder.
encoder.categories_

In [None]:
# Apply the fitted encoder to the categorical columns and preview 10 rows.
categorical_frame = X_train_total[categorical_cols]
encoded_types = encoder.transform(categorical_frame)
encoded_types[:10]

In [None]:
# Numeric feature values as a NumPy array.
# Select from df_train rather than from X_train_total itself, so the cell
# does not depend on its own previous output and can be re-run safely
# (numerical_cols are labels of columns that also exist in df_train).
X_train_total = df_train[numerical_cols].to_numpy()
X_train_total[:10]

In [None]:
# Column-wise concatenation: one-hot columns first, numeric columns after.
X_train_enc = np.concatenate([encoded_types, X_train_total], axis=1)
X_train_enc[:5]

In [None]:
# Dimensions (rows, columns) of the combined feature matrix.
X_train_enc.shape

## Creación del train y validation

In [None]:
# Hold out 20% of the rows for validation (fixed seed for reproducibility).
# Split the full encoded matrix X_train_enc (one-hot + numeric columns);
# splitting X_train_total would silently drop every categorical feature,
# because by this point it only holds the numeric columns.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X_train_enc,
    Y_train_total,
    test_size=0.2,
    random_state=0,
)

## Modelos propuestos

## Primer modelo de prueba: Regresión Logística

In [None]:
# Pipeline: standardise the features, then fit a logistic regression.
scaler_step = ('scaler', StandardScaler())
lreg_step = ('lreg', LogisticRegression(random_state=0))
pipe = Pipeline([scaler_step, lreg_step])

pipe.fit(X_train, Y_train)

In [None]:
# Class labels known to the fitted logistic-regression step.
pipe['lreg'].classes_

In [None]:
# Hyperparameters of the logistic-regression step.
pipe['lreg'].get_params()

In [None]:
# Pipeline predictions for the training split and the validation split.
Y_train_predic_lreg, Y_val_pred_lreg = (
    pipe.predict(features) for features in (X_train, X_valid)
)

In [None]:
# Banner (title framed by '=' rules) followed by the per-class metrics of the
# logistic regression on the training split.
text = "Logistic Regression - Reporte de clasificación del conjunto de train"
rule = "=" * len(text)
print(rule, text, rule, sep="\n")
print(classification_report(Y_train, Y_train_predic_lreg))

In [None]:
# Banner (title framed by '=' rules) followed by the per-class metrics of the
# logistic regression on the validation split.
text = "Logistic Regression - Reporte de clasificación del conjunto de validation"
rule = "=" * len(text)
print(rule, text, rule, sep="\n")
print(classification_report(Y_valid, Y_val_pred_lreg))

## Segundo modelo de prueba: Árbol de decisión

In [None]:
# Fit a decision tree with a fixed seed; fit() returns the estimator itself,
# so it can be bound directly and then shown as the cell output.
clf_tree = DecisionTreeClassifier(random_state=0).fit(X_train, Y_train)
clf_tree

In [None]:
# Decision-tree predictions for the training split and the validation split.
Y_train_pred, Y_valid_pred = (
    clf_tree.predict(features) for features in (X_train, X_valid)
)

In [None]:
# Accuracy of the decision tree on the training and validation splits.
train_acc = accuracy_score(Y_train, Y_train_pred)
valid_acc = accuracy_score(Y_valid, Y_valid_pred)
# `.2f` prints exactly two decimals; the bare `0.2` spec means two significant
# digits, which gives inconsistent widths (e.g. 0.083 vs 0.83).
print(f'Train accuracy: {train_acc:0.2f}')
# This score comes from the validation split, not from the test set.
print(f'Validation accuracy: {valid_acc:0.2f}')