## Librerias útiles

In [10]:
# Data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

import warnings
warnings.filterwarnings('ignore') # To hide all python warnings

# Visualization
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

# Machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

from scipy import stats # This module contains a large number of probability distribution
from scipy.stats import norm # To analize normal distribution
from sklearn.preprocessing import StandardScaler # Transform data such that its distribution will have a mean value 0 and sd of 1

from IPython.display import clear_output
#!pip3 install -U keras
#!pip3 install -U tensorflow
#clear_output()

# Neural networks
# from keras.utils.np_utils import to_categorical
# from keras.models import Sequential
# from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
# from tensorflow.keras.optimizers import RMSprop
# from keras.preprocessing.image import ImageDataGenerator
# from keras.callbacks import ReduceLROnPlateau

# Cool
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split

# Lazy classifier
from lightgbm import LGBMClassifier
import lazypredict
from lazypredict.Supervised import LazyClassifier


from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')
plt.rcParams.update({'figure.figsize':(10,8), 'figure.dpi':100}) # Setup of plt chars

## 1) Adquisición de los datos

In [None]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

combine = [df_train, df_test] # To run certain operations on both datasets together


## 2) Entender el problema

### 👉 Análisis describiendo los datos

####  📌 Características disponibles en el dataset (columnas)

In [None]:
df_train.columns

##### 🔹 ¿Cuáles son? Categóricas, ordinales, continuas o mixtas 🤔

In [None]:
df_train.info()

##### 🔹 ¿Cuáles podrían tener errores o errores tipográficos?  🤔

##### 🔹 ¿Cuáles podrían tener valores en blanco, nulos o vaciós? 🤔

In [None]:
df_train.isnull().any().describe()

##### 🔹 ¿Qué tipo de dato (string, integer, float, etc..) presentan las variable?

In [None]:
df_train.info()

##### 🔹 ¿Cuál es la distribución de los valores numéricos de las características de la muestra?

In [None]:
# Global
df_train.describe()

# Specific
df_train['Var'].describe()

##### 🔹 ¿Y la distribución para las variables categóricas?

In [None]:
df_train.describe(include=['O'])

##### 🔹 ¿Distribución de las categorías de cada variable categórica en función de la variable dependiente? 

In [None]:
df_train[['VarCategórica', 'VarDependiente']].groupby(['VarCategorica'], as_index=False).mean().sort_values(by='VarDependiente', ascending=False)

### 👉 Análisis visualizando los datos

##### 🔹 Histograma de una variable numérica con la variable dependiente de clasificacion

In [None]:
grid = sb.FacetGrid(df_train, col='Y')
grid.map(plt.hist, 'X', bins=20)

##### 🔹 Histograma de una variable numérica con la variable dependiente de clasificacion pero clasificado por las clases de otra variable

In [None]:
grid = sb.FacetGrid(df_train, col='Y', row='X2', height=2.2, aspect=1.6)
grid.map(plt.hist, 'X', alpha=.5, bins=20)
grid.add_legend();

##### 🔹 Gráfico de puntos entre una variable de clasificación y otra ordinal clasificadas segun las clases de otra variable

In [None]:
# grid = sns.FacetGrid(df_train, col='Embarked')
grid = sb.FacetGrid(df_train, row='Embarked', height=2.2, aspect=1.6)
grid.map(sb.pointplot, 'Pclass', 'Survived', 'Sex', palette='deep', order=[1,2,3], hue_order=["female","male"])
grid.add_legend()

##### 🔹 Gráfico de barras entre una variable numerica y otra binaria pero clasificadas por las combinaciones que resulten entre dos variables categoricas

In [None]:
# grid = sb.FacetGrid(df_train, col='Embarked', hue='Survived', palette={0: 'k', 1: 'w'})
grid = sb.FacetGrid(df_train, row='X', col='Y', height=2.2, aspect=1.6)
grid.map(sb.barplot, 'X2', 'X3', alpha=.5, ci=None, order=["X2-Class1","X2-Class2"])
grid.add_legend()

##### 🔹 Boxplot entre una variable categorica y una variable dependiente numerica

In [None]:
sb.boxplot(x = df_train.Y,
           y = df_train.X)

##### 🔹 Histogram to check distribution of a numerical variable 

In [None]:
sb.distplot(df_train['Var'])

# If you want to check skewness and kurtosis
print("Skewness: %f" % df_train['Var'].skew())
print("Kurtosis: %f" % df_train['Var'].kurt())

##### 🔹 Plot scatter to check relationship with numerical vairables

In [None]:
var = 'X'
data = pd.concat([df_train['Y'], df_train[var]], axis=1)
data.plot.scatter(x=var, y='Y', ylim=(0,800000));

##### 🔹 Una distribución de valores nulos a lo largo?

In [None]:
train_null = pd.DataFrame(train.isna().sum())
train_null = train_null.sort_values(by = 0, ascending = False)[:-1]

fig = make_subplots(rows = 1,
                    cols = 1,
                    column_titles = ["Train Data"],
                    x_title = "Missing Values")

fig.add_trace(go.Bar(x = train_null[0],
                     y = train_null.index,
                     orientation = "h",
                     marker=dict(color=[n for n in range(12)],
                                 line_color='rgb(0,0,0,0)',
                                 line_width = 2,
                                 coloraxis="coloraxis")),
              1,1)

##### 🔹Grafico de cortas?? No problem!!

In [None]:
f = pd.concat([train[FEATURES], test[FEATURES]], axis = 0)

text_features = ["Cabin", "Name"]

# categorical features
cat_features = [col for col in FEATURES if df[col].nunique() < 25 and col not in text_features]
# nunique() -> Count number of distinct elements in specified axis.

# continuos features
cont_features = [col for col in FEATURES if df[col].nunique() >= 25 and col not in text_features]

labels=['Categorical', 'Continuos', 'Text']
values=[len(cat_features), len(cont_features), len(text_features)]
colors=['#DE3163', '#58D68D']

fig = go.Figure(data=[go.Pie(
        labels=labels,
        values=values,
        pull=[0.1, 0],
        marker= dict(colors=colors,
                     line=dict(color='#999000',
                              width=2)))])

fig.show()

##### Y éste? ES MUY BUENO! Histograma en conjunto de dos variables continuas

In [None]:
#  Feature Distribution of Continous Features

# Age distribution

train_age = train.copy()
test_age = test.copy()

train_age["type"] = "Train"
test_age["type"] = "Test"

ageDf = pd.concat([train_age, test_age])
fig = px.histogram(data_frame = ageDf,
                   x = 'Age',
                   color = "type",
                   color_discrete_sequence= ['#58D68D', '#DE3163'],
                   marginal="box",
                   nbins=100,
                   template="plotly_white"
                  )

fig.update_layout(title = "Distribution of Age", title_x = 0.5)
fig.show()

##### Gráfico de tortas? No problem! 

In [None]:
f = pd.concat([train[FEATURES], test[FEATURES]], axis = 0)

text_features = ["Cabin", "Name"]

# categorical features
cat_features = [col for col in FEATURES if df[col].nunique() < 25 and col not in text_features]
# nunique() -> Count number of distinct elements in specified axis.

# continuos features
cont_features = [col for col in FEATURES if df[col].nunique() >= 25 and col not in text_features]

labels=['Categorical', 'Continuos', 'Text']
values=[len(cat_features), len(cont_features), len(text_features)]
colors=['#DE3163', '#58D68D']

fig = go.Figure(data=[go.Pie(
        labels=labels,
        values=values,
        pull=[0.1, 0],
        marker= dict(colors=colors,
                     line=dict(color='#999000',
                              width=2)))])

fig.show()

In [None]:
# Relationship with categorical features
var = 'X'
data = pd.concat([df_train['Y'], df_train[var]], axis=1)
f, ax = plt.subplots(figsize=(8,6))
fig = sb.boxplot(x=var, y='Y', data=data)
fig.axis(ymin=0, ymax=800000)

##### Count plot

In [None]:
g = sb.countplot(x = df_train)
df_train.value_counts()

### 👉 Discusión de los datos

####  📌 Visualizaciones chulas

##### 🔹 Imprimir cosas con colores?

In [None]:
# Quick view of train data

print(f'\033[94mNummber of rows in train data: {train.shape[0]}')
print(f'\033[95mNummber of columns in train data: {train.shape[1]}')
print(f'\033[96mNummber of values in train data: {train.count().sum()}')
print(f'\033[97mNummber of missing values in train data: { sum(train.isna().sum()) }')

##### 🔹  Imprimir los valores nulos en un color especial? 

In [None]:
print(f'\033[94m')
print(train.isnull().sum().sort_values(ascending=False))

##### 🔹  Una tabla con valores coloridos?

In [None]:
train.iloc[:, :-1].describe().T.sort_values(by='std', ascending=False)\
                    .style.background_gradient(cmap='GnBu')\
                    .bar(subset=["max"], color='#BB0000')\
                    .bar(subset=["mean",], color='green')

####  📌 Drops, transformaciones, nuevas variables, etc...

##### 🔹 Quitar alguna variable del dataset que no vaya a ocupar?

In [None]:
df_train = df_train.drop(["X1","X2"], axis=1)

##### 🔹 Remplazar algunos titulos de alguna variable?

In [None]:
dataset['A'] = dataset['A'].replace('B', 'C')

# Si es para una combinacion
for dataset in combine:
    dataset['A'] = dataset['A'].replace('B', 'C')

##### 🔹Convertir una categorica a una ordinal?

In [None]:
title_mapping = { "Mr" : 1, "Miss" : 3, "Master" : 4, "Rare" : 5 }
for dataset in combine:
    dataset["Title"] = dataset["Title"].map(title_mapping)
    dataset["Title"] = dataset["Title"].fillna(0)
    

##### 🔹 Parsear variables categoricas a enteros?

In [None]:
dataset['Sex'] = dataset["Sex"].map( { 'female' : 1, 'male' : 0 } ).astype(int)    

##### 🔹 Crear una nueva variable con bandas o rangos basada en una variable numerica?

In [None]:
df_train['AgeBand'] = pd.cut(df_train['Age'], 5) # To classify ages within one of 5 bins/bands

##### 🔹 Quieres rellenar valores vacíos con la coincidencia mas comun?

In [None]:
freq_port = df_train.Embarked.dropna().mode()[0] # mode() the most current value in Embarked
freq_port

dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)

##### 🔹 Convertir variable categorica en numerica?

In [None]:
dataset['Embarked'] = dataset['Embarked'].map({ 'S':0, 'C':1, 'Q':2 }).astype(int)

##### 🔹 Rellenar valores nulos de una variable numerica con su media o mediana?

In [None]:
df_test['Fare'].fillna(df_test['Fare'].dropna().median(), inplace=True)

##### 🔹 Visualizemos los outliers?

In [None]:
# Standardizing data
saleprice_scaled = StandardScaler().fit_transform(df_train['SalePrice'][:, np.newaxis]) # Converting data values to have mean 0 and sd of 1
low_range = saleprice_scaled[saleprice_scaled[:,0].argsort()][:10] # Argsort return an array index of sort values of first 10 values
high_range = saleprice_scaled[saleprice_scaled[:, 0].argsort()][-10:] # Argsort return an array index of sort values of last 10 values
print(low_range,'\n')
print(high_range,'\n')

### 👉 Correlaciones!!!

##### 🔹Matriz de correlaciones

In [None]:
# Method 1

# import plotly.express as px
fig = px.imshow(train.corr(), text_auto=True, aspect="auto", color_continuous_scale="blackbody")
fig.show()

# Method 2 using seaborn
corrmat = df_train.corr()
f, ax = plt.subplots(figsize=(12,9))
sb.heatmap(corrmat, vmax=.8, square=True)

# Method 3 using seaborn but limiting number of variables (Top 10 more correalted)
k = 10 # Number of variables for heatmap
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index # Sort correlations by highest value
cm = np.corrcoef(df_train[cols].values.T) # Param must be transposed
sb.set(font_scale=1.25) # Font size of graph variables
hm = sb.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size' : 12}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()



##### 🔹¿Quieres ver un scatter plot múltiple?

In [None]:
# scatterplot
sb.set() # Apply default seaborn theme
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
sb.pairplot(df_train[cols], size = 2.5)
plt.show()

### 👉 Data preprocesing

### 👉 Modeling

In [3]:
TARGET = 'status'
FEATURES = [col for col in df.columns if col != TARGET]
RANDOM_STATE = 12

X = df.drop(TARGET, axis = 1)
y = df[TARGET]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=12,
                                                    test_size= 0.33)
clf = LazyClassifier(verbose=0,
                     ignore_warnings=True,
                     custom_metric=None,
                     predictions=False,
                     random_state=12,
                     classifiers='all')

models, predictions = clf.fit(X_train, X_test, y_train, y_test)
clear_output()



NameError: name 'df' is not defined

In [None]:
models[:15]