In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt

In [2]:
# Cargar el archivo de datos en un DataFrame de Pandas
df = pd.read_csv('./titanic.csv')

In [None]:
# Mostrar el DataFrame
df

In [None]:
print(df)

In [None]:
# Concoer el tamaño del DataFrame
df.shape

In [None]:
# Conocer las columnas del DataFrame
df.columns

In [None]:
# Concoer los tipos de datos del DataFrame
df.dtypes

In [None]:
# Obtener información básica de los datos
df.describe()

In [None]:
# Obtener descripciones de datos no numéricos
df.describe(include='all')

In [3]:
# Eliminación de columnas del DataFrame
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [4]:
# Eliminar filas con valores perdidos
df = df.dropna()
df.shape

(712, 8)

In [5]:
# Convertir valores categóricos a numéricos
df['Sex'] = df['Sex'].astype('category').cat.codes
df['Embarked'] = df['Embarked'].astype('category').cat.codes
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.2500,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.9250,2
3,1,1,0,35.0,1,0,53.1000,2
4,0,3,1,35.0,0,0,8.0500,2
...,...,...,...,...,...,...,...,...
885,0,3,0,39.0,0,5,29.1250,1
886,0,2,1,27.0,0,0,13.0000,2
887,1,1,0,19.0,0,0,30.0000,2
889,1,1,1,26.0,0,0,30.0000,0


In [6]:
# Guardar el DataFrame en un archivo
df.to_csv('./titanic_ml.csv', index=False)

In [7]:
# Función que recibe un DataFrame, una proporcion y el nombre de la clase,
# y genera cuatro conjuntos de datos para entrenamiento
# y pruebas del algoritmo de aprendizaje
from sklearn.model_selection import train_test_split

def split_label(df, test_size, label):
    train, test = train_test_split(df, test_size=test_size)
    features = df.columns.drop(label)
    train_X = train[features]
    train_Y = train[label]
    test_X = test[features]
    test_Y = test[label]
    return train_X, train_Y, test_X, test_Y

In [8]:
# Dividir el conjunto de datos de pasajeros del Titanic
train_X, train_Y, test_X, test_Y = split_label(df, 0.2, 'Survived')

In [9]:
# Aplicar one hot encodign a la columna "Embarked"
from sklearn.preprocessing import OneHotEncoder

one = OneHotEncoder(handle_unknown='ignore')

result = one.fit_transform(train_X['Embarked'].values.reshape(-1, 1)).toarray()
train_X_1 = train_X
train_X_1[['C', 'Q', 'S']] = pd.DataFrame(result, index = train_X_1.index)
train_X_1

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,C,Q,S
535,2,0,7.0,0,2,26.2500,2,0.0,0.0,1.0
321,3,1,27.0,0,0,7.8958,2,0.0,0.0,1.0
556,1,0,48.0,1,0,39.6000,0,1.0,0.0,0.0
209,1,1,40.0,0,0,31.0000,0,1.0,0.0,0.0
234,2,1,24.0,0,0,10.5000,2,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
651,2,0,18.0,0,1,23.0000,2,0.0,0.0,1.0
328,3,0,31.0,1,1,20.5250,2,0.0,0.0,1.0
663,3,1,36.0,0,0,7.4958,2,0.0,0.0,1.0
310,1,0,24.0,0,0,83.1583,0,1.0,0.0,0.0


In [10]:
# Aplicar escalamiento [0,1] a todos los valores
from sklearn.preprocessing import MinMaxScaler
min_max = MinMaxScaler()
train_X_2 = min_max.fit_transform(train_X_1.values)
train_X_2 = pd.DataFrame(train_X_2, columns=train_X_1.columns)
train_X_2

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,C,Q,S
0,0.5,0.0,0.086322,0.0,0.333333,0.051237,1.0,0.0,0.0,1.0
1,1.0,1.0,0.359062,0.0,0.000000,0.015412,1.0,0.0,0.0,1.0
2,0.0,0.0,0.645438,0.2,0.000000,0.077294,0.0,1.0,0.0,0.0
3,0.0,1.0,0.536343,0.0,0.000000,0.060508,0.0,1.0,0.0,0.0
4,0.5,1.0,0.318151,0.0,0.000000,0.020495,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
564,0.5,0.0,0.236329,0.0,0.166667,0.044893,1.0,0.0,0.0,1.0
565,1.0,0.0,0.413610,0.2,0.166667,0.040062,1.0,0.0,0.0,1.0
566,1.0,1.0,0.481795,0.0,0.000000,0.014631,1.0,0.0,0.0,1.0
567,0.0,0.0,0.318151,0.0,0.000000,0.162314,0.0,1.0,0.0,0.0


In [11]:
from sklearn.svm import SVC
clf = SVC()
clf.fit(train_X_2, train_Y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [12]:
result = one.fit_transform(test_X['Embarked'].values.reshape(-1, 1)).toarray()
test_X_1 = test_X
test_X_1[['C', 'Q', 'S']] = pd.DataFrame(result, index = test_X_1.index)
test_X_2 = min_max.fit_transform(test_X_1.values)
test_X_2 = pd.DataFrame(test_X_2, columns=test_X_1.columns)
clf.predict(test_X_2)

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], dtype=int64)

In [13]:
clf.score(test_X_2, test_Y)

0.8181818181818182

In [14]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
clf.fit(train_X_2, train_Y)
clf.score(test_X_2, test_Y)

0.8321678321678322

In [15]:
# Transformación del DataFrame original
result = one.fit_transform(df['Embarked'].values.reshape(-1, 1)).toarray()
df_1 = df
df_1[['C', 'Q', 'S']] = pd.DataFrame(result, index = df_1.index)
df_2 = min_max.fit_transform(df_1.values)
df_2 = pd.DataFrame(df_2, columns=df_1.columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [16]:
# Uso de clasificación por KMeans
from sklearn.cluster import KMeans
clu = KMeans(n_clusters=3)
clu.fit(df_2)
clu.cluster_centers_

array([[-2.60902411e-15,  7.77628032e-01,  8.49056604e-01,
         3.75477998e-01,  1.11051213e-01,  6.01976640e-02,
         3.85185852e-02,  9.73045822e-01,  1.22124533e-15,
         5.39083558e-02,  9.46091644e-01],
       [ 9.90521327e-01,  4.95260664e-01,  3.27014218e-01,
         3.43576883e-01,  9.95260664e-02,  9.47867299e-02,
         7.78150129e-02,  9.81042654e-01,  3.33066907e-16,
         3.79146919e-02,  9.62085308e-01],
       [ 6.07692308e-01,  3.73076923e-01,  5.30769231e-01,
         3.81939799e-01,  8.46153846e-02,  6.92307692e-02,
         1.33306411e-01,  1.55431223e-15,  1.00000000e+00,
        -9.02056208e-17,  1.22124533e-15]])

In [17]:
clu.labels_

array([0, 2, 1, 1, 0, 0, 0, 1, 2, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 2, 1, 0, 0, 0, 2, 1, 2, 1, 2, 1,
       0, 2, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 2, 2, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0,
       2, 0, 0, 0, 2, 0, 0, 2, 1, 0, 2, 1, 0, 2, 0, 0, 1, 0, 2, 1, 0, 0,
       2, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 2, 0, 2, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       2, 2, 0, 0, 0, 0, 2, 1, 0, 0, 2, 1, 2, 0, 1, 0, 0, 2, 1, 0, 2, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 2, 1, 2, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2,
       1, 0, 0, 1, 0, 0, 0, 1, 2, 1, 0, 1, 1, 1, 2, 2, 0, 0, 2, 1, 2, 0,
       1, 2, 2, 2, 2, 2, 0, 0, 0, 1, 1, 0, 1, 2, 0, 0, 1, 1, 2, 0, 1, 1,
       2, 0, 0, 0, 0, 2, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 2, 0, 0, 1,
       0, 0, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2, 0, 1, 2,

In [18]:
from sklearn.metrics import silhouette_score
silhouette_score(df_2, clu.labels_)

0.4193037614631971

In [19]:
from sklearn.metrics import calinski_harabasz_score
calinski_harabasz_score(df_2, clu.labels_)

454.4468770622059