In [283]:
from tensorflow.keras.layers import Input, Dense, Embedding, Conv1D, LeakyReLU, Flatten, Dropout
from tensorflow.keras.models import Model

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
import seaborn as sns

In [127]:
df = pd.read_csv('../data/clean_data.csv', index_col='Municipality')

# Get proper label
labels = df.filter(like='catalan')
label = df.filter(like='catalan').filter(like='total')

data = df.drop(list(labels.columns), axis=1)

In [128]:
label[:] = pd.qcut(label['culture_knowledge_of_catalan_total'], q=4, labels=[0,1,2,3])
# label[label['culture_knowledge_of_catalan_total'] == 2] = 1
# label[label['culture_knowledge_of_catalan_total'] == 3] = 2

In [129]:
label_data = pd.get_dummies(label['culture_knowledge_of_catalan_total'])
label_data_size = label_data.shape

In [132]:
data = data.drop(['missing_count'], axis=1)
data['population_population_by_sex_main'] = pd.Categorical(data['population_population_by_sex_main'])
data['population_population_by_sex_main'] = data['population_population_by_sex_main'].cat.codes.astype(float)

data['economic_sectors_head_of_livestock_main'] = pd.Categorical(data['economic_sectors_head_of_livestock_main'])
data['economic_sectors_head_of_livestock_main'] = data['economic_sectors_head_of_livestock_main'].cat.codes.astype(float)

data['economic_sectors_cultivated_land_main'] = pd.Categorical(data['economic_sectors_cultivated_land_main'])
data['economic_sectors_cultivated_land_main'] = data['economic_sectors_cultivated_land_main'].cat.codes.astype(float)

data['culture_sports_facilities_main'] = pd.Categorical(data['culture_sports_facilities_main'])
data['culture_sports_facilities_main'] = data['culture_sports_facilities_main'].cat.codes.astype(float)

In [133]:
def normalize(x):
    return (x - x.min()) / (x.max() - x.min())

In [134]:
for col in data.columns:
    data[col] = normalize(data[col])

In [161]:
train_x, test_x, train_y, test_y = train_test_split(data, label_data, test_size=0.2,random_state=42)
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2,random_state=42)

In [265]:
PARAMS = {'LAYERS': [[1],
                     [10],
                     [100],
                     [100, 10]],
          'ALPHA': [0.1,
                    0.2,
                    0.3,
                    0.4],
          'DROPOUT': [0.1,
                      0.2,
                      0.3,
                      False],
          'BATCH_SIZE': [10,
                         20,
                         30]}

Empty DataFrame
Columns: [Model, F1score - Val, F1score - train, MASK_ZERO, ALPHA, DROPOUT, LAYERS, BATCH_SIZE]
Index: []


In [277]:
DROPOUT = 0.1
ALPHA = 0.1
LAYERS = [100, 10]
BATCH_SIZE = 10
INPUT_SHAPE = data.shape[1]

def build_model(layers, dropout, alpha):
    inpt = Input(shape=INPUT_SHAPE)

    x = Flatten()(inpt)

    for layer in layers:
        x = Dense(layer, activation=LeakyReLU(alpha=alpha))(x)
        if dropout != False:
            x = Dropout(dropout)(x)


    out = Dense(4, activation="softmax")(x)

    model = Model(inpt, out)
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=["accuracy"])

    return model

In [302]:
m = build_model(LAYERS, DROPOUT, ALPHA)
m.summary()
m.fit(train_x, train_y, batch_size=BATCH_SIZE, validation_data=(val_x,val_y), epochs=100)

Model: "model_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_20 (InputLayer)       [(None, 132)]             0         
                                                                 
 flatten_12 (Flatten)        (None, 132)               0         
                                                                 
 dense_52 (Dense)            (None, 100)               13300     
                                                                 
 dropout_6 (Dropout)         (None, 100)               0         
                                                                 
 dense_53 (Dense)            (None, 10)                1010      
                                                                 
 dropout_7 (Dropout)         (None, 10)                0         
                                                                 
 dense_54 (Dense)            (None, 4)                 44 

<keras.callbacks.History at 0x1d36b6343a0>

In [300]:
mlp = MLPClassifier(alpha=0,
                           activation='logistic', max_iter=1000,
                           solver='adam',random_state=42)
mlp.fit(train_x,train_y)
y_pred = mlp.predict(val_x)

In [301]:
def confusion_matrix(pred, true):
    pred = pd.Series(np.argmax(pred, axis=1))
    true = pd.Series(true.columns[np.where(true!=0)[1]])

    true.name = 'target'
    pred.name = 'predicted'
    cm = pd.crosstab(true.reset_index(drop=True), pred.reset_index(drop=True))
    cm = cm[cm.index]
    return cm

In [303]:
confusion_matrix(y_pred, test_y)

KeyError: '[1, 2] not in index'

In [281]:
pred = pd.Series(np.argmax(m.predict(test_x), axis=1))
true = pd.Series(test_y.columns[np.where(test_y!=0)[1]])
f1_score(list(true.values), list(pred.values), average=None)



array([0.74509804, 0.4691358 , 0.45360825, 0.73584906])