#### Importando as bibliotecas necessárias

In [575]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

#### Lendo o arquivo .csv com os dados

In [576]:
data = pd.read_csv('adm_data.csv')  

#### Renomeando colunas para tirar espaços desnecessários e facilitar o acesso a elas

In [577]:
data.rename(columns = {'Chance of Admit ': 'Chance of Admit'}, inplace = True)
data.rename(columns = {'LOR ': 'LOR'}, inplace = True)

#### Visualizando os dados

In [578]:
data.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


#### Retirando a coluna 'Serial No.', que não será utilizada

In [579]:
data1 = data.drop(columns = ['Serial No.'])

#### Definindo features e label

In [580]:
label = ['Chance of Admit']
features = list(set(data1.columns).difference(label))

In [581]:
features

['TOEFL Score',
 'SOP',
 'CGPA',
 'LOR',
 'GRE Score',
 'University Rating',
 'Research']

#### Definindo X e y

In [582]:
X = data[features]
y = data[label]

#### Split dos dados em train e test

In [583]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

#### Scaling nos dados para obter features no intervalo [0, 1]

In [584]:
X_train_scaled = (X_train - X_train.min())/(X_train.max() - X_train.min())
X_test_scaled = (X_test - X_train.min())/(X_train.max() - X_train.min())
# y_train e y_test já estão na escala correta, pois são probabilidades.
y_train_scaled = y_train
y_test_scaled = y_test

#### Criando funções para nos ajudar a definir e treinar o modelo

In [585]:
def create_model(first_layer_units = 32, alpha = 0.1, reg_lambda = 0.1, features = []):
    '''Create a defined model with learning rate = alpha.'''
    # We will be using Tensorflow's Sequencial API to build our model.
    model = tf.keras.models.Sequential()
    
    # Adding first layer with input_dim = number of features and relu activation function 
    # to learn some non-linearities. We will also add L2 regularization in layer's kernel (weights).
    model.add(tf.keras.layers.Dense(units = first_layer_units, input_shape = (len(features), ), 
                                   kernel_regularizer = tf.keras.regularizers.L2(l2 = reg_lambda)))
    
    # Adding output layer with linear activation (linear regression output).
    
    model.add(tf.keras.layers.Dense(units = 1, activation = 'linear'))
    
    # Compiling model: selecting optimizer, loss and metrics.
    
    model.compile(optimizer = tf.keras.optimizers.RMSprop(learning_rate = alpha),
                 loss = 'mean_squared_error',
                 metrics = tf.keras.metrics.MeanSquaredError())
    
    return model

In [586]:
def train_model(model, X_train, y_train, epochs,
               batch_size = None, validation_split = 0.2):
    history = model.fit(x = X_train, y = y_train, batch_size = batch_size,
         epochs = epochs, shuffle = True, 
         validation_split = validation_split)
    
    hist = pd.DataFrame(history.history)
    train_mse = np.array(hist['mean_squared_error'])
    val_mse = np.array(hist['val_mean_squared_error'])
    return train_mse, val_mse

In [587]:
model = create_model(first_layer_units = 32, alpha = 0.001, reg_lambda = 0.05, features = features)

#### Definindo hiperparâmetros

In [588]:
epochs = 30
batch_size = 10

#### Definindo global random state do Tensorflow para gerar resultados reprodutíveis

In [589]:
tf.random.set_seed(0)

#### Treino e validação de nosso primeiro modelo

In [590]:
train_mse, val_mse = train_model(model, X_train_scaled, y_train_scaled, epochs = epochs, batch_size = batch_size)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


- Notamos um comportamento à primeira vista "estranho": os valores da função loss, que foi escolhida como mean_squared_error, está diferente da métrica mean_squared_error. Isso ocorre porque o cálculo de loss ocorre fazendo a média aritmética das somas dos quadrados dos erros e o cálculo da métrica ocorre fazendo a média ponderada desse valor.

In [591]:
def test_evaluation(model, X_test, y_test, batch_size):
    return model.evaluate(X_test, y_test, batch_size = batch_size)

In [592]:
test_evaluation(model, X_test_scaled, y_test_scaled, batch_size = 5)



[0.00607100548222661, 0.004658360965549946]

Parece que o modelo se saiu muito bem, pois os valores das métricas nos datasets de treino, validação e de teste estão muito próximas e bastante pequenas, chegando a valores próximos daqueles que obtivemos por meio do modelo de regressão linear feito from scratch.

#### Agora que encontramos valores bons para os hiperparâmetros, bamos testar  a performance do modelo para vários valores de nodes na first layer do modelo, verificando como o resultado se comporta em função da complexidade.

In [593]:
unit_values = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]

In [594]:
train_final_mse = {}
val_final_mse = {}
test_final_mse = {}
for first_layer_unit in unit_values:
    model = create_model(first_layer_units = first_layer_unit, alpha = 0.001, 
                          reg_lambda = 0.1, features = features)
    train_mse, val_mse = train_model(model, X_train_scaled, y_train_scaled, epochs = epochs, batch_size = batch_size)
    train_final_mse[first_layer_unit] = train_mse[-1]
    val_final_mse[first_layer_unit] = val_mse[-1]
    test_eval = test_evaluation(model, X_test_scaled, y_test_scaled, batch_size = 5)
    test_final_mse[first_layer_unit] = test_eval[1]

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30


Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30


Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30


Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30


Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30




In [595]:
for unit in unit_values:
    print(f'{unit}:')
    print(f'Train final mse: {train_final_mse[unit]}')
    print(f'Val final mse: {val_final_mse[unit]}')
    print(f'Test final mse: {test_final_mse[unit]}')

2:
Train final mse: 0.009028051979839802
Val final mse: 0.009981783106923103
Test final mse: 0.007641382049769163
4:
Train final mse: 0.005511012393981218
Val final mse: 0.0054767238907516
Test final mse: 0.00510556111112237
8:
Train final mse: 0.005059545394033194
Val final mse: 0.004739250522106886
Test final mse: 0.004671732895076275
16:
Train final mse: 0.005383595358580351
Val final mse: 0.004907228518277407
Test final mse: 0.004769839346408844
32:
Train final mse: 0.005644068121910095
Val final mse: 0.004995498340576887
Test final mse: 0.004836257081478834
64:
Train final mse: 0.005997497122734785
Val final mse: 0.005255778320133686
Test final mse: 0.005046100355684757
128:
Train final mse: 0.006620506290346384
Val final mse: 0.00569882383570075
Test final mse: 0.0054243626073002815
256:
Train final mse: 0.007704206742346287
Val final mse: 0.00649326341226697
Test final mse: 0.006070998031646013
512:
Train final mse: 0.00857462827116251
Val final mse: 0.007246413733810186
Test fi

Vemos, através desse experimento, um fato curioso, porém esperado:

- O valor do erro final do modelo tende a ser maior se temos um modelo exageradamente complexo, enquanto que modelos não tão complexos, como para units = 2 e units = 16, o modelo performa relativamente bem. Isso acontece porque quando aumentamos a complexidade do modelo, ele tende a sofrer overfitting nos dados de treino, perdendo sua capacidade de generalização. Além disso, o valor "ótimo" de units é data-dependent, logo, não conseguimos prever antecipadamente qual dos valores de units será o melhor - apenas sabemos que não deve ser um valor extremamente grande, dado que estamos resolvendo um problema em que a regressão linear se encaixa muito bem devido às altas correlações das features com o target.

#### Note que de fato as correlações entre a variável dependente (target) e as features são elevadas:

In [596]:
final_data = X_train.assign(Chance_of_admit  = y_train)
final_data = final_data.rename(columns = {'Chance_of_admit': 'Chance of Admit'})

In [597]:
final_data.corr()['Chance of Admit']

TOEFL Score          0.800120
SOP                  0.678582
CGPA                 0.889916
LOR                  0.681426
GRE Score            0.822623
University Rating    0.732731
Research             0.549642
Chance of Admit      1.000000
Name: Chance of Admit, dtype: float64

### Modelo final

Para nosso modelo final, escolheremos first_layer_units = 16, pois foi o valor que performou melhor.

In [598]:
final_model = create_model(first_layer_units = 16, alpha = 0.001, reg_lambda = 0.05, features = features)

In [599]:
epochs = 30
batch_size = 10

In [600]:
fm_train_mse, fm_val_mse = train_model(final_model, X_train_scaled, y_train_scaled, epochs = epochs, batch_size = batch_size)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [603]:
fm_test_eval = test_evaluation(final_model, X_test_scaled, y_test_scaled, batch_size = 5)
print(f'\nMetrics for our final linear regression model made with NN in Tensorflow Keras API:\n')
print(f'Final model train mse: {fm_train_mse[-1]}')
print(f'Final model val mse: {fm_val_mse[-1]}')
print(f'Final model test mse: {fm_test_eval[1]}')


Metrics for our final linear regression model made with NN in Tensorflow Keras API:

Final model train mse: 0.004744006786495447
Final model val mse: 0.00444501917809248
Final model test mse: 0.004492971580475569
