## Bibliotecas

In [323]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Importando o dataset

In [324]:
# Two headers in the CSV carry a trailing space; rename both in a single pass
# so later cells can refer to them by their clean names.
data = pd.read_csv('adm_data.csv').rename(
    columns={'Chance of Admit ': 'Chance of Admit', 'LOR ': 'LOR'}
)

In [325]:
data.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [326]:
# Columns fed to the model as inputs, in the order the weights are learned.
features = [
    'GRE Score',
    'TOEFL Score',
    'University Rating',
    'SOP',
    'LOR',
    'CGPA',
    'Research',
]
# Column the model is trained to predict.
label = 'Chance of Admit'

### Mudando a escala dos dados para [0, 1]

Transformaremos [min, max] -> [0, 1] para cada coluna.

In [327]:
def scale(feature):
    '''Min-max scale a pandas Series so that its values span [0, 1].

    Each value x is mapped to (x - min) / (max - min), using the Series'
    own minimum and maximum. The index of the input is preserved.

    Args:
        feature: numeric pandas Series.

    Returns:
        A new Series with the rescaled values. A constant Series
        (max == min) is mapped to all zeros instead of dividing by zero.
    '''
    minimum = feature.min()
    span = feature.max() - minimum
    shifted = feature - minimum
    if span == 0:
        # Every value equals the minimum, so there is no range to normalise by.
        return shifted.astype(float)
    return shifted / span

In [328]:
# Build a new frame in which every feature column is replaced by its
# rescaled version; the remaining columns (including the label) are kept as they are.
data_scaled = data.assign(**{name: scale(data[name]) for name in features})

In [329]:
data_scaled.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,0.94,0.928571,0.75,0.875,0.875,0.913462,1.0,0.92
1,2,0.68,0.535714,0.75,0.75,0.875,0.663462,1.0,0.76
2,3,0.52,0.428571,0.5,0.5,0.625,0.384615,1.0,0.72
3,4,0.64,0.642857,0.5,0.625,0.375,0.599359,1.0,0.8
4,5,0.48,0.392857,0.25,0.25,0.5,0.451923,0.0,0.65


### Correlação dos dados com a chance de admissão

In [350]:
data_scaled.corr()['Chance of Admit']

Serial No.           0.042336
GRE Score            0.802610
TOEFL Score          0.791594
University Rating    0.711250
SOP                  0.675732
LOR                  0.669889
CGPA                 0.873289
Research             0.553202
Chance of Admit      1.000000
Name: Chance of Admit, dtype: float64

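Temos, de fato, features altamente correlacionadas com a chance de admissão dos estudantes nas universidades escolhidas. A única exceção é Serial No. (correlação de aproximadamente 0,04), que é apenas um identificador e, por isso, não é usado como feature.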

## Modelo de Regressão Linear Anteriormente Implementado

In [331]:
class Complete_Linear_Regression_Model():
    '''Multivariate linear regression fitted with mini-batch stochastic gradient descent.

    A prediction is ``np.dot(ws, x) + b`` and the loss is the mean squared error.
    '''

    def __init__(self, train_data, features_name: list, label_name: str, ws: list, b: float, alpha: float, random_state: int):
        '''Store the training data and the initial parameters.

        Args:
            train_data: dataframe holding both the feature columns and the label column.
            features_name: names of the feature columns, in the same order as ``ws``.
            label_name: name of the label column.
            ws: initial weights, one per feature. They are copied, never modified in place.
            b: initial bias.
            alpha: learning rate applied to the gradients in each SGD update.
            random_state: seed of the private RandomState used to draw mini-batches.
        '''
        self.train_data = train_data
        self.features = train_data[features_name]
        self.label = train_data[label_name]
        self.label_name = label_name  # kept separately so get_loss can find the label in other dataframes
        # np.array(..., dtype=float) makes a float copy, so the caller's list is left untouched.
        self.ws = np.array(ws, dtype=float)  # weights
        self.b = float(b)
        self.alpha = alpha
        self.rand = np.random.RandomState(random_state)

    def print_parameters(self):
        '''Print every weight (numbered from 1) followed by the bias.'''
        for i, w in enumerate(self.ws, start=1):
            print(f'w{i} = {w}')
        print(f'b = {self.b}')

    def _feature_matrix(self, data):
        '''Return the model's feature columns of `data` as a 2-D float array (rows x features).'''
        return data[self.features.columns].to_numpy(dtype=float)

    def get_single_prediction(self, xs: list):
        '''Get the prediction for a sequence with all the features' values.

        `xs` is converted to a float array on a copy; the argument itself is not modified.
        '''
        return np.dot(self.ws, np.asarray(xs, dtype=float)) + self.b

    def predict(self, data):
        '''Return a 1-D array with one prediction per row of `data`.

        `data` must contain the feature columns the model was built with.
        '''
        return self._feature_matrix(data) @ self.ws + self.b

    def get_loss(self, data):
        '''Return the mean squared error of the model's predictions on `data`.

        `data` must contain the feature columns and the label column.
        '''
        n = len(data)
        y = np.array(data[self.label_name])
        diff = self.predict(data) - y
        return (1 / n) * np.dot(diff, diff)

    def sgd_update_parameters(self, batch_size: int):
        '''Perform one SGD step on a random mini-batch of the training data.

        The batch is drawn with replacement (bootstrap sample) from the training rows;
        the weights and the bias are then moved against the gradient of the batch's
        mean squared error, scaled by `alpha`.
        '''
        n = len(self.label)
        # Passing the int n samples from range(n), exactly like sampling a list of indices.
        batch_indices = self.rand.choice(n, size=batch_size, replace=True)  # bootstrap sample
        x_batch = self.features.iloc[batch_indices].to_numpy(dtype=float)
        y_batch = np.array(self.label.iloc[batch_indices])
        diff_batch = x_batch @ self.ws + self.b - y_batch
        # Gradients of mean((pred - y)^2) with respect to each weight and to the bias.
        partial_w = (2 / batch_size) * (x_batch.T @ diff_batch)
        partial_b = (2 / batch_size) * np.sum(diff_batch)
        self.ws -= self.alpha * partial_w
        self.b -= self.alpha * partial_b

    def sgd(self, iterations: int, batch_size: int, print_loss: bool):  # stochastic gradient descent
        '''Run `iterations` SGD steps, each on a fresh mini-batch of `batch_size` rows.

        When `print_loss` is true, the loss on the full training data is printed after every step.
        '''
        for _ in range(iterations):
            self.sgd_update_parameters(batch_size)
            if print_loss:
                print(f'loss = {self.get_loss(self.train_data)}')

    @staticmethod
    def shuffle_data(data, random_state):
        '''Return `data` with its rows in a random order determined by the seed `random_state`.'''
        rand = np.random.RandomState(random_state)
        return data.reindex(rand.permutation(data.index))

    @staticmethod
    def train_val_test_split(data, test_split: float, val_split: float):
        '''Get train, validation and test dataframes from data.

        The first int(test_split * n) rows form the test set, the next
        int(val_split * n) rows the validation set, and all remaining rows
        the training set. Rows are taken in their current order, so shuffle first.

        Returns:
            The tuple (train_data, val_data, test_data).
        '''
        n = len(data)
        test_size = int(test_split * n)
        val_size = int(val_split * n)
        test_data = data.iloc[np.arange(0, test_size)]
        val_data = data.iloc[np.arange(test_size, test_size + val_size)]
        train_data = data.iloc[np.arange(test_size + val_size, n)]
        return train_data, val_data, test_data

## Treinamento, Validação e Teste

Vamos treinar o modelo no dataset que temos. Para tal, iremos dividir o dataframe 'data' em três partes: training, validation e test. A primeira parte será utilizada para o treinamento do modelo; a segunda, para validar o modelo em um conjunto de dados diferente daquele usado no treinamento e, se necessário, para corrigir os parâmetros do modelo; e a terceira será utilizada como uma forma de teste final para o modelo, evitando o overfitting nos dados de validação.

### Função para separar os dados em treinamento, validação e teste

Tal função está implementada dentro da classe Complete_Linear_Regression_Model, juntamente com a função shuffle_data, utilizada para embaralhar os dados no dataframe.

In [332]:
# Fixed seed: used here for the shuffle and passed again later to the model constructor.
random_state = 0
# Shuffle the rows before splitting, because the split takes rows in their current order.
data_shuffled = Complete_Linear_Regression_Model.shuffle_data(data_scaled, random_state = random_state)

In [333]:
# 20% of the rows go to the test set, 20% to validation, and the remaining rows to training.
train_data, val_data, test_data = Complete_Linear_Regression_Model.train_val_test_split(data_shuffled, test_split = 0.2, val_split = 0.2)

In [334]:
train_data.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
281,282,0.54,0.642857,0.5,0.75,0.875,0.740385,1.0,0.8
133,134,0.66,0.714286,1.0,0.75,0.875,0.634615,0.0,0.79
33,34,1.0,0.785714,1.0,0.75,0.75,0.897436,1.0,0.9
378,379,0.26,0.214286,0.0,0.25,0.375,0.272436,0.0,0.56
162,163,0.56,0.607143,0.5,0.5,0.5,0.544872,0.0,0.67


In [335]:
train_data.shape

(240, 9)

In [336]:
val_data.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
336,337,0.58,0.642857,0.5,0.5,0.375,0.637821,0.0,0.72
64,65,0.7,0.678571,0.5,0.5,0.625,0.608974,0.0,0.52
55,56,0.6,0.392857,0.5,0.5,0.5,0.288462,0.0,0.64
106,107,0.78,0.678571,0.75,0.875,0.875,0.762821,1.0,0.87
300,301,0.38,0.5,0.25,0.375,0.375,0.384615,0.0,0.62


In [337]:
val_data.shape

(80, 9)

In [338]:
test_data.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
132,133,0.38,0.464286,1.0,0.625,0.625,0.564103,0.0,0.71
309,310,0.36,0.642857,0.75,0.625,0.5,0.576923,0.0,0.7
341,342,0.72,0.642857,0.5,0.625,0.625,0.628205,1.0,0.79
196,197,0.32,0.464286,0.25,0.5,0.375,0.467949,0.0,0.73
246,247,0.52,0.464286,0.5,0.5,0.625,0.61859,0.0,0.72


In [339]:
test_data.shape

(80, 9)

### Treinando o modelo

In [340]:
# Initial parameters: one zero weight per feature and a zero bias.
ws = list(np.zeros(len(features)))
b = 0
# Learning rate: the factor the model multiplies each gradient by in an SGD update.
alpha = 0.1

In [341]:
model = Complete_Linear_Regression_Model(train_data = train_data, features_name = features,
                                        label_name = label, ws = ws, b = b, alpha = alpha,
                                        random_state = random_state)

In [342]:
model.get_loss(train_data)

0.5472704166666666

In [343]:
# Train with 100 SGD updates, each on a random mini-batch of 10 training rows, without printing the loss.
model.sgd(iterations = 100, batch_size = 10, print_loss = False)

In [344]:
model.get_loss(train_data)

0.004383113932743791

### Aplicando o modelo nos dados de validação

In [345]:
model.predict(val_data)

array([0.70300941, 0.74718055, 0.63321292, 0.87350268, 0.60160892,
       0.77625128, 0.61935197, 0.72724331, 0.79030282, 0.84149826,
       0.76415246, 0.93123656, 0.58296967, 0.51828254, 0.85411509,
       0.63484656, 0.77190742, 0.59823798, 0.72877566, 0.73720078,
       0.68955423, 0.71427937, 0.80395073, 0.49724794, 0.74434964,
       0.69298895, 0.98097558, 0.61524728, 0.73644651, 0.63462179,
       0.59388607, 0.78174869, 0.70295131, 0.70785395, 0.84301344,
       0.86539436, 0.93176713, 0.78182126, 0.64314142, 0.61609948,
       0.58848724, 0.69151731, 0.89526426, 0.65743089, 0.46288212,
       0.87724759, 0.94373314, 0.67706468, 0.71246377, 0.57078048,
       0.88596918, 0.67706117, 0.76654186, 0.53874916, 0.78494595,
       0.62577697, 0.84294774, 0.71546917, 0.53596079, 0.70201763,
       0.84950451, 0.72671464, 0.59354043, 0.69176346, 0.70034738,
       0.72330833, 0.57522961, 0.78753795, 0.86251808, 0.66283263,
       0.7755266 , 0.96784311, 0.74636214, 0.68609704, 0.92605

In [346]:
model.get_loss(val_data)

0.004627083161208463

As perdas relativas a train_data e a val_data estão muito próximas e suficientemente baixas. Logo, falta somente conferirmos, por meio de test_data, se o modelo não está overfitted.

### Aplicando o modelo nos dados de teste

In [347]:
model.predict(test_data)

array([0.70346951, 0.70113158, 0.78044913, 0.614065  , 0.70126604,
       0.60622974, 0.6919436 , 0.6795312 , 0.87287364, 0.93200079,
       0.51053574, 0.88008498, 0.73572575, 0.48124326, 0.86093245,
       0.58780857, 0.63227829, 0.79449464, 0.61555213, 0.75175424,
       0.90725539, 0.8687526 , 0.6395799 , 0.45455306, 0.80568879,
       0.60312979, 0.51586619, 0.65419442, 0.91463629, 0.68790263,
       0.64107827, 0.74931917, 0.77293924, 0.54184161, 0.78204203,
       0.76393588, 0.71706472, 0.86018388, 0.62771002, 0.92938096,
       0.74224877, 0.67731318, 0.73976953, 0.81768445, 0.84683643,
       0.69919024, 0.58288675, 0.74252138, 0.59141317, 0.59754546,
       0.65296059, 0.77816832, 0.61979498, 0.90403542, 0.73962892,
       0.71599964, 0.78981533, 0.75702768, 0.78501394, 0.85225518,
       0.75512333, 0.43968026, 0.61615326, 0.53622029, 0.85789097,
       0.78456132, 0.70294371, 0.86489588, 0.74641866, 0.72868304,
       0.6280323 , 0.84186953, 0.79621817, 0.63351699, 0.91484

In [348]:
model.get_loss(test_data)

0.0048633136611483056

O resultado está bem coerente e configura uma ótima predição. Assim, conseguimos treinar o modelo, validá-lo e testá-lo com sucesso!

### Parâmetros finais do modelo

In [349]:
model.print_parameters()

w1 = 0.11879119170129392
w2 = 0.12178581623485109
w3 = 0.03466165172927982
w4 = 0.08296755000680148
w5 = 0.12104512820940831
w6 = 0.16274519530202722
w7 = 0.021741583370602384
b = 0.34781088671437677
