## Bibliotecas

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Importando o dataset

In [2]:
# Load the admissions table and rename the two headers that are written with a
# trailing space, so later cells can refer to them by their clean names.
data = pd.read_csv('adm_data.csv').rename(
    columns = {
        'Chance of Admit ': 'Chance of Admit',
        'LOR ': 'LOR',
    }
)

In [3]:
# Preview the first rows to confirm the file loaded and the headers were renamed.
data.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


### Mudando a escala de alguns dados

In [4]:
# Shrink the two exam scores by a constant factor so their magnitude is closer
# to that of the other features.
# NOTE: this overwrites the columns of `data`, so running the cell a second time
# divides them again; re-run the loading cell first if you need to repeat it.
scale_factor = 100
data['GRE Score'] = data['GRE Score'].div(scale_factor)
data['TOEFL Score'] = data['TOEFL Score'].div(scale_factor)

## Modelo de Regressão Linear Anteriormente Implementado

In [5]:
class Complete_Linear_Regression_Model():
    '''Multiple linear regression fitted with mini-batch stochastic gradient descent.

    A prediction is ``np.dot(ws, xs) + b`` and the loss is the mean squared
    error between predictions and the label column.
    '''

    def __init__(self, train_data, features_name: list, label_name: str, ws: list, b: float, alpha: float, random_state: int):
        '''Store the training data and the initial parameters.

        Parameters:
            train_data: dataframe holding both the feature and the label columns.
            features_name: list with the names of the feature columns.
            label_name: name of the label column.
            ws: initial weights, one per feature (any sequence of numbers).
            b: initial bias.
            alpha: step size applied to the gradient in every SGD update.
            random_state: seed of the generator used to draw the SGD batches.

        Raises:
            ValueError: if ws is not one-dimensional or its length differs
                from the number of feature columns.
        '''
        self.train_data = train_data
        self.features = train_data[features_name]
        self.label = train_data[label_name]
        self.label_name = label_name # kept so get_loss can find the label column in any dataframe.
        # Copy into a new float array: the caller's sequence is left untouched
        # and the in-place SGD updates never write into caller-owned data.
        self.ws = np.array(ws, dtype = float) # weights
        if self.ws.ndim != 1 or len(self.ws) != self.features.shape[1]:
            raise ValueError(
                f'expected {self.features.shape[1]} weights (one per feature), got shape {self.ws.shape}'
            )
        self.b = b
        self.alpha = alpha
        self.rand = np.random.RandomState(random_state)

    def print_parameters(self):
        '''Print every weight as w1..wk followed by the bias b.'''
        for i, w in enumerate(self.ws, start = 1):
            print(f'w{i} = {w}')
        print (f'b = {self.b}')

    def get_single_prediction(self, xs: list):
        '''Get the prediction for a list with all the features' values.

        xs may be any sequence of numbers; it is converted to floats without
        being modified.
        '''
        xs = np.asarray(xs, dtype = float)
        pred = np.dot(self.ws, xs) + self.b
        return pred

    def predict(self, data):
        '''Return an array with one prediction per row of data.

        Only the columns named like the training features are read.
        '''
        features_name = self.features.columns
        xs_matrix = data[features_name].to_numpy(dtype = float)
        # One matrix-vector product replaces the per-row loop.
        return xs_matrix @ self.ws + self.b

    def get_loss(self, data):
        '''Return the mean squared error of the model on data.

        Raises ZeroDivisionError when data has no rows.
        '''
        n = len(data)
        y = data[self.label_name].to_numpy(dtype = float)
        diff = self.predict(data) - y
        loss = (1/n)*np.dot(diff, diff)
        return loss

    def sgd_update_parameters(self, batch_size: int):
        '''Perform one gradient step on a batch drawn from the training data.

        batch_size rows are sampled with replacement, the gradient of the mean
        squared error on that batch is computed, and ws and b are moved
        against it by a factor alpha.
        '''
        n = len(self.label)
        random_indices = self.rand.choice(np.arange(n), size = batch_size, replace = True) # bootstrap sample
        xs_sample = self.features.iloc[random_indices].to_numpy(dtype = float) # shape (batch_size, len(self.ws))
        y_sample = self.label.iloc[random_indices].to_numpy(dtype = float)
        diff_sample = xs_sample @ self.ws + self.b - y_sample
        # Gradient of (1/batch_size) * sum(diff^2) with respect to ws and b.
        partial_w = (2/batch_size) * (xs_sample.T @ diff_sample)
        partial_b = (2/batch_size) * np.sum(diff_sample)
        self.ws -= self.alpha * partial_w
        self.b -= self.alpha * partial_b

    def sgd(self, iterations: int, batch_size: int, print_loss: bool): # stochastic gradient descent
        '''Run `iterations` SGD updates with batches of `batch_size` rows.

        When print_loss is true, the loss on the full training data is printed
        after every update.
        '''
        for _ in range(iterations):
            self.sgd_update_parameters(batch_size)
            if print_loss:
                print(f'loss = {self.get_loss(self.train_data)}')

    @staticmethod
    def shuffle_data(data, random_state):
        '''Return data with its rows reordered by a seeded permutation of its index.'''
        rand = np.random.RandomState(random_state)
        return data.reindex(rand.permutation(data.index))

    @staticmethod
    def train_val_test_split(data, test_split: float, val_split: float):
        '''Get train, validation and test dataframes from data.

        The first int(test_split * n) rows form the test set, the next
        int(val_split * n) rows the validation set, and the remaining rows the
        training set. Rows are taken in their current order, so shuffle first.
        '''
        n = len(data)
        test_size = int(test_split * n)
        val_size = int(val_split * n)
        test_data = data.iloc[list(range(0, test_size))]
        val_data = data.iloc[list(range(test_size, test_size + val_size))]
        train_data = data.iloc[list(range(test_size + val_size, n))]
        return train_data, val_data, test_data

In [6]:
# Predictor columns; the model keeps one weight per entry, in this order.
features = [
    'GRE Score',
    'TOEFL Score',
    'University Rating',
    'SOP',
    'LOR',
    'CGPA',
    'Research',
]
# Target column the model is trained to predict.
label = 'Chance of Admit'

## Treinamento, Validação e Teste

Vamos treinar o modelo no dataset que temos. Para tal, iremos dividir o dataframe 'data' em três partes: training, validation e test. A primeira parte será utilizada para o treinamento do modelo; a segunda, para validar o modelo em um conjunto de dados diferente daquele usado no treinamento e, se necessário, para ajustar os hiperparâmetros do modelo (por exemplo, a taxa de aprendizado alpha); e a terceira será utilizada como uma forma de teste final para o modelo, evitando o overfitting nos dados de validação.

### Função para separar os dados em treinamento, validação e teste

Tal função (train_val_test_split) está implementada dentro da classe Complete_Linear_Regression_Model, juntamente com a função shuffle_data, utilizada para embaralhar os dados no dataframe.

In [7]:
# Fixed seed so the shuffle (and therefore the split that follows) is reproducible.
random_state = 0
# Shuffle into a new dataframe; `data` itself keeps its original row order.
data_shuffled = Complete_Linear_Regression_Model.shuffle_data(data, random_state)

In [8]:
# Reserve 20% of the shuffled rows for testing and 20% for validation;
# whatever remains becomes the training set.
train_data, val_data, test_data = Complete_Linear_Regression_Model.train_val_test_split(
    data_shuffled,
    test_split = 0.2,
    val_split = 0.2,
)

In [9]:
# Peek at the training split; the index shows the shuffled row order.
train_data.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
281,282,3.17,1.1,3,4.0,4.5,9.11,1,0.8
133,134,3.23,1.12,5,4.0,4.5,8.78,0,0.79
33,34,3.4,1.14,5,4.0,4.0,9.6,1,0.9
378,379,3.03,0.98,1,2.0,2.5,7.65,0,0.56
162,163,3.18,1.09,3,3.0,3.0,8.5,0,0.67


In [10]:
# Rows left for training once the test and validation slices are taken out.
train_data.shape

(240, 9)

In [11]:
# Peek at the validation split.
val_data.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
336,337,3.19,1.1,3,3.0,2.5,8.79,0,0.72
64,65,3.25,1.11,3,3.0,3.5,8.7,0,0.52
55,56,3.2,1.03,3,3.0,3.0,7.7,0,0.64
106,107,3.29,1.11,4,4.5,4.5,9.18,1,0.87
300,301,3.09,1.06,2,2.5,2.5,8.0,0,0.62


In [12]:
# Validation split size: int(0.2 * number of rows) of the shuffled data.
val_data.shape

(80, 9)

In [13]:
# Peek at the test split (the first rows of the shuffled data).
test_data.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
132,133,3.09,1.05,5,3.5,3.5,8.56,0,0.71
309,310,3.08,1.1,4,3.5,3.0,8.6,0,0.7
341,342,3.26,1.1,3,3.5,3.5,8.76,1,0.79
196,197,3.06,1.05,2,3.0,2.5,8.26,0,0.73
246,247,3.16,1.05,3,3.0,3.5,8.73,0,0.72


In [14]:
# Test split size: int(0.2 * number of rows) of the shuffled data.
test_data.shape

(80, 9)

### Treinando o modelo

In [15]:
# Start training from an all-zero model: one zero weight per feature and a zero bias.
ws = [weight for weight in np.zeros(len(features))]
b = 0
# Step size applied to the gradient in every SGD update.
alpha = 0.001

In [16]:
# Build the model on the training split only; the same seed used for the
# shuffle also seeds the batch sampling done during SGD.
model = Complete_Linear_Regression_Model(
    train_data = train_data,
    features_name = features,
    label_name = label,
    ws = ws,
    b = b,
    alpha = alpha,
    random_state = random_state,
)

In [17]:
# Baseline: mean squared error on the training split before any SGD update.
model.get_loss(train_data)

0.5472704166666666

In [18]:
# Run 100 SGD updates, each on 10 training rows sampled with replacement.
# NOTE: the model is updated in place, so re-running this cell continues training.
model.sgd(iterations = 100, batch_size = 10, print_loss = False)

In [19]:
# Mean squared error on the training split after the SGD updates.
model.get_loss(train_data)

0.006655338756033604

### Aplicando o modelo nos dados de validação

In [20]:
# One predicted admission chance per row of the validation split.
model.predict(val_data)

array([0.69848236, 0.71844728, 0.65561211, 0.82932935, 0.62362938,
       0.75015223, 0.66237513, 0.70417506, 0.7529418 , 0.7703065 ,
       0.75394732, 0.85551349, 0.61670496, 0.56529517, 0.8377201 ,
       0.65883558, 0.77130698, 0.6246004 , 0.68228305, 0.75526107,
       0.69351911, 0.70231146, 0.81701297, 0.55939935, 0.71062241,
       0.698515  , 0.87099434, 0.65961281, 0.73207872, 0.66464811,
       0.64603084, 0.7826906 , 0.72572699, 0.69058351, 0.79415777,
       0.83216588, 0.84973387, 0.74363596, 0.67578382, 0.65258466,
       0.66923501, 0.69268695, 0.85795503, 0.66100885, 0.54125408,
       0.82983629, 0.89060673, 0.68125703, 0.70176249, 0.63996663,
       0.81954537, 0.68602983, 0.72323817, 0.59489181, 0.75977509,
       0.6417404 , 0.81729976, 0.71969132, 0.62118085, 0.69996167,
       0.80918539, 0.72194338, 0.65597997, 0.70932948, 0.6706338 ,
       0.72975376, 0.59482408, 0.76281712, 0.8415524 , 0.73264085,
       0.72303635, 0.89486476, 0.75441014, 0.7139196 , 0.87230

In [21]:
# Mean squared error on rows the model did not see during training.
model.get_loss(val_data)

0.006312462537152382

As perdas relativas à train_data e à val_data estão muito próximas e estão suficientemente baixas. Logo, falta somente conferirmos, por meio de test_data, se o modelo não está overfitted.

### Aplicando o modelo nos dados de teste

In [22]:
# One predicted admission chance per row of the test split.
model.predict(test_data)

array([0.76380663, 0.73232632, 0.73933957, 0.64760832, 0.71802513,
       0.65153022, 0.70390758, 0.70867552, 0.77203237, 0.85366786,
       0.54363197, 0.83376662, 0.71183546, 0.53943011, 0.81362594,
       0.61551608, 0.65177754, 0.73728874, 0.65365137, 0.7621989 ,
       0.8288161 , 0.8326028 , 0.70118814, 0.55284347, 0.78898156,
       0.68069219, 0.60392169, 0.68451472, 0.84170949, 0.69511577,
       0.65537477, 0.72817964, 0.78807539, 0.62245607, 0.7629988 ,
       0.7143587 , 0.6972292 , 0.81929851, 0.68170003, 0.85422289,
       0.72352378, 0.67303   , 0.71059019, 0.8011763 , 0.7851141 ,
       0.68299977, 0.59556461, 0.76732998, 0.61847904, 0.64924206,
       0.67111573, 0.7157179 , 0.6349859 , 0.83539587, 0.76517643,
       0.68280769, 0.72508215, 0.73266848, 0.8014604 , 0.84593161,
       0.76123559, 0.53395226, 0.63484215, 0.6163115 , 0.84026617,
       0.75023236, 0.65241104, 0.8246126 , 0.7279033 , 0.72533882,
       0.66480684, 0.83649297, 0.77248732, 0.64726826, 0.82198

In [23]:
# Final check: mean squared error on the held-out test split.
model.get_loss(test_data)

0.006895500946885735

O resultado está bem coerente e configura uma ótima predição. Assim, conseguimos treinar o modelo, validá-lo e testá-lo com sucesso!

### Parâmetros finais do modelo

In [24]:
# Show the learned weights (w1..w7 follow the order of `features`) and the bias.
model.print_parameters()

w1 = 0.01726967237822581
w2 = 0.006037662879254167
w3 = 0.021926255829840726
w4 = 0.023212108580530855
w5 = 0.02335151638663406
w6 = 0.04981291224155402
w7 = 0.006185149735504153
b = 0.005101297614567093
