## Bibliotecas

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Importando o dataset

In [10]:
data = pd.read_csv('adm_data.csv')
data.rename(columns = {'Chance of Admit ': 'Chance of Admit'}, inplace = True)
data.rename(columns = {'LOR ': 'LOR'}, inplace = True)

In [11]:
data.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [12]:
features = ['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR', 'CGPA', 'Research']
label = 'Chance of Admit'

### Correlação dos dados com a chance de admissão

In [13]:
data.corr()['Chance of Admit']

Serial No.           0.042336
GRE Score            0.802610
TOEFL Score          0.791594
University Rating    0.711250
SOP                  0.675732
LOR                  0.669889
CGPA                 0.873289
Research             0.553202
Chance of Admit      1.000000
Name: Chance of Admit, dtype: float64

Temos, de fato, dados altamente correlacionados com a chance de admissão dos estudantes nas universidades escolhidas.

## Modelo de Regressão Linear Anteriormente Implementado

In [14]:
class Linear_Regression_Model():
    def __init__(self, train_data, features_name: list, label_name: str, ws: list, b: float, alpha: float, random_state: int):
        self.train_data = train_data
        self.features = train_data[features_name]
        self.label = train_data[label_name]
        self.label_name = label_name # saving here because label as a series doesn't save it.
        for i in range(0, len(ws)):
            ws[i] = float(ws[i])
        self.ws = np.array(ws) # weights
        self.b = b
        self.alpha = alpha
        self.rand = np.random.RandomState(random_state)
        
    def print_parameters(self):
        for i in range(1, len(self.ws) + 1):
            print(f'w{i} = {self.ws[i - 1]}')
        print (f'b = {self.b}')
        
    def get_single_prediction(self, xs: list):
        '''Get the prediction for a list with all the features' values.'''
        for i in range(0, len(xs)):
            xs[i] = float(xs[i])
        xs = np.array(xs)
        pred = np.dot(self.ws, xs) + self.b
        return pred
    
    def predict(self, data):
        n = len(data)
        features_name = self.features.columns
        data_features = data[features_name]
        predictions = np.zeros(n)
        for i in range(0, n):
            xs = list(data_features.iloc[i])
            predictions[i] += self.get_single_prediction(xs)
        return predictions
    
    def get_loss(self, data):
        n = len(data)
        data_label = data[self.label_name]
        y = np.array(data_label)
        predictions = self.predict(data)
        diff = predictions - y
        loss = (1/n)*np.dot(diff, diff)
        return loss
               
    def sgd_update_parameters(self, batch_size: int):
        n = len(self.label)
        index_list = list(range(0, n))
        random_indices = self.rand.choice(index_list, size = batch_size, replace = True) # bootstrap sample
        xs_sample = list()
        y_sample = np.array(self.label.iloc[random_indices])
        preds_sample = np.zeros(batch_size)
        for i in range(0, batch_size):
            xs = list(self.features.iloc[random_indices[i]])
            preds_sample[i] += self.get_single_prediction(xs)
        for col in self.features:
            xs_sample.append(np.array(self.features[col].iloc[random_indices])) # len(xs_sample) = len(self.ws)
        diff_sample = preds_sample - y_sample
        partial_w = np.zeros(len(self.ws))
        for i in range(0, len(self.ws)):
            partial_w[i] += (2/batch_size) * np.dot(diff_sample, xs_sample[i])
        partial_b = (2/batch_size) * np.sum(diff_sample)
        self.ws -= self.alpha * partial_w
        self.b -= self.alpha * partial_b
        
    def sgd(self, iterations: int, batch_size: float, print_loss: bool): # stochastic gradient descent
        for i in range(0, iterations):
            self.sgd_update_parameters(batch_size)
            if print_loss:
                print(f'loss = {self.get_loss(self.train_data)}')
    
    @staticmethod
    def shuffle_data(data, random_state):
        rand = np.random.RandomState(random_state)
        return data.reindex(rand.permutation(data.index))
    
    @staticmethod
    def train_val_test_split(data, test_split: float, val_split: float):
        '''Get train, validation and test dataframes from data.'''
        n = len(data)
        test_size = int(test_split * n)
        val_size = int(val_split * n)
        test_data = data.iloc[list(range(0, test_size))]
        val_data = data.iloc[list(range(test_size, test_size + val_size))]
        train_data = data.iloc[list(range(test_size + val_size, n))]
        return train_data, val_data, test_data

## Treinamento, Validação e Teste

Vamos treinar o modelo no dataset que temos. Para tal, iremos dividir o dataframe 'data' em três partes: training, validation e test. A primeira parte será utilizada para o treinamento do modelo; a segunda, para validar o modelo em um conjunto de dados diferentes daqueles usados no treinamento e, se necessário, para corrigir os parâmetros do modelo; e a terceira será utilizada como uma forma de teste final para o modelo, evitando o overfitting nos dados de validação.

### Função para separar os dados em treinamento, validação e teste

Tal função está implementada dentro da classe Linear_Regression_Model, juntamente da função shuffle_data, utilizada para embaralhar os dados no dataframe.

#### Shuffling

In [15]:
random_state = 0
data_shuffled = Linear_Regression_Model.shuffle_data(data, random_state = random_state)

#### Split

In [16]:
train_data, val_data, test_data = Linear_Regression_Model.train_val_test_split(data_shuffled, test_split = 0.2, val_split = 0.2)

In [17]:
train_data.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
281,282,317,110,3,4.0,4.5,9.11,1,0.8
133,134,323,112,5,4.0,4.5,8.78,0,0.79
33,34,340,114,5,4.0,4.0,9.6,1,0.9
378,379,303,98,1,2.0,2.5,7.65,0,0.56
162,163,318,109,3,3.0,3.0,8.5,0,0.67


In [18]:
train_data.shape

(240, 9)

In [19]:
val_data.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
336,337,319,110,3,3.0,2.5,8.79,0,0.72
64,65,325,111,3,3.0,3.5,8.7,0,0.52
55,56,320,103,3,3.0,3.0,7.7,0,0.64
106,107,329,111,4,4.5,4.5,9.18,1,0.87
300,301,309,106,2,2.5,2.5,8.0,0,0.62


In [20]:
val_data.shape

(80, 9)

In [21]:
test_data.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
132,133,309,105,5,3.5,3.5,8.56,0,0.71
309,310,308,110,4,3.5,3.0,8.6,0,0.7
341,342,326,110,3,3.5,3.5,8.76,1,0.79
196,197,306,105,2,3.0,2.5,8.26,0,0.73
246,247,316,105,3,3.0,3.5,8.73,0,0.72


In [22]:
test_data.shape

(80, 9)

# Pre-processing

#### Obs: o pré-processamento deve vir depois do processo de split, para evitar adicionar viés aos dados de teste por influência dos dados de treino.

In [23]:
def scale(feature):
    minimum = min(feature)
    maximum = max(feature)
    return feature.apply(lambda x: (x-minimum)/(maximum - minimum))

In [24]:
train_data_scaled = train_data.copy()
val_data_scaled = val_data.copy()
test_data_scaled = test_data.copy()
for feature in features:
    train_data_scaled[feature] = scale(train_data_scaled[feature])
    val_data_scaled[feature] = scale(val_data_scaled[feature])
    test_data_scaled[feature] = scale(test_data_scaled[feature])

### Mudando a escala dos dados para [0, 1]

Transformaremos [min, max] -> [0, 1] para cada coluna.

### Treinando o modelo

In [25]:
ws = list(np.zeros(len(features)))
b = 0
alpha = 0.1

In [26]:
model = Linear_Regression_Model(train_data = train_data_scaled, features_name = features,
                                        label_name = label, ws = ws, b = b, alpha = alpha,
                                        random_state = random_state)

In [27]:
model.get_loss(train_data_scaled)

0.5472704166666666

In [28]:
model.sgd(iterations = 100, batch_size = 10, print_loss = False)

In [29]:
model.get_loss(train_data_scaled)

0.004383113932743791

### Aplicando o modelo nos dados de validação

In [30]:
model.predict(val_data_scaled)

array([0.67223537, 0.71994552, 0.57703449, 0.8642598 , 0.55038929,
       0.75281217, 0.57208754, 0.69288086, 0.77013777, 0.83108974,
       0.73663218, 0.93538463, 0.5171173 , 0.45747614, 0.84232635,
       0.59042425, 0.74776962, 0.54636683, 0.6974251 , 0.70894246,
       0.64887908, 0.68022027, 0.78746653, 0.43330866, 0.71731504,
       0.65519802, 0.99003529, 0.56223016, 0.70666062, 0.59016448,
       0.54063628, 0.75822032, 0.67283915, 0.67120572, 0.82993046,
       0.85863443, 0.93464496, 0.75098249, 0.59998268, 0.56886454,
       0.53688601, 0.65974833, 0.88973079, 0.61666858, 0.39240887,
       0.87097884, 0.94901067, 0.63487231, 0.67318302, 0.51221046,
       0.88072423, 0.64219888, 0.74613155, 0.47881372, 0.76195412,
       0.57807943, 0.82870671, 0.68681615, 0.47580917, 0.65740421,
       0.83483313, 0.69463313, 0.53937576, 0.65337084, 0.66471428,
       0.69164067, 0.5139846 , 0.76414708, 0.85207456, 0.62851344,
       0.74873399, 0.97521924, 0.72596442, 0.64964309, 0.92647

In [31]:
model.get_loss(val_data_scaled)

0.006495367263401106

As perdas relativas à train_data e à val_data estão muito próximas e estão suficientemente baixas. Logo, falta somente conferirmos se o modelo não está overfitted por meio test_data.

### Aplicando o modelo nos dados de teste

In [32]:
model.predict(test_data_scaled)

array([0.68117038, 0.67957134, 0.76934072, 0.58186074, 0.68384716,
       0.57209048, 0.67254037, 0.65262099, 0.87710494, 0.94279276,
       0.46715719, 0.8829627 , 0.71960005, 0.42692906, 0.86106377,
       0.55515542, 0.59929645, 0.78818985, 0.57960767, 0.73811567,
       0.91228629, 0.87185809, 0.61166633, 0.39778826, 0.79484886,
       0.56547669, 0.46197687, 0.62519666, 0.91971258, 0.66298149,
       0.60967236, 0.73244591, 0.75620049, 0.4997315 , 0.77411228,
       0.75141912, 0.69052982, 0.86121499, 0.59657911, 0.94018597,
       0.72366849, 0.65538283, 0.7200752 , 0.81037571, 0.84870513,
       0.67510686, 0.54422428, 0.72358726, 0.55788679, 0.56348682,
       0.62955809, 0.77085326, 0.58750323, 0.90825927, 0.71980282,
       0.69889535, 0.77058978, 0.74013687, 0.77270239, 0.84723725,
       0.74667543, 0.38451819, 0.58689009, 0.49054155, 0.85567599,
       0.77771236, 0.68529016, 0.86444453, 0.73626412, 0.7121918 ,
       0.59534061, 0.83572709, 0.79122645, 0.59660675, 0.92635

In [33]:
model.get_loss(test_data_scaled)

0.005554168252244701

O resultado está bem coerente e configura uma possível predição. Assim, conseguimos treinar o modelo, validá-lo e testá-lo com sucesso!

### Parâmetros finais do modelo

In [34]:
model.print_parameters()

w1 = 0.11879119170129392
w2 = 0.12178581623485109
w3 = 0.03466165172927982
w4 = 0.08296755000680148
w5 = 0.12104512820940831
w6 = 0.16274519530202722
w7 = 0.021741583370602384
b = 0.34781088671437677
