## Regressão Linear

In [1]:
# IMPORTS #

import requests
import pandas as pd
import numpy as np

# plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# machine learning libraries and functions
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# validação
from sklearn.model_selection import cross_val_score, cross_val_predict
import sklearn.metrics

In [2]:
url="https://raw.githubusercontent.com/Pedro-Magalhaes/Teste/master/T2/dataset1.csv"
df=pd.read_csv(url)
all_predictors = [ f'V{x}' for x in range(1,14) ] # [V1,V2...V13]
target = 'target'


### Usando lasso para avaliar as features
Fomos reduzindo o alpha para avaliar qual feature era usada pelo algoritmo

In [3]:
dfTrain, dfTest = train_test_split(df, test_size=0.1)

predictors =  all_predictors

X_train = np.array(dfTrain[predictors])
y_train = np.array(dfTrain[target])

X_test = np.array(dfTest[predictors])
y_test = np.array(dfTest[target])

# Create linear regression object
regr = sk.linear_model.Lasso(alpha=0.01,normalize=True)

# Train the model using the training sets
regr.fit(X_train, y_train)

# Make predictions using the testing set
y_pred = regr.predict(X_test)

print('Predictors:       ', predictors)

# The coefficients
print('Coefficients:     ', regr.coef_)
print('Intercept:        ', regr.intercept_)

# The mean squared error (MSE)
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))

# Explained variance score: 1 is perfect prediction
print('Variance score:     %.2f' % r2_score(y_test, y_pred))

Predictors:        ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13']
Coefficients:      [ 1.91086557e-02  1.65957978e+00  3.95742297e+00 -8.75268098e-01
 -0.00000000e+00  6.69130073e-03 -4.19749334e-02 -2.41860302e-02
 -9.66749088e+00 -0.00000000e+00  0.00000000e+00 -8.66160144e-01
 -5.15340388e-01]
Intercept:         26.58824112487529
Mean squared error: 20.73
Variance score:     0.74


In [4]:
predictors = all_predictors

X = df[predictors]
y = df[target]
N_FOLDS = 4

scores = cross_val_score(regr, X, y, cv = N_FOLDS)
predicted = cross_val_predict(regr, X, y, cv = N_FOLDS)
sklearn.metrics.r2_score(y, predicted) 

print ('Cross-validated scores:', scores)

Cross-validated scores: [ 0.63149698  0.56681358  0.32056878 -1.19235166]


### Linear regression

In [5]:
dfTrain, dfTest = train_test_split(df, test_size=0.1)

predictors =  all_predictors

X_train = np.array(dfTrain[predictors])
y_train = np.array(dfTrain[target])

X_test = np.array(dfTest[predictors])
y_test = np.array(dfTest[target])

# Create linear regression object
regr = sk.linear_model.LinearRegression(normalize=True)

# Train the model using the training sets
regr.fit(X_train, y_train)

# Make predictions using the testing set
y_pred = regr.predict(X_test)

print('Predictors:       ', predictors)

# The coefficients
print('Coefficients:     ', regr.coef_)
print('Intercept:        ', regr.intercept_)

# The mean squared error (MSE)
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))

# Explained variance score: 1 is perfect prediction
print('Variance score:     %.2f' % r2_score(y_test, y_pred))

print('Ajusted R^2')
SS_Residual = sum((y_test - y_pred)**2)
SS_Total = sum((y_test - np.mean(y_test))**2)
r_squared = 1 - (float(SS_Residual)) / SS_Total
adjusted_r_squared = 1 - (1 - r_squared) * (len(y_test)-1) / (len(y_test)-X_test.shape[1]-1)
print (r_squared, adjusted_r_squared)

Predictors:        ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13']
Coefficients:      [ 4.44385764e-02  3.01593135e+00  4.15613213e+00 -1.47734890e+00
 -1.21587270e-02  8.12158008e-03 -1.00392289e-01  3.64284811e-02
 -1.54826754e+01 -7.09720692e-03  2.69618413e-01 -9.31262875e-01
 -5.23914775e-01]
Intercept:         33.51819789274887
Mean squared error: 33.35
Variance score:     0.58
Ajusted R^2
0.5832798297867638 0.43686463484697824


In [6]:
predictors = all_predictors

X = df[predictors]
y = df[target]
N_FOLDS = 4

scores = cross_val_score(regr, X, y, cv = N_FOLDS)
predicted = cross_val_predict(regr, X, y, cv = N_FOLDS)
sklearn.metrics.r2_score(y, predicted) 

print ('Cross-validated scores:', scores)

Cross-validated scores: [ 0.60217169  0.60398145  0.35873597 -1.10867706]


### Ridge regression

In [7]:
dfTrain, dfTest = train_test_split(df, test_size=0.1)

predictors =  all_predictors

X_train = np.array(dfTrain[predictors])
y_train = np.array(dfTrain[target])

X_test = np.array(dfTest[predictors])
y_test = np.array(dfTest[target])

# Create linear regression object
regr = sk.linear_model.Ridge(normalize=True, alpha=0.8)

# Train the model using the training sets
regr.fit(X_train, y_train)

# Make predictions using the testing set
y_pred = regr.predict(X_test)

print('Predictors:       ', predictors)

# The coefficients
print('Coefficients:     ', regr.coef_)
print('Intercept:        ', regr.intercept_)

# The mean squared error (MSE)
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))

# Explained variance score: 1 is perfect prediction
print('Variance score:     %.2f' % r2_score(y_test, y_pred))

Predictors:        ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13']
Coefficients:      [ 1.90687966e-02  2.45729774e+00  3.17002786e+00 -3.18600751e-01
 -2.68799780e-03  7.48371790e-03 -6.15067880e-02 -7.11300236e-02
 -4.38543883e+00 -7.79167678e-03  3.87086021e-03 -5.68037851e-01
 -2.81726818e-01]
Intercept:         19.879777794472982
Mean squared error: 20.17
Variance score:     0.70


In [8]:
predictors = all_predictors

X = df[predictors]
y = df[target]
N_FOLDS = 4

scores = cross_val_score(regr, X, y, cv = N_FOLDS)
predicted = cross_val_predict(regr, X, y, cv = N_FOLDS)
sklearn.metrics.r2_score(y, predicted) 

print ('Cross-validated scores:', scores)

Cross-validated scores: [ 0.56542862  0.50064298  0.28947706 -0.53952985]
