
# Simple linear regression 

## Import the relevant libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
sns.set()

## Load the data

In [2]:
data= pd.read_csv('Multiple linear regression.csv')

In [3]:
data.head()

Unnamed: 0,SAT,"Rand 1,2,3",GPA
0,1714,1,2.4
1,1664,3,2.52
2,1760,3,2.54
3,1685,3,2.74
4,1693,2,2.83


In [4]:
# A statistical description of the data
data.describe()

Unnamed: 0,SAT,"Rand 1,2,3",GPA
count,84.0,84.0,84.0
mean,1845.27381,2.059524,3.330238
std,104.530661,0.855192,0.271617
min,1634.0,1.0,2.4
25%,1772.0,1.0,3.19
50%,1846.0,2.0,3.38
75%,1934.0,3.0,3.5025
max,2050.0,3.0,3.81


## Create the multiple linear regression

### Declare the dependent and the independent variables

In [9]:
# define dependent and independent variable
x= data[['SAT','Rand 1,2,3']]
y = data['GPA']

## Regression itself

In [12]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()

In [13]:
reg.fit(x,y)

LinearRegression()

Standardization: the process of subtracting the mean and dividing by the standard deviation(a type of normalization)
Normalization: has different meaning depending in the case; here - we subtract the mean but divide the L2-norm of the inputs
copy_x: if true;it copies the input before fitting them. It is a safety net against normalization and other transformation
n_jobs: is a parameter used when we want to parallelize routine

### Coefficients

In [14]:
reg.coef_

array([ 0.00165354, -0.00826982])

### Intercept

In [15]:
# A single regression always has a single intercept 
reg.intercept_

0.29603261264909486

### Calculating R-squared

In [16]:
# returns the R-squared of a linear regression
reg.score(x,y)

0.40668119528142843

We need a function/method that is not included in a package?
1. Google
2. Use our mad-math skills

### Formula for Adjusted R^2

$R^2_{adj.} = 1 - (1-R^2)*\frac{n-1}{n-p-1}$

n=84(the number of observations)


p=2(the number of predictors)

In [17]:
x.shape

(84, 2)

In [18]:
r2 = reg.score(x,y)
n = x.shape[0]
p = x.shape[1]

adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
adjusted_r2

0.39203134825134023

Adj. R^2<R^2, therefore one or more of the predictors have little or no explantory power

## Feature Selection with F-regression through p-values*

Feature selection simplifies models, improves speed and prevents a series of unwanted issues arising from having too many features

In [20]:
from sklearn.feature_selection import f_regression

In [21]:
f_regression(x,y)

(array([56.04804786,  0.17558437]), array([7.19951844e-11, 6.76291372e-01]))

In [22]:
p_values = f_regression(x,y)[1]
p_values

array([7.19951844e-11, 6.76291372e-01])

In [23]:
p_values.round(3)

array([0.   , 0.676])

Note: these are univariate p-values reached from simple linear models.
They do not reflext the interconnection of the features in our multiple linear regression

### Creating a summary table

In [24]:
reg_summary = pd.DataFrame(data=['SAT', 'Rand 1,2,3'], columns=['Features'])
reg_summary

Unnamed: 0,Features
0,SAT
1,"Rand 1,2,3"


In [25]:
reg_summary['Coefficients']=reg.coef_
reg_summary['p-values'] = p_values.round(3)

In [26]:
reg_summary

Unnamed: 0,Features,Coefficients,p-values
0,SAT,0.001654,0.0
1,"Rand 1,2,3",-0.00827,0.676


P-value are one of the best ways to determine if a variable is redundant 
but they provide no infrmation whasoever about How USEFUL a variable

## Feature selection through Standardization

Import the relevant libraries

In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.linear_model import LinearRegression

Load the data

In [31]:
data = pd.read_csv('Multiple linear regression.csv')
data.head()

Unnamed: 0,SAT,"Rand 1,2,3",GPA
0,1714,1,2.4
1,1664,3,2.52
2,1760,3,2.54
3,1685,3,2.74
4,1693,2,2.83


In [32]:
data.describe()

Unnamed: 0,SAT,"Rand 1,2,3",GPA
count,84.0,84.0,84.0
mean,1845.27381,2.059524,3.330238
std,104.530661,0.855192,0.271617
min,1634.0,1.0,2.4
25%,1772.0,1.0,3.19
50%,1846.0,2.0,3.38
75%,1934.0,3.0,3.5025
max,2050.0,3.0,3.81


### Create the multiple linear regression

Declare the dependent and independent variables

In [34]:
x = data[['SAT', 'Rand 1,2,3']]
y = data['GPA']

Standardization

In [35]:
from sklearn.preprocessing import StandardScaler

In [36]:
# we have created an empty StandardScaler object
# Scaler will be used to subtract the mean and divide by standard deviation
scaler = StandardScaler()

In [37]:
#fit calculates and stores the mean and standard deviation of each feature
scaler.fit(x)

StandardScaler()

In [38]:
x_scaled = scaler.transform(x)

In [39]:
x_scaled = scaler.transform(x)

# Regression with scaled features

In [40]:
reg = LinearRegression()
reg.fit(x_scaled,y)

LinearRegression()

In [41]:
reg.intercept_

3.330238095238095

# Creating a summary table

In [44]:
reg_summary = pd.DataFrame([['Intercept'],['SAT'],['Rand 1,2,3']], columns=['Features'])
reg_summary['weights'] = [reg.intercept_, reg.intercept_,reg.coef_[0], reg.coef_[1]]

ValueError: Length of values (4) does not match length of index (3)

In [None]:
# The bigger the weight the bigger the impact
# The 'ML word' for intercept is bias
# The closer a weight is to 0, the smaller its impact
# The bigger the weight the bigger 
reg_summary

In [None]:
# Making predictions with the standardized coefficient

In [None]:
new_data = pd.DataFrame(data = [[1700,2],[1800,1]], columns=['SAT','Rand 1,2,3'])
new_data

In [None]:
# THis prediction looks nothing like a GPA score
reg.predict(new_data)

THe new data frame should be arranged in the same way...
and also must be standardized in the same way

In [None]:
new_data_scaled = scaler.transform(new_data)
new_data_scaled

In [None]:
# THat looks more like a GPA score
reg.predict(new_data_scaled)

#### What if we remove the 'Random 1,2,3' variable?
Theory suggests that nothing should chage right?

In [None]:
# create regression
reg_simple = LinearRegression()
# decleare input
x_simple_matrix = x_scaled[:,0].reshape(-1,1)
#fit the regression with input
reg.simple.fit(x_simple_matrix,y)

In [None]:
# It's crutial We only feed the SAT score because this regression was trained only on SAT
reg_simple.predict(new_data_scaled[:,0].reshape(-1,1))

# Training and testing*


### Train Test Split
We TRAIN the model on the training dataset
but then TEST it on the testing dataset

#### Import the relevant libraries

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

#### Generate some data we are going to split                                               

In [None]:
# returns evenly spaced values withing a given interval. By default the output is an ndarray
a = np.arange(1,101)

In [None]:
a

In [None]:
b = np.arange(501,601)
b

#### Split the data

In [None]:
# train_test_split(x) splits arrays or matrices into random train and test subsets
train_test_split(a)

In [None]:
a_train, a_test, b_train, b_test = train_test_split(a, b, test_size =0.2, random_state=365)

In [None]:
#### Explore the result 

In [None]:
a_train.shape, a_test.shape

In [None]:
a_train

In [None]:
a_test

In [None]:
b_train.shape, b_test.shape

In [None]:
b_train

In [None]:
b_test