# Scikit-learn Sheet

## Simple Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Target column and the single feature column (double brackets keep the
# feature selection two-dimensional). `data` is presumably a pandas
# DataFrame loaded earlier in the notebook.
y = data['dependent_variable']
x = data[['independent_variable']]

# scikit-learn expects 2-D feature input; reshape(-1, 1) yields exactly one
# column with as many rows as there are values.
x_matrix = x.values.reshape(-1, 1)

# fit() returns the fitted estimator, so creation and fitting can be chained.
reg = LinearRegression().fit(x_matrix, y)

# R-squared of the fitted model on the data it was trained on
reg.score(x_matrix, y)

# Coefficient (slope) of the independent variable
reg.coef_

# y-intercept
reg.intercept_

#### Predictions

In [None]:
# Predict targets for new observations with the fitted model.
# The input must be 2-D (n_samples, n_features): pass a DataFrame/array, or
# wrap a single value as [[value]] -- a bare scalar is rejected.
# NOTE(review): prediction_df is not defined in this view; presumably it has
# the same feature columns used for fitting -- confirm.
reg.predict(prediction_df) #returns one prediction per input row

## Multiple Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Two feature columns (already 2-D, so no reshape is needed) and the target.
x = data[['independent_variable_1','independent_variable_2']]
y = data['dependent_variable']

# Fit on the multi-feature frame `x` defined above. The earlier single-feature
# `x_matrix` must not be reused here: it would silently refit the simple
# regression instead of the multiple regression.
reg = LinearRegression()
reg.fit(x,y)

#R-squared value of the fitted model on the training data
reg.score(x,y)

#Coefficients of independent Variables
reg.coef_ #output is array, one coefficient per feature column

#y-intercept
reg.intercept_

#### Adjusted R2

Scikit-learn does not have a built-in function to calculate the adjusted R².

$R^2_{adj.} = 1 - (1-R^2) \cdot \frac{n-1}{n-p-1}$

Therefore, we use the function below, where $n$ is the number of observations and $p$ is the number of independent variables.

In [None]:
def adjusted_r2(independent_variables, dependent_variable, model=None):
    """Return the adjusted R-squared of a fitted regression model.

    Computes ``1 - (1 - R^2) * (n - 1) / (n - p - 1)``, where ``R^2`` is the
    model's score on the given data, ``n`` is ``shape[0]`` of the feature
    data and ``p`` is ``shape[1]``.

    Args:
        independent_variables: 2-D feature data exposing ``.shape``
            (e.g. a DataFrame or NumPy array).
        dependent_variable: Target values, passed through to ``score``.
        model: Fitted estimator exposing ``score(X, y)``. When omitted, the
            notebook-level ``reg`` is used, as in the original version.

    Returns:
        The adjusted R-squared value.

    Raises:
        ValueError: If ``n - p - 1 <= 0``; the statistic is undefined
            without positive residual degrees of freedom.
    """
    # Resolve the global lazily so an explicit model never requires `reg`.
    fitted_model = reg if model is None else model

    n = independent_variables.shape[0]
    p = independent_variables.shape[1]
    degrees_of_freedom = n - p - 1
    if degrees_of_freedom <= 0:
        raise ValueError(
            "adjusted R-squared is undefined when n - p - 1 <= 0 "
            f"(n={n}, p={p})"
        )

    r2 = fitted_model.score(independent_variables, dependent_variable)
    return 1 - (1 - r2) * (n - 1) / degrees_of_freedom

## Feature Selection - F-regression

In [None]:
from sklearn.feature_selection import f_regression

# f_regression returns a pair; index 1 holds the p-values (one per feature).
p_values = f_regression(x, y)[1]
# Round to three decimals for readability.
p_values = p_values.round(3)

## Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn each feature's mean and standard deviation from x and apply the
# scaling in a single call; `scaler` stays fitted for later reuse.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

## Remember to scale prediction data with the same fitted scaler (scaler.transform) before predicting

## Train - Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Target column and the two feature columns.
y = data['dependent_variable']
x = data[['independent_variable_1', 'independent_variable_2']]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=365,
)