# Lesson 1.6

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# x values: the integers 0 through 9
x = np.arange(10)
x

In [None]:
# Three lines with the same slope (1): only the y-intercept changes,
# which controls the height of the line.
y, y2, y3 = (intercept + 1 * x for intercept in (0, 3, -1))

for line in (y, y2, y3):
    plt.plot(x, line)
plt.show()

In [None]:
# Three lines through the origin: only the gradient (aka coefficient) changes,
# which controls the slope of the line.
y, y2, y3 = (0 + gradient * x for gradient in (1, 2, -3))

for line in (y, y2, y3):
    plt.plot(x, line)
plt.show()

In [None]:
#The whole goal of Linear Regression is to find the "best" intercept and slope to "fit" our data!


In [None]:
# apply linear regression on the following data

In [None]:
# Load the dataset from the working directory and preview the first five rows
reg_data = pd.read_csv(filepath_or_buffer='regression_data.csv')
reg_data.head(n=5)

In [None]:
# Summary statistics, transposed so that each variable is a row
reg_data.describe().transpose()

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# AVGGIFT is our first pick for the single most explanatory variable;
# plot it against the target to eyeball the relationship
sns.scatterplot(data=reg_data, x='AVGGIFT', y='TARGET_D')
plt.show()

In [None]:
# Pearson correlation (pandas' default method) between the target and AVGGIFT
reg_data['TARGET_D'].corr(other=reg_data['AVGGIFT'], method='pearson')

In [None]:
# Features are selected with a list of columns, target with a single label
X = reg_data.loc[:, ['AVGGIFT']]
y = reg_data.loc[:, 'TARGET_D']

In [None]:
# inspect which container type holds the features in X
type(X)

In [None]:
# To perform the linear regression we'll use the sklearn implementation of linear regression.
# We will use sklearn a lot.

# We first create the model. This just tells Python to be ready to use a linear model; it does not actually compute anything yet.

lm = linear_model.LinearRegression()

In [None]:
%%time
# "Fitting" (a.k.a. "training") the model on our data: linear regression picks
# the line (i.e. the intercept and the gradient) that best "fits" the data.
# We will get to the meaning of "fitting the data" in a second.
lm.fit(X, y)

# coef_ is an array because later we will see the gradient can have more than one dimension
for fitted_param in (lm.intercept_, lm.coef_):
    print(fitted_param)

In [None]:
# Visualize the result: fitted line (orange) drawn together with the raw data points
avg_gift = reg_data['AVGGIFT']
regression_line = lm.intercept_ + lm.coef_[0] * avg_gift
plt.plot(avg_gift, regression_line, c='orange')
sns.scatterplot(data=reg_data, x='AVGGIFT', y='TARGET_D')
plt.show()

In [None]:
# We can use this model to predict new or unseen datapoints.
# The model was fitted on a DataFrame with an 'AVGGIFT' column, so new inputs are
# passed as a DataFrame with that same column name; a bare nested list makes
# scikit-learn warn that X does not have valid feature names.
# Each returned value is what the fitted line gives for the corresponding AVGGIFT.
new_gifts = pd.DataFrame({'AVGGIFT': [10, 20, 35, 55]})
lm.predict(new_gifts)

In [None]:
# The score is not great, but we have not performed any data preparation yet.
print("R2-score is ", lm.score(X,y))

y_pred = lm.predict(X)
# sklearn metrics take (y_true, y_pred) in that order; MSE happens to be symmetric,
# but other metrics (e.g. r2_score) are not, so keep the documented order.
mse = mean_squared_error(y, y_pred)
print("mean squared error (MSE) is ", mse)
# root mean squared error (RMSE), shown as the cell output
np.sqrt(mse)

In [None]:
# Any other line would have a worse result: compare the fitted line (orange)
# with one using a different slope (red) and one using a different intercept (green).
avg_gift = reg_data['AVGGIFT']
fitted_slope = lm.coef_[0]

regression_line = lm.intercept_ + fitted_slope * avg_gift
regression_line_2 = lm.intercept_ + 0.8 * avg_gift
regression_line_3 = 10 + fitted_slope * avg_gift

for candidate, colour in (
    (regression_line, 'orange'),
    (regression_line_2, 'red'),
    (regression_line_3, 'green'),
):
    plt.plot(avg_gift, candidate, c=colour)

sns.scatterplot(data=reg_data, x='AVGGIFT', y='TARGET_D')
plt.show()

In [None]:
# MSE of each candidate line against the true target values.
# sklearn metrics take (y_true, y_pred) in that order, so the target goes first.
for label, candidate in (
    ("MSE fitted line is ", regression_line),
    ("MSE line 2 is      ", regression_line_2),
    ("MSE line 3 is      ", regression_line_3),
):
    print(label, round(mean_squared_error(y, candidate),2))

In [None]:
%%time
# Let's bring in more variables: every column except the target becomes a feature.
y = reg_data['TARGET_D']
X2 = reg_data.drop(columns=['TARGET_D'])
lm2 = linear_model.LinearRegression()
lm2.fit(X2, y)
print(lm2.score(X2, y))
y_pred = lm2.predict(X2)
# sklearn metrics take (y_true, y_pred) in that order
print(mean_squared_error(y, y_pred))
# Results are better, but now we cannot visualize them.

In [None]:
# Intercept first, then the array of coefficients, one per line
print(lm2.intercept_, lm2.coef_, sep='\n')

# back to presentation-->

# Checking assumptions

In [None]:
# THIS IS PROBABLY THE MOST IMPORTANT LINE IN THIS NOTEBOOK
# The best variables are those that have a high correlation with the target (the variable you want to predict, always the y) but a low correlation with each other.

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap
correlations_matrix = reg_data.corr()
sns.heatmap(data=correlations_matrix, annot=True)
plt.show()

In [None]:
# We can see that there is a very strong positive correlation between IC1 and IC2, IC2 and IC3, IC3 and IC4.
# Using the concept of multicollinearity, let's drop IC1, IC3 and IC4 and keep IC2, as it has the highest correlation with the target.

In [None]:
# Multicollinearity: of the strongly inter-correlated columns IC1-IC4,
# keep only IC2 and drop the other three (IC1, IC3 and IC4).
reduced_data = reg_data.drop(columns=['IC1', 'IC3', 'IC4'])
reduced_data.columns

In [None]:
from sklearn import linear_model

# Refit on the reduced feature set and report R2 and MSE on the same data used for fitting
y = reduced_data['TARGET_D']
X3 = reduced_data.drop(columns=['TARGET_D'])
lm3 = linear_model.LinearRegression()
lm3.fit(X3, y)
print(lm3.score(X3, y))
y_pred = lm3.predict(X3)
# sklearn metrics take (y_true, y_pred) in that order
print(mean_squared_error(y, y_pred))