# Jupyter Notebook for Linear Regression

A short example of how to perform linear regression in Python with sklearn, using the Salaries data set and dummy variables.

We show how to create the dummy variables using either pandas or sklearn.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import math
import seaborn as sn
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Load the salaries dataset.
# The path is relative to this notebook: put Salaries.csv in ../Dataset/
# (or adjust the path below) before running this cell.
data = pd.read_csv('../Dataset/Salaries.csv')
data.head()

In [None]:
# Remove the 'Unnamed: 0' column, which is not used in the analysis.
data = data.drop(columns=['Unnamed: 0'])
data.head()

In [None]:
# Explore the categorical columns: print the distinct levels of each one,
# one array per line.
print(
    pd.unique(data['rank']),
    pd.unique(data['discipline']),
    pd.unique(data['sex']),
    sep='\n',
)

In [None]:
# Separate the target from the features.
# y is selected with a list so it stays a one-column DataFrame;
# X is every remaining column.
y = data[['salary']]
X = data.drop(columns='salary')

We have three categorical variables that can be transformed into dummies.

Let's start with the easy case: discipline takes only two values, so we can use the LabelEncoder() from the preprocessing package.

# Let's transform and use only one simple variable that can take only two values

In [None]:
# Two options are available: pd.get_dummies or LabelEncoder.
# Option 1: pd.get_dummies, which returns one indicator column per level.
dummies_pd = pd.get_dummies(data.loc[:, 'discipline'])
dummies_pd.head()

In [None]:
# Two options are available: pd.get_dummies or LabelEncoder.
# Option 2: LabelEncoder, which replaces each level with an integer code.
# LabelEncoder expects a 1-D array of labels, so pass the column as a Series
# (data['discipline']) rather than a one-column DataFrame, which triggers a
# data-conversion warning.
le = LabelEncoder()
dummies_encoder = le.fit_transform(data['discipline'])
dummies_encoder

We notice that dummies_encoder is the same as column B of dummies_pd.

We can just use column B of the matrix that we obtain using pd.get_dummies().

In [None]:
# Show what X.assign() does.

In [None]:
# Approach 1 (pure pandas): remove the original 'discipline' column and
# attach the 'B' indicator under a new name.
X_get_dummies = (
    X.drop('discipline', axis=1)
     .assign(Discipline_dummy=dummies_pd[['B']])
)

# Approach 2 (NumPy): stack the remaining features and the 'B' indicator
# side by side, then rebuild a DataFrame and name the columns by hand.
# NOTE(review): stacking string and numeric columns into one array presumably
# yields an object-dtype array -- confirm before modelling with this frame.
X_get_dummies2 = np.concatenate(
    (X.drop('discipline', axis=1), dummies_pd[['B']]),
    axis=1,
)
X_get_dummies2_df = pd.DataFrame(
    X_get_dummies2,
    columns=['rank', 'yrs.since.phd', 'yrs.service', 'sex', 'discipline_dummy'],
)
X_get_dummies.head()

In [None]:
# Preview the frame rebuilt from the stacked NumPy array.
X_get_dummies2_df.head()

In [None]:
# Approach 1 (pure pandas): attach the LabelEncoder codes as a new column,
# then remove the original 'discipline' column.
X_le = X.assign(Discipline_dummy=dummies_encoder)
X_le = X_le.drop('discipline', axis=1)

# Approach 2 (NumPy): stack the remaining features with the encoder output
# and rebuild a DataFrame from the result.
# dummies_encoder is 1-D, so it is reshaped into a single column first.
# (The previous version stacked the get_dummies column and built the frame
# from X_get_dummies2, so it never actually used the LabelEncoder output.)
X_le2 = np.hstack((X.drop('discipline', axis=1),
                   np.asarray(dummies_encoder).reshape(-1, 1)))
X_le2_df = pd.DataFrame(X_le2, columns=['rank', 'yrs.since.phd', 'yrs.service', 'sex',
                                        'discipline_dummy'])
X_le.head()

In [None]:
# Preview the frame built in the previous cell (X_le2_df).
X_le2_df.head()

After this data transformation and manipulation, let's fit the linear regression model.

- 1. Let's split the sample into a training set and a test set
- 2. Fit the model
- 3. Predict using the model 
- 4. Evaluate the performance

I will use only X_get_dummies and X_le to show you that we get the same results.

### First case: Let's use only the continuous variables and the dummy we have created

In [None]:
# 1. split the sample
# Using the get_dummies features (X_le can be used alternatively).
# Keep 20% of the rows for testing. random_state is set to 10, the same seed
# used by the X_le cell below, so that both fits see the same train/test
# split and their results can be compared directly.
X_ex1 = X_get_dummies[['yrs.since.phd','yrs.service','Discipline_dummy']]
X_train, X_test, y_train,y_test = train_test_split(X_ex1, y,
                                                  test_size = 0.2,
                                                  random_state = 10)

In [None]:
# Step 2: fit a linear regression on the training split.
lm = LinearRegression()
lm.fit(X=X_train, y=y_train)

In [None]:
# Step 3: predict on the held-out test set.
y_pred = lm.predict(X_test)

# Coefficient table: the intercept first, then one row per feature.
index_name = ['intercept', 'yrs.since.phd', 'yrs.service', 'Discipline_dummy']
coef = pd.DataFrame(
    {'coefficients': np.append(lm.intercept_, lm.coef_)},
    index=index_name,
)

# Step 4: evaluate the model on the test set (the MSE is computed once and
# reused for the RMSE).
mse = mean_squared_error(y_test, y_pred)
print('MSE linear regression using only 3 variables: ', mse)
print('RMSE linear regression using only 3 variables: ', math.sqrt(mse))
print('R2 linear regression using only 3 variables: ', r2_score(y_test, y_pred))
print(coef)

In [None]:
# Repeat the first case with the LabelEncoder-based features (X_le);
# this is meant to reproduce the get_dummies fit.

# 1. split the sample: 20% of the rows are held out for testing
X_ex1 = X_le[['yrs.since.phd', 'yrs.service', 'Discipline_dummy']]
X_train, X_test, y_train, y_test = train_test_split(
    X_ex1, y, test_size=0.2, random_state=10
)

# 2. fit the model
lm = LinearRegression().fit(X_train, y_train)

# 3. predict on the test set
y_pred = lm.predict(X_test)

# Coefficient table: the intercept first, then one row per feature.
index_name = ['intercept', 'yrs.since.phd', 'yrs.service', 'Discipline_dummy']
coef = pd.DataFrame(
    {'coefficients': np.append(lm.intercept_, lm.coef_)},
    index=index_name,
)

# 4. evaluate the model (the MSE is computed once and reused for the RMSE)
mse = mean_squared_error(y_test, y_pred)
print('MSE linear regression using only 3 variables: ', mse)
print('RMSE linear regression using only 3 variables: ', math.sqrt(mse))
print('R2 linear regression using only 3 variables: ', r2_score(y_test, y_pred))
print(coef)

# Let's use all the variables, convert all the categorical ones and then fit the model

In [None]:
# Let's use pd.get_dummies first.
# NB: remember that we always have to drop one of the dummies
# to avoid multicollinearity.

`pd.factorize(data['rank'])` is different from `get_dummies`: it encodes the object as an enumerated type or categorical variable, i.e. it returns integer codes for the levels instead of one indicator column per level.



In [None]:
# Let's take the original feature matrix and look at it again.
X.head()
# We need to convert three columns: rank, discipline and sex.

### a) using get_dummies

We will use the X matrix, which is already a DataFrame, from now on.

In [None]:
# Build the indicator columns for each categorical variable in turn
# (no level is dropped yet).
rank_d, discipline_d, sex_d = (
    pd.get_dummies(data[column]) for column in ('rank', 'discipline', 'sex')
)

In [None]:
# Show the first rows of each indicator frame, one after the other.
print(rank_d.head(), discipline_d.head(), sex_d.head(), sep='\n')
# we need to drop one dummy for each category

In [None]:
# There are multiple ways to keep all but one indicator column per variable;
# this is also a short review of indexing.

# 1. drop the unwanted column by name
rank_dummy = rank_d.drop('AssocProf', axis = 1)
discipline_dummy = discipline_d.drop('A', axis = 1)
sex_dummy = sex_d.drop('Female', axis = 1)

# 2. drop the first column by position (keep columns 1 onwards)
rank_dummy2 = rank_d.iloc[:, 1:]
discipline_dummy2 = discipline_d.iloc[:, 1:]
sex_dummy2 = sex_d.iloc[:, 1:]

# Print both versions of the rank dummies so they can be compared.
print(rank_dummy)
print(rank_dummy2)

In [None]:
# A much easier way: get_dummies can drop the first level itself
# through drop_first=True.
rank_d3, discipline_d3, sex_d3 = (
    pd.get_dummies(data[column], drop_first=True)
    for column in ('rank', 'discipline', 'sex')
)

In [None]:
# Attach the dummies to the feature frame: pd.concat places the original
# features and the three drop_first dummy frames side by side.
X_get_dummies = pd.concat(
    objs=[X, rank_d3, discipline_d3, sex_d3],
    axis='columns',
)
X_get_dummies.head()

In [None]:
# The original categorical columns are now redundant with their dummies,
# so remove them.
X_get_dummies = X_get_dummies.drop(columns=['rank', 'discipline', 'sex'])
X_get_dummies.head()

In [None]:
# 1. split the sample: 20% of the rows are held out for testing
X_train, X_test, y_train, y_test = train_test_split(
    X_get_dummies, y, test_size=0.2, random_state=10
)

# 2. fit the model
lm = LinearRegression().fit(X_train, y_train)

# 3. predict out of sample (test) and in sample (train)
y_pred = lm.predict(X_test)
y_train_pred = lm.predict(X_train)

# Coefficient table: the intercept first, then one row per feature column.
index_name = ['intercept', *X_train.columns]
coef = pd.DataFrame(
    {'coefficients': np.append(lm.intercept_, lm.coef_)},
    index=index_name,
)

# 4. evaluate the model (each MSE is computed once and reused for its RMSE)
mse_oos = mean_squared_error(y_test, y_pred)
print('MSE oos linear regression using all variables: ', mse_oos)
print('RMSE oos linear regression using all variables: ', math.sqrt(mse_oos))
print('R2 oos linear regression using all variables: ', r2_score(y_test, y_pred))

mse_insample = mean_squared_error(y_train, y_train_pred)
print('MSE insample linear regression using all variables: ', mse_insample)
print('RMSE insample linear regression using all variables: ', math.sqrt(mse_insample))
print('R2 insample linear regression using all variables: ', r2_score(y_train, y_train_pred))

coef

### b) using LabelEncoder() and OneHotEncoder()

In [None]:
le = LabelEncoder()

# discipline and sex can each be encoded as a single column with LabelEncoder
discipline_d = le.fit_transform(data['discipline'])
sex_d = le.fit_transform(data['sex'])

# rank needs the one-hot encoder
# 1. select the unique categories (in order of appearance)
cat = pd.unique(data['rank']).ravel()

# 2. define the OneHotEncoder; remember to drop the first indicator column
#    to avoid collinearity (not needed for the LabelEncoder columns).
#    The encoder is left on its default sparse output and densified with
#    .toarray(): the old `sparse=False` argument was renamed to
#    `sparse_output` in scikit-learn 1.2 and later removed, so this form
#    works on both old and new versions.
ohe = OneHotEncoder(categories = [cat])

rank_d = ohe.fit_transform(data[['rank']]).toarray()
rank_d = np.asarray(rank_d[:, 1:], dtype = int)   # drop first column, convert to int

# Create the new dataset. We want a DataFrame, so we have two choices:
# 1. build a NumPy matrix and convert it into a DataFrame
#    (the 1-D encoder outputs are reshaped into single columns first)
X_le_tmp = np.hstack((data.values, rank_d, discipline_d.reshape(-1,1),
          sex_d.reshape(-1,1)))
names_col = ['rank', 'discipline', 'yrs.since.phd', 'yrs.service', 'sex',
            'salary', 'rank_d1', 'rank_d2', 'discipline_d', 'sex_d']
X_le_tmp = pd.DataFrame(X_le_tmp, columns = names_col)

# 2. convert the dummies into DataFrames and append them to the original data
#    (column names match names_col above; 'discipline_d' was previously
#    misspelled here, so the two frames disagreed)
rank_d = pd.DataFrame(rank_d, columns = ['rank_d1', 'rank_d2'])
discipline_d = pd.DataFrame(discipline_d, columns = ['discipline_d'])
sex_d = pd.DataFrame(sex_d, columns = ['sex_d'])

X_le_tmp2 = pd.concat([data, rank_d, discipline_d, sex_d], axis = 1)

In [None]:
# we need to drop the original columns
# Let's remember to drop salary as well!!
# Keep only the model features in both frames: the encoded categorical
# columns and the target are removed.
X_le = X_le_tmp.drop(columns=['rank', 'discipline', 'sex', 'salary'])
X_le2 = X_le_tmp2.drop(columns=['rank', 'discipline', 'sex', 'salary'])

In [None]:
# Let's fit the linear model on the LabelEncoder / OneHotEncoder features.

# 1. split the sample: 20% of the rows are held out for testing
X_train, X_test, y_train,y_test = train_test_split(X_le, y,
                                                  test_size = 0.2,
                                                  random_state = 10)

# 2. fit the model
lm = LinearRegression()
lm.fit(X_train, y_train)

# 3. predict out of sample (test) and in sample (train)
y_pred = lm.predict(X_test)
# This was previously assigned to the misspelled name `y_trian_pred`, so the
# in-sample metrics below silently used the stale `y_train_pred` left over
# from the get_dummies model.
y_train_pred = lm.predict(X_train)

# Coefficient table: the intercept first, then one row per feature column.
index_name = ['intercept']
for i in X_train.columns:
    index_name.append(i)

# coef
coef = pd.DataFrame(np.append(lm.intercept_, lm.coef_),
                    index_name, columns = ['coefficients'])

# 4. evaluate the model
print('MSE linear regression using all variables: ',
      mean_squared_error(y_test, y_pred))
print('RMSE linear regression using all variables: ',
      math.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 linear regression using only all variable: ',
      r2_score(y_test, y_pred))

print('MSE insample linear regression using all variables: ',
      mean_squared_error(y_train, y_train_pred))
print('RMSE insample linear regression using all variables: ',
      math.sqrt(mean_squared_error(y_train, y_train_pred)))
print('R2 insample linear regression using all variables: ',
      r2_score(y_train, y_train_pred))

coef