# ML - SL - Linear Regression Algorithm continue ...
# One hot encoding with Dummy variables

In [26]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [27]:
# Load the GPA data set from the working directory and preview the first five rows.
data1 = pd.read_csv(filepath_or_buffer="GPA_data.csv")
data1.head(n=5)

Unnamed: 0,ID,Gender,Age,Extra_Curricular,Study_Hours,Annual_Income,Distance_From_Home,GPA
0,8867,Male,22,Societies,10,1318792,154,1.76
1,1316,Female,24,Societies,9,874657,181,3.88
2,9443,Male,22,Societies,15,1098331,94,1.73
3,1829,Female,25,No,9,1680585,8,1.29
4,1989,Male,26,No,7,1361560,122,3.19


The `ID` column cannot be used as a predictor variable, because it is only an identifier for each row and does not vary in a way that is meaningful for the model.

Some of the variables are categorical, so we have to convert them to dummy variables (one-hot encoding). Note that some machine learning models do not need categorical variables to be one-hot encoded.

In [28]:
# Count how many rows fall into each Gender category.
data1.loc[:, "Gender"].value_counts()

Gender
Male      500
Female    500
Name: count, dtype: int64

In [29]:
# Count how many rows fall into each Extra_Curricular category.
data1.loc[:, "Extra_Curricular"].value_counts()

Extra_Curricular
Societies    353
Sports       336
No           311
Name: count, dtype: int64

### Subset 

In [30]:
# Preview only the two categorical columns that are going to be encoded.
data1.loc[:, ["Gender", "Extra_Curricular"]]

Unnamed: 0,Gender,Extra_Curricular
0,Male,Societies
1,Female,Societies
2,Male,Societies
3,Female,No
4,Male,No
...,...,...
995,Female,No
996,Male,No
997,Male,Sports
998,Male,Societies


### Creating dummy variables

In [31]:
# One-hot encode the categorical columns: every category becomes its own
# True/False column named <column>_<category>.
categorical_columns = ["Gender", "Extra_Curricular"]
dfd = pd.get_dummies(data1[categorical_columns])
dfd

Unnamed: 0,Gender_Female,Gender_Male,Extra_Curricular_No,Extra_Curricular_Societies,Extra_Curricular_Sports
0,False,True,False,True,False
1,True,False,False,True,False
2,False,True,False,True,False
3,True,False,True,False,False
4,False,True,True,False,False
...,...,...,...,...,...
995,True,False,True,False,False
996,False,True,True,False,False
997,False,True,False,False,True
998,False,True,False,True,False


### Dropping Columns

We could fit the model without removing any dummy columns, but that causes problems in descriptive statistics, conventional statistics and predictive modelling, because the model should be easy to interpret. Therefore we always remove one dummy column per categorical variable.

In explainable machine learning, when balancing interpretability against prediction accuracy, there is a concept called multicollinearity.

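If all dummy columns are kept, they depend on each other: for example, `Gender_Male` is always the opposite of `Gender_Female`, so one of the two is redundant.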

In [32]:
# Drop one dummy column per categorical variable (the baseline category) so
# that the remaining dummy columns are not perfectly dependent on each other.
#
# The result is reassigned instead of using inplace=True, and errors="ignore"
# skips labels that are already gone, so re-running this cell is a no-op
# rather than a KeyError. Trade-off: a misspelled label is skipped silently,
# so check the displayed columns below.
dfd = dfd.drop(columns=["Gender_Female", "Extra_Curricular_No"], errors="ignore")
dfd

Unnamed: 0,Gender_Male,Extra_Curricular_Societies,Extra_Curricular_Sports
0,True,True,False
1,False,True,False
2,True,True,False
3,False,False,False
4,True,False,False
...,...,...,...
995,False,False,False
996,True,False,False
997,True,False,True
998,True,True,False


### Concatenating the dummy variables to the original data and removing the `Gender`, `Extra_Curricular` and `ID` columns

In [33]:
# Place the dummy columns next to the original columns (column-wise
# concatenation, aligned on the row index) to form the new data set.
frames = [data1, dfd]
data_new = pd.concat(frames, axis="columns")
data_new.head()

Unnamed: 0,ID,Gender,Age,Extra_Curricular,Study_Hours,Annual_Income,Distance_From_Home,GPA,Gender_Male,Extra_Curricular_Societies,Extra_Curricular_Sports
0,8867,Male,22,Societies,10,1318792,154,1.76,True,True,False
1,1316,Female,24,Societies,9,874657,181,3.88,False,True,False
2,9443,Male,22,Societies,15,1098331,94,1.73,True,True,False
3,1829,Female,25,No,9,1680585,8,1.29,False,False,False
4,1989,Male,26,No,7,1361560,122,3.19,True,False,False


In [34]:
# Remove the original categorical columns (now represented by the dummies)
# and the ID column (not used as a predictor).
#
# The result is reassigned instead of using inplace=True, and errors="ignore"
# skips labels that are already gone, so re-running this cell is a no-op
# rather than a KeyError. Trade-off: a misspelled label is skipped silently,
# so check the displayed columns below.
data_new = data_new.drop(columns=["Gender", "Extra_Curricular", "ID"], errors="ignore")
data_new

Unnamed: 0,Age,Study_Hours,Annual_Income,Distance_From_Home,GPA,Gender_Male,Extra_Curricular_Societies,Extra_Curricular_Sports
0,22,10,1318792,154,1.76,True,True,False
1,24,9,874657,181,3.88,False,True,False
2,22,15,1098331,94,1.73,True,True,False
3,25,9,1680585,8,1.29,False,False,False
4,26,7,1361560,122,3.19,True,False,False
...,...,...,...,...,...,...,...,...
995,22,5,1675219,27,2.32,False,False,False
996,24,8,947439,125,2.70,True,False,False
997,26,6,1912975,67,1.36,True,False,True
998,22,12,341543,86,1.88,True,True,False


### Changing columns order

Get the list of column names using `data_new.columns`.

In [36]:
# Reorder the columns so that the seven predictors come first and the
# target (GPA) is the last column.
ordered_columns = [
    'Age',
    'Study_Hours',
    'Annual_Income',
    'Distance_From_Home',
    'Gender_Male',
    'Extra_Curricular_Societies',
    'Extra_Curricular_Sports',
    'GPA',
]
data_new = data_new.reindex(columns=ordered_columns)
data_new

Unnamed: 0,Age,Study_Hours,Annual_Income,Distance_From_Home,Gender_Male,Extra_Curricular_Societies,Extra_Curricular_Sports,GPA
0,22,10,1318792,154,True,True,False,1.76
1,24,9,874657,181,False,True,False,3.88
2,22,15,1098331,94,True,True,False,1.73
3,25,9,1680585,8,False,False,False,1.29
4,26,7,1361560,122,True,False,False,3.19
...,...,...,...,...,...,...,...,...
995,22,5,1675219,27,False,False,False,2.32
996,24,8,947439,125,True,False,False,2.70
997,26,6,1912975,67,True,False,True,1.36
998,22,12,341543,86,True,True,False,1.88


### Model fitting

In [39]:
# Select predictors and target by name rather than by position, so this cell
# does not silently depend on the column order set by an earlier cell.
feature_columns = ['Age', 'Study_Hours', 'Annual_Income', 'Distance_From_Home',
                   'Gender_Male', 'Extra_Curricular_Societies', 'Extra_Curricular_Sports']

# The dummy columns are boolean and the rest are integers. Converting to float
# explicitly avoids building an object-dtype array from the mixed columns.
x = data_new[feature_columns].to_numpy(dtype=float)
y = data_new["GPA"].to_numpy()

# Hold out 20% of the rows for testing; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

model = LinearRegression()
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

# scikit-learn metrics take (y_true, y_pred) in that order.
MSE = mean_squared_error(y_test, y_predict)
RMSE = np.sqrt(MSE)

print("Intercept = ",model.intercept_)   # c in y = mx + c (beta nought)
print("\nCoef = ",model.coef_)   # one coefficient per predictor: b1^, b2^, ..., b7^
print("\nR Squred value = ",model.score(x_train, y_train))   # R^2 on the TRAINING data; closer to 1 is better
print("\ny predict \n",y_predict)
print("\nMean Squared Error = ",MSE)   # computed on the held-out test data
print("\nRoot Mean Squared Error = ",RMSE)

Intercept =  2.2647935280481137

Coef =  [ 1.12426507e-02 -1.03471765e-02  9.62872140e-08 -2.21608112e-04
 -5.12717033e-02  2.65567024e-02  4.36877818e-02]

R Squred value =  0.007164234631799715

y predict 
 [2.45859544 2.48189844 2.52609814 2.44755148 2.43441844 2.4441942
 2.59304194 2.5650934  2.45743111 2.54705058 2.41369892 2.48703529
 2.54110891 2.55756711 2.40964999 2.6464932  2.55155379 2.49226051
 2.61246241 2.58476053 2.48632814 2.62401499 2.40425402 2.54747027
 2.52877333 2.53210703 2.58689872 2.42520324 2.5997469  2.51215648
 2.55717346 2.57209611 2.41712661 2.51984043 2.48485112 2.53237746
 2.47437727 2.52478684 2.62616758 2.53023872 2.60300406 2.46409328
 2.44288531 2.52653268 2.54175963 2.43226457 2.38444236 2.54421868
 2.39457219 2.45051988 2.45748266 2.54048337 2.39321516 2.45221297
 2.54202793 2.60474351 2.37836383 2.60193504 2.64836809 2.47970387
 2.60829197 2.43343052 2.46820314 2.43089338 2.48106334 2.54937679
 2.5677143  2.50048765 2.50381686 2.59987949 2.54103206

### Let's check a prediction for a new observation

In [40]:
# predict() expects a 2-D input, so a single observation is written as one
# row with 7 values (shape (1, 7)), one value per predictor.
x_check = np.array([[24, 5, 20000, 300, 1, 0, 1]])
x_check_predict = model.predict(x_check)
print("Prediction of x_check = ", x_check_predict)

Prediction of x_check =  [2.41074065]


# Full code

In [45]:
# Multiple Linear Regression using Dummy variable with Train Test Split
#
# Self-contained version of the steps above: load the data, one-hot encode the
# categorical columns, drop the baseline dummies and unused columns, fit a
# linear regression on an 80/20 train/test split and report the results.

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

data1  = pd.read_csv("GPA_data.csv")
print("Head of GPA_data set : \n\n",data1.head())

# One-hot encode the two categorical columns.
dfd = pd.get_dummies(data1[["Gender", "Extra_Curricular"]])
print("\nDummies : \n\n",dfd)

# Drop one dummy per categorical variable (the baseline category).
# Reassignment is used instead of inplace=True so no frame is mutated in place.
dfd = dfd.drop(columns=["Gender_Female", "Extra_Curricular_No"])
print("\nAfter dropped Gender_Female & Extra_Curricular_No : \n\n",dfd)

data_new = pd.concat([data1, dfd], axis=1)
print("\nAfter concatenating Dummies to Original : \n\n",data_new.head())

# Remove the original categorical columns and the ID column.
data_new = data_new.drop(columns=["Gender", "Extra_Curricular", "ID"])
print("\nDropped unwanted previous Gender & Extra_Curricular & ID : \n\n",data_new)

# Predictors first, target (GPA) last.
feature_columns = ['Age', 'Study_Hours', 'Annual_Income', 'Distance_From_Home',
                   'Gender_Male', 'Extra_Curricular_Societies', 'Extra_Curricular_Sports']
data_new = data_new.reindex(columns = feature_columns + ['GPA'])
print("\nLets use this new Data set : \n\n",data_new.head())

# Select predictors and target by name. The dummy columns are boolean and the
# rest are integers, so convert to float explicitly instead of building an
# object-dtype array from the mixed columns.
x = data_new[feature_columns].to_numpy(dtype=float)
y = data_new["GPA"].to_numpy()

# Hold out 20% of the rows for testing; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

model = LinearRegression()
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

# scikit-learn metrics take (y_true, y_pred) in that order.
MSE = mean_squared_error(y_test, y_predict)
RMSE = np.sqrt(MSE)
print("\nIntercept = ",model.intercept_)
print("\nCoef = ",model.coef_)
print("\nR Score value = ",model.score(x_train, y_train))   # R^2 on the TRAINING data
print("\ny predict \n",y_predict)
print("\nMean Squared Error = ",MSE)   # computed on the held-out test data
print("\nRoot Mean Squared Error = ",RMSE)

# Predict for a single new observation: one row with 7 values, in the same
# order as feature_columns.
x_check = np.array([24,5,20000,300,1,0,1]).reshape(1,7)
x_check_predict = model.predict(x_check)
print("\nx_check : ", x_check)
print("\nPredicted \"GPA\" for x_check = ",x_check_predict)

Head of GPA_data set : 

      ID  Gender  Age Extra_Curricular  Study_Hours  Annual_Income  \
0  8867    Male   22        Societies           10        1318792   
1  1316  Female   24        Societies            9         874657   
2  9443    Male   22        Societies           15        1098331   
3  1829  Female   25               No            9        1680585   
4  1989    Male   26               No            7        1361560   

   Distance_From_Home   GPA  
0                 154  1.76  
1                 181  3.88  
2                  94  1.73  
3                   8  1.29  
4                 122  3.19  

Dummies : 

      Gender_Female  Gender_Male  Extra_Curricular_No  \
0            False         True                False   
1             True        False                False   
2            False         True                False   
3             True        False                 True   
4            False         True                 True   
..             ...          .