# Multiple Linear Regression
* Uses more than one independent variable for prediction.<br>
_The example below is a very simple multiple regression problem._

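### [Dummy variable trap](https://medium.com/datadriveninvestor/dummy-variable-trap-c6d4a387f10a)
One-hot encoding a categorical column with *k* categories produces *k* indicator columns that always sum to 1, which makes them perfectly collinear with the model's intercept. Dropping one indicator (`drop_first=True` in the code below) avoids this.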

In [14]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import math

# Load the city-wise house price table (columns: rooms, Area, City, Price).
data = pd.read_csv('houses_price_citywise.csv')
print(data)

# One-hot encode the categorical 'City' column into indicator columns.
# drop_first=True omits the first category so the remaining indicators are not
# perfectly collinear with the intercept (the dummy variable trap).
# dtype=int keeps the indicators as 0/1 on every pandas version; pandas >= 2.0
# would otherwise return booleans (True/False).
d1 = pd.get_dummies(data['City'], drop_first=True, dtype=int)
print(d1)

    rooms  Area        City   Price
0       5   400  Chandigarh  101.50
1       3   300       Delhi   79.90
2       5   375       Delhi   99.87
3       3   200  Chandigarh   56.90
4       2   150       Delhi   66.60
5       5   450     Patiala  105.45
6       5   300  Chandigarh  126.30
7       4   300     Patiala   89.25
8       5   300     Patiala   99.97
9       3   200  Chandigarh   87.60
10      4   300     Patiala  112.60
11      3   200       Delhi   85.60
12      3   200  Chandigarh   78.50
13      2   160     Patiala   74.30
14      2   150  Chandigarh   74.80
    Delhi  Patiala
0       0        0
1       1        0
2       1        0
3       0        0
4       1        0
5       0        1
6       0        0
7       0        1
8       0        1
9       0        0
10      0        1
11      1        0
12      0        0
13      0        1
14      0        0


In [15]:
from sklearn.model_selection import train_test_split

# Put the city indicator columns in front of the original columns.
# New names are used instead of overwriting `data`, so this cell can be
# re-run without duplicating the indicator columns or failing on the drop.
data_with_dummies = pd.concat([d1, data], axis=1)
print(data_with_dummies)

# Drop the original categorical column; the indicators now represent it.
data_encoded = data_with_dummies.drop('City', axis=1)
print(data_encoded)

# Select the target by name rather than by position, so the split between
# features and target does not depend on the column order.
X = data_encoded.drop('Price', axis=1)
Y = data_encoded['Price']
print(X)
print(Y)

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1
)

model = LinearRegression()
model.fit(x_train, y_train)
pred = model.predict(x_test)
print(pred)

    Delhi  Patiala  rooms  Area        City   Price
0       0        0      5   400  Chandigarh  101.50
1       1        0      3   300       Delhi   79.90
2       1        0      5   375       Delhi   99.87
3       0        0      3   200  Chandigarh   56.90
4       1        0      2   150       Delhi   66.60
5       0        1      5   450     Patiala  105.45
6       0        0      5   300  Chandigarh  126.30
7       0        1      4   300     Patiala   89.25
8       0        1      5   300     Patiala   99.97
9       0        0      3   200  Chandigarh   87.60
10      0        1      4   300     Patiala  112.60
11      1        0      3   200       Delhi   85.60
12      0        0      3   200  Chandigarh   78.50
13      0        1      2   160     Patiala   74.30
14      0        0      2   150  Chandigarh   74.80
    Delhi  Patiala  rooms  Area   Price
0       0        0      5   400  101.50
1       1        0      3   300   79.90
2       1        0      5   375   99.87
3       

In [16]:
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) of the predictions on the held-out rows.
score = r2_score(y_true=y_test, y_pred=pred)
print(score)

0.43669347275875026


### Replacing NaN values

In [21]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import math

# Columns used as model inputs, shared by training and prediction so both
# always see the same features in the same order.
feature_cols = ['rooms', 'Area']

data = pd.read_csv('houses_price_3_Columns.csv')
print(data)
print()

# Replacement value for missing 'Area' entries: the floor of the column median
# (the median skips NaN by default).
m_value = math.floor(data.Area.median())
print(m_value)   #replace value for NaN

# Replace NaN with the floor of the "Area" median.
data['Area'] = data['Area'].fillna(m_value)
print("Traning data :\n",data)

model = LinearRegression()
model.fit(data[feature_cols], data.Price)

data1 = pd.read_csv('houses_input_data.csv')
print("Test data:\n",data1)

# Predict from the named feature columns only, instead of the whole frame, so
# extra or reordered columns in the input file cannot break or skew the call.
pred = np.round(model.predict(data1[feature_cols]), 2)
print("Predicted prices for test data:\n",pred)

# Attach the predictions to the test data and write the result to disk.
data1['prices'] = pred
print(data1)
data1.to_csv('output.csv', index=False)

    rooms   Area   Price
0       5  400.0  101.50
1       3  300.0   79.90
2       5  375.0   99.87
3       3  200.0   56.90
4       2  150.0   66.60
5       5  450.0  105.45
6       5  300.0  126.30
7       4  300.0   89.25
8       5    NaN   99.97
9       3  200.0   87.60
10      4  300.0  112.60
11      3  200.0   85.60
12      3  200.0   78.50
13      2  160.0   74.30
14      2  150.0   74.80

250
Traning data :
     rooms   Area   Price
0       5  400.0  101.50
1       3  300.0   79.90
2       5  375.0   99.87
3       3  200.0   56.90
4       2  150.0   66.60
5       5  450.0  105.45
6       5  300.0  126.30
7       4  300.0   89.25
8       5  250.0   99.97
9       3  200.0   87.60
10      4  300.0  112.60
11      3  200.0   85.60
12      3  200.0   78.50
13      2  160.0   74.30
14      2  150.0   74.80
Test data:
    rooms  Area
0      5   400
1      3   300
2      5   375
3      3   200
4      2   150
5      5   450
6      5   300
7      4   300
8      5   250
9      3   200
Pr