## Multiple Linear Regression  Assignment
### Import Libraries 

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from pandas.core.common import random_state
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

## import & explore dataset

In [38]:
df =pd.read_csv("multiple_linear_regression_dataset.csv")

In [39]:
df.head(3)

Unnamed: 0,age,experience,income
0,25,1,30450
1,30,3,35670
2,47,2,31580


In [40]:
df.tail(3)

Unnamed: 0,age,experience,income
17,23,1,30870
18,44,9,44190
19,37,10,48700


In [41]:
df.sample(6)

Unnamed: 0,age,experience,income
1,30,3,35670
0,25,1,30450
7,33,4,37650
16,58,17,63600
2,47,2,31580
18,44,9,44190


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   age         20 non-null     int64
 1   experience  20 non-null     int64
 2   income      20 non-null     int64
dtypes: int64(3)
memory usage: 612.0 bytes


In [43]:
df

Unnamed: 0,age,experience,income
0,25,1,30450
1,30,3,35670
2,47,2,31580
3,32,5,40130
4,43,10,47830
5,51,7,41630
6,28,5,41340
7,33,4,37650
8,37,5,40250
9,39,8,45150


### statistical Analysis

In [44]:
df.describe()

Unnamed: 0,age,experience,income
count,20.0,20.0,20.0
mean,39.65,6.2,40735.5
std,10.027725,4.124382,8439.797625
min,23.0,1.0,27840.0
25%,31.5,3.75,35452.5
50%,40.0,5.0,40190.0
75%,47.0,9.0,45390.0
max,58.0,17.0,63600.0


## split feature dataset & target column

In [45]:
feature =df.drop("income",axis="columns")
target = df.income

In [46]:
feature

Unnamed: 0,age,experience
0,25,1
1,30,3
2,47,2
3,32,5
4,43,10
5,51,7
6,28,5
7,33,4
8,37,5
9,39,8


In [47]:
target

0     30450
1     35670
2     31580
3     40130
4     47830
5     41630
6     41340
7     37650
8     40250
9     45150
10    27840
11    46110
12    36720
13    34800
14    51300
15    38900
16    63600
17    30870
18    44190
19    48700
Name: income, dtype: int64

## visualizing data

In [48]:
import plotly.express as px
import plotly.graph_objects as go
fig = px.scatter_3d(feature, x=feature.age,y=feature.experience,z=target,)
fig.show()

## Scaling the data

In [49]:
scaler = StandardScaler()
scaler.fit(feature)
feature = scaler.transform(feature)
feature

array([[-1.49890262, -1.29354835],
       [-0.98733176, -0.79602975],
       [ 0.75200916, -1.04478905],
       [-0.78270342, -0.29851116],
       [ 0.34275248,  0.94528533],
       [ 1.16126585,  0.19900744],
       [-1.1919601 , -0.29851116],
       [-0.68038924, -0.54727045],
       [-0.27113256, -0.29851116],
       [-0.06650421,  0.44776674],
       [-1.08964593, -1.29354835],
       [ 0.75200916,  0.69652603],
       [ 1.46820837, -0.29851116],
       [ 1.16126585, -0.54727045],
       [ 0.44506665,  1.44280393],
       [ 0.13812413, -0.04975186],
       [ 1.87746506,  2.68660041],
       [-1.70353096, -1.29354835],
       [ 0.44506665,  0.69652603],
       [-0.27113256,  0.94528533]])

## splitting the data

In [50]:
X_train , X_test , y_train , y_test = train_test_split( feature , target , test_size = 0.2, random_state = 42)

In [51]:
X_test 

array([[-1.49890262, -1.29354835],
       [-1.70353096, -1.29354835],
       [ 0.13812413, -0.04975186],
       [-0.98733176, -0.79602975]])

In [52]:
y_test

0     30450
17    30870
15    38900
1     35670
Name: income, dtype: int64

## build model

In [53]:
reg = LinearRegression()
reg.fit(X_train , y_train)

# predict results

In [54]:
y_pred = reg.predict(X_test)
y_pred_train = reg.predict(X_train)

In [55]:
y_pred

array([31093.38107376, 31295.49954076, 40250.46080162, 34897.6958918 ])

In [56]:
y_pred_train

array([38499.89224284, 41394.67395938, 46108.52187893, 39005.18841035,
       46411.69957944, 62235.31425262, 34930.25748106, 31024.88342951,
       44762.19025417, 49273.91970672, 48667.56430571, 36781.88527333,
       36749.32368407, 30689.14413975, 52876.11605776, 39409.42534435])

## evaluating the results

### evaluating the model

In [57]:
reg.intercept_

40817.851865396566

In [58]:
reg.coef_

array([-987.73446086, 8662.21087604])

In [59]:
MSE = mean_squared_error(y_test , y_pred)
print( 'MSE = ' , MSE)

MSE =  753796.7693734562


In [60]:
r2 = r2_score(y_test ,y_pred)
accuracy = int(r2*100)
print( 'accuracy =', accuracy, '%')

accuracy = 93 %


# visualization of results

### trainning data predictions

In [66]:
fig = px.scatter_3d(x=X_train[:, 0], y=X_train[:, 1], z=y_pred_train)

fig.show()

### test data predictions

In [63]:
fig = px.scatter_3d(x=X_test[:, 0], y=X_test[:, 1], z=y_pred)

fig.show()