### Build a Neural Network model for 50_startups data to predict profit 

In [269]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score

In [247]:
df =pd.read_csv("G:\\Github\\DS-assignments-python\\Neural Network\\50_Startups.csv")

df.info()
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [248]:
df.shape

(50, 5)

In [249]:
df.head(5)


Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [250]:
# as state in categorical we need to make it numerical using dumdums

dum = pd.get_dummies(df["State"]).rename(columns = lambda x : "State_" + str(x))
dum.head(5)

Unnamed: 0,State_California,State_Florida,State_New York
0,0,0,1
1,1,0,0
2,0,1,0
3,0,0,1
4,0,1,0


In [251]:
#joining the dummy and dropping the redundant
df1 = pd.concat([dum,df],axis = 1)
df1.drop(["State"],axis=1,inplace=True)

In [252]:
df1.head(5)

Unnamed: 0,State_California,State_Florida,State_New York,R&D Spend,Administration,Marketing Spend,Profit
0,0,0,1,165349.2,136897.8,471784.1,192261.83
1,1,0,0,162597.7,151377.59,443898.53,191792.06
2,0,1,0,153441.51,101145.55,407934.54,191050.39
3,0,0,1,144372.41,118671.85,383199.62,182901.99
4,0,1,0,142107.34,91391.77,366168.42,166187.94


In [253]:
#df1.iloc[:,:6].columns.tolist()

In [254]:
#xy.fit_transform(df1.iloc[:,:6])

In [255]:
# we have stardize the values before putting it in the model

xy = StandardScaler()

#performing a deep copy
dfx = df1.copy()
dfx[df1.iloc[:,:6].columns.tolist()] = xy.fit_transform(df1.iloc[:,:6])
dfx.head(5)

Unnamed: 0,State_California,State_Florida,State_New York,R&D Spend,Administration,Marketing Spend,Profit
0,-0.717741,-0.685994,1.393261,2.016411,0.560753,2.153943,192261.83
1,1.393261,-0.685994,-0.717741,1.95586,1.082807,1.9236,191792.06
2,-0.717741,1.457738,-0.717741,1.754364,-0.728257,1.626528,191050.39
3,-0.717741,-0.685994,1.393261,1.554784,-0.096365,1.42221,182901.99
4,-0.717741,1.457738,-0.717741,1.504937,-1.079919,1.281528,166187.94


In [256]:
# we need to fit the model according to Xtrain....idk exactly why... i think it can done using ytest also or data of X also
X = dfx.iloc[:,:6]
Y = dfx.iloc[:,6]
tr_x,ts_x,tr_y,ts_y = train_test_split(X,Y, test_size = 0.2, random_state = 13)

In [257]:
#tr_x = tr_x.values
#ts_x = ts_x.values

In [258]:
tr_x[:5]

Unnamed: 0,State_California,State_Florida,State_New York,R&D Spend,Administration,Marketing Spend
27,-0.717741,-0.685994,1.393261,-0.035519,0.235069,1.174271
23,-0.717741,1.457738,-0.717741,-0.136201,-0.562211,0.774349
39,1.393261,-0.685994,-0.717741,-0.77382,-1.383122,-0.297583
17,-0.717741,-0.685994,1.393261,0.46072,0.855666,0.591017
15,-0.717741,-0.685994,1.393261,0.897913,0.045868,0.419219


In [259]:
########          applying NN
NN = MLPRegressor(hidden_layer_sizes=(15,15))

In [260]:
M1 = NN.fit(tr_x,tr_y)
pred_tr = M1.predict(tr_x)
pred_ts = M1.predict(ts_x)



In [261]:
#from sklearn.metrics import mean_squared_error
#rms = mean_squared_error(y_actual, y_predicted, squared=False)
#sklearn.metrics has a mean_squared_error function with a squared kwarg (defaults to True).
#Setting squared to False will return the RMSE.

In [262]:
##### train accuracy
print("Corr Coeffecient is ",np.corrcoef(tr_y,pred_tr)[0][1])
print("RMSE value is ",mse(tr_y,pred_tr,squared=False))


Corr Coeffecient is  0.5329017652775743
RMSE value is  117067.13699184047


In [263]:
###### test accuracy
print("Corr Coeffecient is ",np.corrcoef(ts_y,pred_ts)[0][1])
print("RMSE value is ",mse(ts_y,pred_ts,squared = False))

Corr Coeffecient is  0.5197119907341585
RMSE value is  125902.23041907547


-------------------------------------------------

#### Lil bit of paramter tuning

In [271]:
NN2 = MLPRegressor(hidden_layer_sizes=(150,100,50), max_iter=300,solver='adam',random_state=1)

In [275]:
M2 = NN2.fit(tr_x,tr_y)
pred_tr2 = NN2.predict(tr_x)
pred_ts2 = NN2.predict(ts_x)



In [276]:
##### train accuracy
print("Corr Coeffecient is ",np.corrcoef(tr_y,pred_tr2)[0][1])
print("RMSE value is ",mse(tr_y,pred_tr2,squared=False))


Corr Coeffecient is  0.865613443884024
RMSE value is  40495.44555405077


In [279]:
###### test accuracy
print("Corr Coeffecient is ",np.corrcoef(ts_y,pred_ts2)[0][1])
print("RMSE value is ",mse(ts_y,pred_ts2,squared = False))
print(r2_score(pred_ts2,ts_y))

Corr Coeffecient is  0.8169908134150529
RMSE value is  44351.10539135543
-0.11166926450337322
