In [1]:
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Dropout
import io
import os
from sklearn import metrics
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
#import a dataset
df = pd.read_csv(r"C:\Users\HP\Desktop\deep data\jh-simple-dataset.csv", na_values=["NA", "?"])
display(df)

Unnamed: 0,id,job,area,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,retail_dense,crime,product
0,1,vv,c,50876.0,13.100000,1,9.017895,35,11.738935,49,0.885827,0.492126,0.071100,b
1,2,kd,c,60369.0,18.625000,2,7.766643,59,6.805396,51,0.874016,0.342520,0.400809,c
2,3,pe,c,55126.0,34.766667,1,3.632069,6,13.671772,44,0.944882,0.724409,0.207723,b
3,4,11,c,51690.0,15.808333,1,5.372942,16,4.333286,50,0.889764,0.444882,0.361216,b
4,5,kl,d,28347.0,40.941667,3,3.822477,20,5.967121,38,0.744094,0.661417,0.068033,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1996,vv,c,51017.0,38.233333,1,5.454545,34,14.013489,41,0.881890,0.744094,0.104838,b
1996,1997,kl,d,26576.0,33.358333,2,3.632069,20,8.380497,38,0.944882,0.877953,0.063851,a
1997,1998,kl,d,28595.0,39.425000,3,7.168218,99,4.626950,36,0.759843,0.744094,0.098703,f
1998,1999,qp,c,67949.0,5.733333,0,8.936292,26,3.281439,46,0.909449,0.598425,0.117803,c


In [3]:
#checking for dataset details
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              2000 non-null   int64  
 1   job             2000 non-null   object 
 2   area            2000 non-null   object 
 3   income          1941 non-null   float64
 4   aspect          2000 non-null   float64
 5   subscriptions   2000 non-null   int64  
 6   dist_healthy    2000 non-null   float64
 7   save_rate       2000 non-null   int64  
 8   dist_unhealthy  2000 non-null   float64
 9   age             2000 non-null   int64  
 10  pop_dense       2000 non-null   float64
 11  retail_dense    2000 non-null   float64
 12  crime           2000 non-null   float64
 13  product         2000 non-null   object 
dtypes: float64(7), int64(4), object(3)
memory usage: 218.9+ KB


In [4]:
#checking for missing data
df.isnull().sum()

id                 0
job                0
area               0
income            59
aspect             0
subscriptions      0
dist_healthy       0
save_rate          0
dist_unhealthy     0
age                0
pop_dense          0
retail_dense       0
crime              0
product            0
dtype: int64

In [5]:
df = pd.concat([df,pd.get_dummies(df["job"],dtype=int,prefix="job")],axis=1)
df.drop('job', axis=1, inplace=True)

# Generating dummies for area
df = pd.concat([df,pd.get_dummies(df['area'],dtype=int,prefix="area")],axis=1)
df.drop('area', axis=1, inplace=True)

# Generating dummies for product
df = pd.concat([df,pd.get_dummies(df['product'], dtype=int, prefix="product")],axis=1)
df.drop('product', axis=1, inplace=True)

# filling the missing values for income using median
df['income'] = df['income'].fillna(df['income'].median())

In [6]:
display(df)

Unnamed: 0,id,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,retail_dense,...,area_b,area_c,area_d,product_a,product_b,product_c,product_d,product_e,product_f,product_g
0,1,50876.0,13.100000,1,9.017895,35,11.738935,49,0.885827,0.492126,...,0,1,0,0,1,0,0,0,0,0
1,2,60369.0,18.625000,2,7.766643,59,6.805396,51,0.874016,0.342520,...,0,1,0,0,0,1,0,0,0,0
2,3,55126.0,34.766667,1,3.632069,6,13.671772,44,0.944882,0.724409,...,0,1,0,0,1,0,0,0,0,0
3,4,51690.0,15.808333,1,5.372942,16,4.333286,50,0.889764,0.444882,...,0,1,0,0,1,0,0,0,0,0
4,5,28347.0,40.941667,3,3.822477,20,5.967121,38,0.744094,0.661417,...,0,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1996,51017.0,38.233333,1,5.454545,34,14.013489,41,0.881890,0.744094,...,0,1,0,0,1,0,0,0,0,0
1996,1997,26576.0,33.358333,2,3.632069,20,8.380497,38,0.944882,0.877953,...,0,0,1,1,0,0,0,0,0,0
1997,1998,28595.0,39.425000,3,7.168218,99,4.626950,36,0.759843,0.744094,...,0,0,1,0,0,0,0,0,1,0
1998,1999,67949.0,5.733333,0,8.936292,26,3.281439,46,0.909449,0.598425,...,0,1,0,0,0,1,0,0,0,0


In [7]:
from scipy.stats import zscore
# Standardize ranges
df['income'] = zscore(df['income'])
df['aspect'] = zscore(df['aspect'])
df['save_rate'] = zscore(df['save_rate'])
df['subscriptions'] = zscore(df['subscriptions'])

In [8]:
#checking the columns
df.columns

Index(['id', 'income', 'aspect', 'subscriptions', 'dist_healthy', 'save_rate',
       'dist_unhealthy', 'age', 'pop_dense', 'retail_dense', 'crime', 'job_11',
       'job_al', 'job_am', 'job_ax', 'job_bf', 'job_by', 'job_cv', 'job_de',
       'job_dz', 'job_e2', 'job_f8', 'job_gj', 'job_gv', 'job_kd', 'job_ke',
       'job_kl', 'job_kp', 'job_ks', 'job_kw', 'job_mm', 'job_nb', 'job_nn',
       'job_ob', 'job_pe', 'job_po', 'job_pq', 'job_pz', 'job_qp', 'job_qw',
       'job_rn', 'job_sa', 'job_vv', 'job_zz', 'area_a', 'area_b', 'area_c',
       'area_d', 'product_a', 'product_b', 'product_c', 'product_d',
       'product_e', 'product_f', 'product_g'],
      dtype='object')

In [9]:
# Converting to numpy 
x_columns = df.columns.drop('age').drop('id')
x = df[x_columns].values
y = df['age'].values

In [10]:
#importing module
from sklearn.model_selection import train_test_split
# train/test
x_train, x_test, y_train, y_test = train_test_split(    
    x, y, test_size=0.25, random_state=42)

In [11]:
model = Sequential()
# First hidden layer with 50 units and ReLU activation
model.add(Dense(25, input_dim=x.shape[1], activation='relu'))

# Second hidden layer with 25 units and ReLU activation
model.add(Dense(10, activation='relu'))

# Output layer 
model.add(Dense(1))

In [14]:
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, 
                        patience=5, verbose=2, mode='auto', 
                        restore_best_weights=True)
model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],
          verbose=2,epochs=100)


Epoch 1/100
47/47 - 3s - loss: 0.4966 - accuracy: 0.0000e+00 - val_loss: 0.6038 - val_accuracy: 0.0000e+00 - 3s/epoch - 58ms/step
Epoch 2/100
47/47 - 0s - loss: 0.4096 - accuracy: 0.0000e+00 - val_loss: 0.6133 - val_accuracy: 0.0000e+00 - 377ms/epoch - 8ms/step
Epoch 3/100
47/47 - 0s - loss: 0.4144 - accuracy: 0.0000e+00 - val_loss: 0.6772 - val_accuracy: 0.0000e+00 - 351ms/epoch - 7ms/step
Epoch 4/100
47/47 - 0s - loss: 0.4049 - accuracy: 0.0000e+00 - val_loss: 0.6044 - val_accuracy: 0.0000e+00 - 345ms/epoch - 7ms/step
Epoch 5/100
47/47 - 0s - loss: 0.4000 - accuracy: 0.0000e+00 - val_loss: 0.6020 - val_accuracy: 0.0000e+00 - 359ms/epoch - 8ms/step
Epoch 6/100
47/47 - 0s - loss: 0.3932 - accuracy: 0.0000e+00 - val_loss: 0.6048 - val_accuracy: 0.0000e+00 - 351ms/epoch - 7ms/step
Epoch 7/100
47/47 - 0s - loss: 0.3997 - accuracy: 0.0000e+00 - val_loss: 0.6493 - val_accuracy: 0.0000e+00 - 346ms/epoch - 7ms/step
Epoch 8/100
47/47 - 0s - loss: 0.4097 - accuracy: 0.0000e+00 - val_loss: 0.589

<keras.callbacks.History at 0x27fa401e700>

In [15]:
model.evaluate(x_train,y_train)



[0.3539751470088959, 0.0]

In [16]:
from sklearn import metrics

# Predict
pred = model.predict(x_test)

# Measure MSE error.  
score = metrics.mean_squared_error(pred,y_test)
print("Final score (MSE): {}".format(score))

Final score (MSE): 0.5709116784493817
