In [1]:
# Core data stack used throughout the notebook.
import pandas as pd
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns",None)

import numpy as np

import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

import seaborn as sns

# joblib.load restores the fitted artefacts saved elsewhere:
# the one-hot encoder, the scalers and the feature-selection mask.
from joblib import load #loading encoder,scaler and selected_features

In [2]:
# Load the raw test split from a CSV file located next to the notebook.
test_dataset=pd.read_csv("row_test_dataset.csv")

In [3]:
# Display the raw frame as loaded (pandas elides the middle rows).
test_dataset

Unnamed: 0,Id,Name,Short description,Gender,Country,Occupation,Birth year,Death year,Manner of death,Age of death
0,Q21536670,David Alexander Craig,(1887-1950) businessman,Male,,Businessperson,1887,1950.0,,63.0
1,Q1418096,Kurt Stössel,German association football player (1907-1978),Male,Germany,Athlete,1907,1978.0,,71.0
2,Q2383112,Peter Warren Dease,Canadian explorer,Male,Canada,Explorer,1788,1863.0,,75.0
3,Q3085138,François Leconte,French sailor,Male,France,Military personnel,1791,1872.0,,81.0
4,Q10947958,Wanyan Dumu,,,,,1090,1129.0,,39.0
...,...,...,...,...,...,...,...,...,...,...
122296,Q52156231,Karl Felix Marx,tekenleraar,Male,,,1877,1955.0,,78.0
122297,Q879352,Bjarne Brustad,Norwegian musician,Male,Norway,Artist,1895,1978.0,,83.0
122298,Q5342805,Edward Everett Cox,"newspaper publisher, Democrat",Male,United States of America,Publisher,1867,1931.0,,64.0
122299,Q4983880,Madeleine Uggla,Swedish musician,Female,Sweden,Artist,1920,2018.0,,98.0


### Feature Engineering

The function below applies four steps to the dataset:

1) Handle missing values

2) Handle rare categories in categorical features

3) Create the "Century" column

4) Drop unwanted columns

In [4]:

def feature_eng(data,time_slot=100,rare_threshold=0.01):
    """Clean ``data`` in place and derive the columns the model consumes.

    Steps, in order:

    1. Drop the identifier / free-text columns ``Id``, ``Name`` and
       ``Short description``.
    2. Replace NaN in text (categorical) columns with the label ``"Missing"``.
    3. Drop every row that has NaN in a non-text column.
    4. In each text column, relabel values whose share of the remaining rows
       is not strictly greater than ``rare_threshold`` as ``"Rare_var"``.
    5. Add a ``Century`` column and drop ``Birth year``.

    ``Century`` is the start year of a ``time_slot``-wide period. The birth
    period is used when the number of years from birth to the end of that
    period exceeds the number of years lived inside the death period;
    otherwise the death period is used.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Id``, ``Name``, ``Short description``, ``Birth year``
        and ``Death year``. **The frame is modified in place.**
    time_slot : int, default 100
        Width, in years, of the period used for ``Century``.
    rare_threshold : float, default 0.01
        Minimum share of rows (exclusive) a category needs to keep its label.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after modification.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """

    def _is_text(dtype):
        # Object dtype is what the original code checked for; pandas'
        # dedicated string dtypes are accepted too, so string columns are
        # not silently skipped when they are not stored as object.
        return dtype=="O" or pd.api.types.is_string_dtype(dtype)

    # These columns are never used as features. Removing them first avoids
    # frequency-counting high-cardinality columns that are discarded anyway.
    data.drop(["Id","Name","Short description"],axis=1,inplace=True)

    has_nan=data.isnull().any()

    # Missing categories become their own explicit label.
    categorical_nan=[col for col,dtype in data.dtypes.items() if _is_text(dtype) and has_nan[col]]
    if categorical_nan:
        data[categorical_nan]=data[categorical_nan].fillna("Missing")

    # Rows with missing numeric values are dropped rather than imputed.
    numerical_nan=[col for col,dtype in data.dtypes.items() if not _is_text(dtype) and has_nan[col]]
    if numerical_nan:
        data.dropna(subset=numerical_nan,inplace=True)

    # Group infrequent categories under a single label.
    # Frequencies are computed from `data` itself.
    # NOTE(review): when this runs on a test split, the set of labels that
    # ends up as "Rare_var" may differ from the training split - confirm this
    # matches the categories the saved encoder was fitted on.
    categorical_features=[col for col,dtype in data.dtypes.items() if _is_text(dtype)]
    for feature in categorical_features:
        share=data[feature].value_counts(normalize=True)
        frequent_labels=share[share>rare_threshold].index
        data[feature]=np.where(data[feature].isin(frequent_labels),data[feature],"Rare_var")

    # Derive "Century"; both branches use the same period width.
    birth=data["Birth year"]
    death=data["Death year"]
    birth_period=np.floor(birth/time_slot)*time_slot
    death_period=np.floor(death/time_slot)*time_slot
    years_left_in_birth_period=np.ceil(birth/time_slot)*time_slot-birth
    years_into_death_period=death-death_period
    data["Century"]=np.where(years_left_in_birth_period>years_into_death_period,
                             birth_period,death_period)

    # Birth year was only needed to derive Century.
    data.drop(["Birth year"],axis=1,inplace=True)

    return data


In [5]:
# feature_eng modifies its argument in place and returns that same object,
# so after this call test_dataset and feature_eng_test_dataset are two names
# for one transformed frame; the raw columns are no longer available here.
feature_eng_test_dataset=feature_eng(test_dataset)

In [6]:
# Inspect the engineered frame: text columns filled/grouped, "Century" added,
# identifier columns and "Birth year" removed.
feature_eng_test_dataset

Unnamed: 0,Gender,Country,Occupation,Death year,Manner of death,Age of death,Century
0,Male,Missing,Businessperson,1950.0,Missing,63.0,1900.0
1,Male,Germany,Athlete,1978.0,Missing,71.0,1900.0
2,Male,Canada,Rare_var,1863.0,Missing,75.0,1800.0
3,Male,France,Military personnel,1872.0,Missing,81.0,1800.0
4,Missing,Missing,Missing,1129.0,Missing,39.0,1100.0
...,...,...,...,...,...,...,...
122296,Male,Missing,Missing,1955.0,Missing,78.0,1900.0
122297,Male,Norway,Artist,1978.0,Missing,83.0,1900.0
122298,Male,United States of America,Rare_var,1931.0,Missing,64.0,1800.0
122299,Female,Sweden,Artist,2018.0,Missing,98.0,1900.0


### One-hot encoding of categorical features

In [7]:
from sklearn.preprocessing import OneHotEncoder
from joblib import dump,load

# Restore the already-fitted encoder; it is only applied here, never refitted.
onehot_encoder=load("onehot_encoder.joblib")

# Names of the object-dtype (categorical) columns, in frame order.
categorical_features=[
    column
    for column,dtype in feature_eng_test_dataset.dtypes.items()
    if dtype=="O"
]

# Encode the categorical columns; .toarray() below densifies the result.
encoded_array=onehot_encoder.transform(
    feature_eng_test_dataset[categorical_features]
)

# Wrap the dense matrix in a DataFrame that shares the source row index,
# so the column-wise concat below lines rows up by label.
encoded_categorical_cols=pd.DataFrame(
    data=encoded_array.toarray(),
    index=feature_eng_test_dataset.index,
)

# Put the encoded columns next to the engineered test frame ...
temp=pd.concat(
    objs=[feature_eng_test_dataset,encoded_categorical_cols],
    axis="columns",
)

# ... and remove the original text columns, leaving numeric data only.
encoded_test_dataset=temp.drop(columns=categorical_features)




In [8]:
# Inspect the encoded frame: the numeric columns followed by the one-hot
# columns, which carry integer labels.
encoded_test_dataset

Unnamed: 0,Death year,Age of death,Century,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40
0,1950.0,63.0,1900.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1978.0,71.0,1900.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1863.0,75.0,1800.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1872.0,81.0,1800.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1129.0,39.0,1100.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122296,1955.0,78.0,1900.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
122297,1978.0,83.0,1900.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
122298,1931.0,64.0,1800.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
122299,2018.0,98.0,1900.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Scaling

In [12]:
from sklearn.preprocessing import MinMaxScaler

# Restore the fitted scalers: one for the feature columns, one for the label.
scaler_feature=load("MinMaxScaler_features.joblib")
scaler_label=load("MinMaxScaler_labels.joblib")

# Every column except the label is a feature.
features=[feature for feature in encoded_test_dataset.columns if feature!="Age of death"]

# Scale into a fresh frame and leave encoded_test_dataset untouched, so this
# cell yields the same result however many times it is executed. The earlier
# version overwrote encoded_test_dataset in place, so each re-run scaled
# already-scaled values again; the constant "Death year"/"Century" values in
# the saved output are consistent with that.
scaled_test_dataset=encoded_test_dataset.copy()
scaled_test_dataset[features]=scaler_feature.transform(encoded_test_dataset[features])

# The label scaler expects a 2-D column; flatten the result back to 1-D
# before storing it as a single column.
scaled_test_dataset["Age of death"]=scaler_label.transform(
    np.array(encoded_test_dataset["Age of death"]).reshape(-1,1)
).ravel()

In [15]:
# Inspect the scaled frame.
# NOTE(review): in the saved output "Death year" and "Century" show the same
# value on every displayed row although the inputs differ - consistent with
# the scaling cell having been executed more than once on the same frame.
# Re-check after a fresh Restart & Run All.
scaled_test_dataset

Unnamed: 0,Death year,Age of death,Century,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40
0,0.568284,0.372781,0.57459,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.568284,0.420118,0.57459,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.568284,0.443787,0.57459,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.568284,0.479290,0.57459,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.568284,0.230769,0.57459,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122296,0.568284,0.461538,0.57459,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
122297,0.568284,0.491124,0.57459,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
122298,0.568284,0.378698,0.57459,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
122299,0.568284,0.579882,0.57459,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [16]:
# Split the scaled test frame into the label column and the feature matrix.

test_target=scaled_test_dataset.loc[:,"Age of death"]

test_data=scaled_test_dataset.drop(columns=["Age of death"])


In [17]:
# Inspect the feature matrix (label column removed).
test_data

Unnamed: 0,Death year,Century,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40
0,0.568284,0.57459,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.568284,0.57459,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.568284,0.57459,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.568284,0.57459,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.568284,0.57459,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122296,0.568284,0.57459,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
122297,0.568284,0.57459,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
122298,0.568284,0.57459,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
122299,0.568284,0.57459,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Feature Selection

In [18]:
# Load the saved feature-selection mask; it is used below as a boolean
# indexer over test_data's columns.
selected_feature_matrices=load("selected_features.joblib")

In [19]:
# Inspect the mask: True marks a column that is kept.
selected_feature_matrices

array([False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False,  True, False, False, False, False, False,  True, False,
       False, False, False, False, False,  True,  True, False, False,
       False, False,  True, False,  True,  True, False])

In [20]:
# Translate the boolean mask into the labels of the retained columns.
# The mask is applied positionally, so test_data must still have its full
# set of columns in the original order at this point.
selected_features=test_data.columns[selected_feature_matrices]

In [21]:
# Inspect the labels of the retained columns.
selected_features

Index([0, 14, 17, 23, 30, 31, 36, 38, 39], dtype='object')

In [22]:
# Final model input: keep only the selected columns.
# NOTE: this rebinds test_data to the reduced frame, so the cell that applies
# the mask to test_data.columns cannot be re-run afterwards without first
# rebuilding test_data (the mask would no longer match the column count).
test_data=test_data[selected_features]

In [23]:
# Inspect the reduced feature matrix that is fed to the model.
test_data

Unnamed: 0,0,14,17,23,30,31,36,38,39
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
122296,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
122297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
122298,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
122299,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Trained model

In [24]:
from keras.models import Sequential
from keras.layers import Dense,Dropout

# Rebuild the network layer by layer so the saved weights can be loaded into
# it afterwards: three hidden ReLU layers (64, 128, 64 units) and one ReLU
# output unit. The input width is the number of selected feature columns.
model=Sequential([
    Dense(64,input_dim=test_data.shape[1],activation="relu"),
    Dense(128,activation="relu"),
    Dense(64,activation="relu"),
    Dense(1,activation="relu"),
])

# Loss is mean squared error; mean absolute error is reported as a metric.
model.compile(optimizer="sgd",loss="mse",metrics=["mae"])
model.summary()

Using TensorFlow backend.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 64)                640       
_________________________________________________________________
dense_2 (Dense)              (None, 128)               8320      
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 17,281
Trainable params: 17,281
Non-trainable params: 0
_________________________________________________________________


2022-07-09 16:34:06.807733: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-09 16:34:06.808695: I tensorflow/core/common_runtime/process_util.cc:115] Creating new thread pool with default inter op setting: 8. Tune using inter_op_parallelism_threads for best performance.


In [25]:
# Load the trained weights from disk into the network built above.
# NOTE(review): presumably the file was saved from a model with this exact
# architecture - confirm against the training notebook.
model.load_weights("age_dataset_weights.h5")


In [26]:
# Evaluate on the test split. The returned values follow the compile order:
# the loss (mse) first, then the metric (mae). Both are in the scaled label
# space because test_target was scaled before evaluation.
list_of_scalers=model.evaluate(test_data,test_target)

for metric_name,metric_value in zip(("mse","mae"),list_of_scalers):
    print(metric_name,metric_value)


mse 0.009180315347781732
mae 0.07639523595571518


In [27]:
# Predict on the test features. The outputs are in the scaled label space
# and are converted back to years in the inverse-scaling step below.
scaled_prediction=model.predict(test_data)

In [28]:
# Inspect the raw (scaled) predictions: one row per test sample.
scaled_prediction

array([[0.3967547 ],
       [0.41672808],
       [0.41672808],
       ...,
       [0.42339188],
       [0.42903233],
       [0.41672808]], dtype=float32)

### Inverse scaling

In [30]:
# Undo the label scaling so predictions are in the original units
# of "Age of death".
predictions=scaler_label.inverse_transform(scaled_prediction)

In [31]:
# Inspect the predictions after inverse scaling.
predictions

array([[67.051544],
       [70.42705 ],
       [70.42705 ],
       ...,
       [71.55323 ],
       [72.50646 ],
       [70.42705 ]], dtype=float32)

### Create and save test dataset with predicted column

In [34]:
# Start from the frame returned by feature_eng instead of relying on
# test_dataset having been modified in place by that call; this makes the
# lineage explicit and guarantees the rows are the ones that were scored.
dataset_with_predictions=feature_eng_test_dataset.copy()

# predictions has shape (n, 1): flatten it and attach it by index label, so
# each prediction lands on the row whose features produced it.
dataset_with_predictions["Predicted Age of death"]=pd.Series(
    np.asarray(predictions).ravel(),
    index=test_data.index,
)

In [35]:
# Inspect the final frame: engineered columns plus the prediction column.
dataset_with_predictions

Unnamed: 0,Gender,Country,Occupation,Death year,Manner of death,Age of death,Century,Predicted Age of death
0,Male,Missing,Businessperson,1950.0,Missing,63.0,1900.0,67.051544
1,Male,Germany,Athlete,1978.0,Missing,71.0,1900.0,70.427048
2,Male,Canada,Rare_var,1863.0,Missing,75.0,1800.0,70.427048
3,Male,France,Military personnel,1872.0,Missing,81.0,1800.0,66.461754
4,Missing,Missing,Missing,1129.0,Missing,39.0,1100.0,64.317131
...,...,...,...,...,...,...,...,...
122296,Male,Missing,Missing,1955.0,Missing,78.0,1900.0,64.317131
122297,Male,Norway,Artist,1978.0,Missing,83.0,1900.0,70.427048
122298,Male,United States of America,Rare_var,1931.0,Missing,64.0,1800.0,71.553230
122299,Female,Sweden,Artist,2018.0,Missing,98.0,1900.0,72.506462


In [36]:
# Persist the result next to the notebook. The row index is written as the
# first column because to_csv keeps the index by default.
dataset_with_predictions.to_csv("dataset_with_predictions.csv")