# @author: Roshan Dumaru

In [3]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import cv2
from keras import Sequential
from keras.layers import Dense,Conv2D,Flatten,Dropout,MaxPooling2D,Activation,BatchNormalization
from keras.preprocessing.image import img_to_array
from keras.utils.np_utils import to_categorical

# Loading CSV file

In [4]:
data = pd.concat(map(pd.read_csv,['dataset/dataset2.csv','dataset/dataset3.csv']))
data.tail()

Unnamed: 0,age,gender,pixels
23693,9,1,255 254 253 253 251 240 175 83 71 81 61 73 126...
23694,9,1,3 2 0 1 4 2 1 4 4 3 2 0 1 3 5 15 35 50 62 75 8...
23695,9,1,210 206 206 200 132 57 70 116 110 116 128 154 ...
23696,9,1,8 8 8 10 11 17 30 39 54 73 89 102 113 127 141 ...
23697,9,1,193 158 120 89 63 74 72 61 73 91 108 127 137 1...


In [7]:
#convert pixels into numpy array
data['pixels']=data['pixels'].apply(lambda x:  np.array(x.split(), dtype="float32"))
data.head()

Unnamed: 0,age,gender,pixels
0,100,0,"[96.0, 96.0, 99.0, 100.0, 112.0, 139.0, 142.0,..."
1,100,0,"[157.0, 156.0, 158.0, 161.0, 163.0, 163.0, 167..."
2,100,1,"[230.0, 194.0, 40.0, 28.0, 37.0, 28.0, 12.0, 6..."
3,100,1,"[156.0, 175.0, 171.0, 151.0, 138.0, 141.0, 140..."
4,100,1,"[212.0, 177.0, 172.0, 162.0, 172.0, 182.0, 173..."


In [8]:
# normalizing pixels data
data['pixels'] = data['pixels'].apply(lambda x: x/255)
data.head()

Unnamed: 0,age,gender,pixels
0,100,0,"[0.3764706, 0.3764706, 0.3882353, 0.39215687, ..."
1,100,0,"[0.6156863, 0.6117647, 0.61960787, 0.6313726, ..."
2,100,1,"[0.9019608, 0.7607843, 0.15686275, 0.10980392,..."
3,100,1,"[0.6117647, 0.6862745, 0.67058825, 0.5921569, ..."
4,100,1,"[0.83137256, 0.69411767, 0.6745098, 0.63529414..."


In [9]:
def age_grouper(x):
    if x>=0 and x < 12:
        return 0
    elif x>=12 and x < 24:
        return 1
    elif x>=24 and x < 36:
        return 2
    elif x>=36 and x < 48:
        return 3
    elif x>=48 and x < 60:
        return 4
    else:
        return 5

# convert age to group, (age can be any number)
data['label'] = data['age'].apply(age_grouper)
data.tail()

# data.age_group.unique()

Unnamed: 0,age,gender,pixels,label
23693,9,1,"[1.0, 0.99607843, 0.99215686, 0.99215686, 0.98...",0
23694,9,1,"[0.011764706, 0.007843138, 0.0, 0.003921569, 0...",0
23695,9,1,"[0.8235294, 0.80784315, 0.80784315, 0.78431374...",0
23696,9,1,"[0.03137255, 0.03137255, 0.03137255, 0.0392156...",0
23697,9,1,"[0.75686276, 0.61960787, 0.47058824, 0.3490196...",0


In [10]:
y = data['label'].values
print(y[1].ndim,y.shape,y.size)

0 (47396,) 47396


# Dividing actual data (X) and label data (y)

In [13]:
X = np.array(data['pixels'].tolist())
#data are already in 2d ie [[1st data .......]
#                           [2nd data .......]
#                           [.....]
#                           [last data ......]]

## Converting pixels from 2D to 4D
# X = X.reshape(X.shape[0],64,64,1)
print(X.ndim)

2


In [12]:
print(X[70].ndim,X[70].size,X[70].shape)

1 4096 (4096,)


In [14]:
# y = data['age_group']

X_train, X_test, y_train, y_test = train_test_split ( X, y, test_size=0.20, stratify=y)


In [15]:
print(X_train.ndim,y_train.ndim,X_test.ndim,y_test.ndim)

2 1 2 1


Stratified sampling is a sampling technique where the samples are selected in the same proportion (by dividing the population into groups called 'strata' based on a characteristic) as they appear in the population.

# Lgbm 

In [16]:
import lightgbm as lgb

In [17]:
d_train = lgb.Dataset(X_train,label=y_train)
parameters = {'learning_rate':0.1,'boosting_type':'gbdt','objective':'multiclass','metric':['auc','binary_logloss'],'num_leaves':50,'max_depth':10}

try 'dart' accurate way of boosting 

below 50 is the number of iteration

In [20]:
start = datetime.now()
clf = lgb.train(parameters,d_train,50,verbose_eval=2)
stop = datetime.now()
execution_time_lgbm = stop-start



[LightGBM] [Info] Number of positive: 0, number of negative: 37916
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 997515
[LightGBM] [Info] Number of data points in the train set: 37916, number of used features: 4096
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000000 -> initscore=-34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000000 -> initscore=-34.538776


In [None]:
clf.save('AgeModel/age64_acc_74_val_acc_63.h5')

In [None]:
print("LGBM execution time is : ", execution_time_lgbm)

# Prediction on test data

In [None]:
y_pred_lgbm = clf.predict(X_test)

# testing
Setting threshold

In [None]:
for i in range (0,X_test.shape[0]):
    if y_pred_lgbm[i]>=.5:
        y_pred_lgbm[i]=1
    else:
        y_pred_lgbm[i]=0

In [None]:
cm_lgbm = confusion_matrix(y_test, y_pred_lgbm)
sns.heatmap(cm_lgbm,annot=True)

In [None]:
print("LGBM Execution time",execution_time_lgbm)
print("Accuracy with LGBM",metrics.accuracy_score(y_pred_lgbm,y_test))
print("AUC score with LGBM",metrics.roc_auc_score(y_pred_lgbm,y_test))