# Age of Death

## Data Processing

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn.svm import LinearSVC

from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import cross_val_score

# Seed passed to train_test_split so the train/test partition is the same on every run.
RANDOM_STATE = 666666

In [2]:
# Load the data set, discard rows whose target ('Age of Death') is missing,
# then separate the four predictor columns (X) from the target column (Y).

feature_columns = ['Gender', 'Occupation', 'Country', 'Birth Year']
target_column = 'Age of Death'

df = pd.read_csv('Data/outputData.csv', sep=',').dropna(subset=[target_column])
X = df[feature_columns]
Y = df[target_column]

In [3]:
# LabelEncoder

# Replace every feature column by integer class codes. The frame is transposed
# once so that each row of `feature_rows` holds all values of one feature; the
# encoder is refitted per feature and the result is transposed back so that
# X_LE ends up with one row per sample and one column per feature.
LE = preprocessing.LabelEncoder()
feature_rows = X.T.to_numpy()
X_LE = np.zeros(feature_rows.shape)
for row_idx, feature_values in enumerate(feature_rows):
    X_LE[row_idx] = LE.fit_transform(feature_values)
X_LE = X_LE.T

# Bin the target into 5-year classes labelled by the lower edge of the bin:
# ages below 10 share label 5, ages of 115 or more share label 115.
#
# Fixes relative to the previous loop over range(10, 110, 5):
#   * ages in [110, 115) fell between the last loop bin [105, 110) and the
#     ">= 115" cap, so they were left as raw values; they now get label 110.
#   * the floor division allocates a new array, so the 'Age of Death' column
#     of df is no longer overwritten through the array returned by
#     Y.to_numpy() (which may be a view, and may be read-only).
Y_np = np.clip((Y.to_numpy() // 5) * 5, 5, 115)

In [4]:
# Standardization (min-max scaling of every encoded feature to [0, 1])

# Column-wise minimum and range of the encoded features (one value per feature).
col_min = X_LE.min(axis=0)
col_range = X_LE.max(axis=0) - col_min
# A feature with a single distinct value has zero range; divide by 1 instead so
# the column becomes all 0.0 rather than NaN from a 0/0 division.
col_range[col_range == 0] = 1
X_S = (X_LE - col_min) / col_range

In [5]:
# Hold out 20% of the samples as a test set. Features are the scaled matrix,
# labels are the age bins cast to integers; the split is seeded by RANDOM_STATE.

x, y = X_S, Y_np.astype(int)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

## Model

### SVM

In [6]:
# Fitting Linear SVC model

svc_lin = LinearSVC(C = 100, dual = False)
%time svc_lin.fit(x_train, y_train)

# Return the mean accuracy on the given test data and labels.

svc_lin.score(x_test, y_test)

CPU times: total: 1.05 s
Wall time: 1.05 s


0.29712993376770236

In [7]:
# Fitting SVC (kernel='rbf') model

svc_rbf = SVC(kernel =  "rbf", gamma = 2)
%time svc_rbf.fit(x_train, y_train)

# Return the mean accuracy on the given test data and labels.

svc_rbf.score(x_test, y_test)

CPU times: total: 1min 12s
Wall time: 1min 11s


0.6145603360077541

In [8]:
# Fitting SVC (kernel="poly") model

svc_poly = SVC(kernel = "poly", C = 100, degree = 2)
%time svc_poly.fit(x_train, y_train)

# Return the mean accuracy on the given test data and labels.

svc_poly.score(x_test, y_test)

CPU times: total: 6min 42s
Wall time: 6min 42s


0.618383501157719

### KNN

In [9]:
# Fitting KNN model

kNN_classifier = KNeighborsClassifier(n_neighbors  = 100)
%time kNN_classifier.fit(x_train, y_train)

# Return the mean accuracy on the given test data and labels.

kNN_classifier.score(x_test, y_test)

CPU times: total: 93.8 ms
Wall time: 93 ms


0.5489203597006085

### Logistic Regression

In [10]:
# Fitting Logistic Regression model

log_reg = LogisticRegression(solver = 'lbfgs', max_iter = 1e6)
%time log_reg.fit(x_train, y_train)

# Return the mean accuracy on the given test data and labels.

log_reg.score(x_test, y_test)

CPU times: total: 3min 51s
Wall time: 28.9 s


0.5023423617468096

### Voting Classifier

In [11]:
# Fitting Hard Voting Classifier model

hard_voting_clf = VotingClassifier(estimators=[
    ('svc_lin', LinearSVC(C = 100, dual = False)),
    ('svc_rbf', SVC(kernel =  "rbf", gamma = 2)),
    ('svc_poly', SVC(kernel = "poly", C = 100, degree = 2)),
    ('knn_clf', KNeighborsClassifier(n_neighbors  = 100)),
    ('log_reg', LogisticRegression(solver = 'lbfgs', max_iter = 1e6))],
                             voting='hard')

# Return the mean accuracy on the given test data and labels.

%time hard_voting_clf.fit(x_train, y_train)
hard_voting_clf.score(x_test, y_test)

CPU times: total: 11min 45s
Wall time: 8min 21s


0.6111141026331377

## Cross Validation

In [12]:
%time scores_svc_lin = cross_val_score(svc_lin, x, y, cv = 4)
scores_svc_lin_mean = scores_svc_lin.mean()
%time scores_svc_rbf = cross_val_score(svc_rbf, x, y, cv = 4)
scores_svc_rbf_mean = scores_svc_rbf.mean()
%time scores_svc_poly = cross_val_score(svc_poly, x, y, cv = 4)
scores_svc_poly_mean = scores_svc_poly.mean()
%time scores_kNN = cross_val_score(kNN_classifier, x, y, cv = 4)
scores_kNN_mean = scores_kNN.mean()
%time scores_log_reg = cross_val_score(log_reg, x, y, cv = 4)
scores_log_reg_mean = scores_log_reg.mean()
%time scores_hard_voting_clf = cross_val_score(hard_voting_clf, x, y, cv = 4)
scores_hard_voting_clf_mean = scores_hard_voting_clf.mean()

CPU times: total: 8.27 s
Wall time: 3.13 s
CPU times: total: 12min 33s
Wall time: 12min 31s
CPU times: total: 27min 3s
Wall time: 27min 3s
CPU times: total: 5.2 s
Wall time: 5.2 s
CPU times: total: 13min 13s
Wall time: 1min 39s
CPU times: total: 55min 15s
Wall time: 42min 46s


In [14]:
# Mean cross-validation accuracy per model, one value per line, in the order:
# linear SVC, RBF SVC, polynomial SVC, kNN, logistic regression, hard voting.
for mean_score in (
    scores_svc_lin_mean,
    scores_svc_rbf_mean,
    scores_svc_poly_mean,
    scores_kNN_mean,
    scores_log_reg_mean,
    scores_hard_voting_clf_mean,
):
    print(mean_score)

0.2962575497955076
0.6007432480754578
0.6057833569646002
0.5315489688317692
0.4978620213453875
0.5964892642015793
