# Manner of Death

## Data Processing

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn.svm import LinearSVC

from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import cross_val_score

# Seed passed as random_state to train_test_split so the split is repeatable.
RANDOM_STATE = 666666

In [2]:
# Load the CSV and separate the predictor columns (X) from the target column (Y).

feature_names = ['Gender', 'Occupation', 'Country', 'Birth Year', 'Death Year']
target_name = 'Manner of Death'

df = pd.read_csv('Data/outputData.csv', sep=',')
X = df[feature_names]
Y = df[target_name]

In [3]:
# Label-encode every feature column and the target into integer codes.

# Transpose once up front instead of re-transposing the whole frame on every
# loop iteration; each row of this array holds the values of one feature column.
columns_as_rows = X.T.to_numpy()

X_LE = np.zeros(columns_as_rows.shape)
LE = preprocessing.LabelEncoder()
for i, column_values in enumerate(columns_as_rows):
    # fit_transform refits the shared encoder on this column, then encodes it.
    X_LE[i] = LE.fit_transform(column_values)
# Back to (samples, features) orientation.
X_LE = X_LE.T

# Encode the target with a fresh encoder; after this cell LE holds the
# target's classes (the per-feature fits are not retained).
LE = preprocessing.LabelEncoder()
Y_LE = LE.fit_transform(Y.to_numpy())

In [4]:
# Min-max normalization: rescale every feature column of X_LE to the [0, 1] range.

col_min = X_LE.min(axis=0)
col_range = X_LE.max(axis=0) - col_min
# A constant column has zero range; divide by 1 instead so it maps to 0.0
# rather than producing NaN from a 0/0 division.
col_range[col_range == 0] = 1
# Broadcasting applies the per-column offset and scale to every row at once.
X_S = (X_LE - col_min) / col_range

In [5]:
# Hold out 20% of the samples as a test set; RANDOM_STATE seeds the shuffle.

x, y = X_S, Y_LE

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

## Model

### SVC

In [6]:
# Fitting Linear SVC model

svc_lin = LinearSVC(C = 10, dual = False)
%time svc_lin.fit(x_train, y_train)

# Return the mean accuracy on the given test data and labels.

svc_lin.score(x_test, y_test)

CPU times: total: 1.36 s
Wall time: 1.34 s


0.9293021753176826

In [7]:
# Fitting SVC (kernel='rbf') model

svc_rbf = SVC(kernel =  "rbf", gamma = 1)
%time svc_rbf.fit(x_train, y_train)

# Return the mean accuracy on the given test data and labels.

svc_rbf.score(x_test, y_test)

CPU times: total: 38.8 s
Wall time: 37.6 s


0.9293021753176826

In [8]:
# Fitting SVC (kernel="poly") model

svc_poly = SVC(kernel = "poly", C = 10, degree = 1)
%time svc_poly.fit(x_train, y_train)

# Return the mean accuracy on the given test data and labels.

svc_poly.score(x_test, y_test)

CPU times: total: 18.3 s
Wall time: 18.3 s


0.9293021753176826

### KNN

In [9]:
# Fitting KNN model

kNN_classifier = KNeighborsClassifier(n_neighbors  = 10)
%time kNN_classifier.fit(x_train, y_train)

# Return the mean accuracy on the given test data and labels.

kNN_classifier.score(x_test, y_test)

CPU times: total: 109 ms
Wall time: 118 ms


0.9267714839543398

### Logistic Regression

In [10]:
# Fitting Logistic Regression model

log_reg = LogisticRegression(solver = 'lbfgs', max_iter = 1000)
%time sgd_reg.fit(x_train, y_train)

# Return the mean accuracy on the given test data and labels.
log_reg.score(x_test, y_test)

CPU times: total: 1min 25s
Wall time: 10.7 s


0.9288714193409433

### Voting Classifier

In [11]:
# Fitting Hard Voting Classifier model

hard_voting_clf = VotingClassifier(estimators=[
    
    ('svc_lin', LinearSVC(C = 10, dual = False)),
    ('svc_rbf', SVC(kernel =  "rbf", gamma = 1)),
    ('svc_poly', SVC(kernel = "poly", C = 10, degree = 1)),
    ('knn_clf', KNeighborsClassifier(n_neighbors  = 10)),
    ('log_reg', LogisticRegression(solver = 'lbfgs', max_iter = 1000))],
                             voting='hard')

# Return the mean accuracy on the given test data and labels.

%time hard_voting_clf.fit(x_train, y_train)
hard_voting_clf.score(x_test, y_test)

CPU times: total: 2min 21s
Wall time: 1min 7s


0.9293021753176826