In [1]:
acc_test = []
acc_train = []
acc_valid = []

In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier, Perceptron
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
df = pd.read_csv("Day2_df.csv").drop(['Unnamed: 0', 'index'], axis = 1)
df_test = pd.read_csv("Day2_test.csv").drop(['Unnamed: 0', 'index'], axis = 1)
df = pd.concat([df, df_test], axis=0).drop_duplicates()
df.head()

Unnamed: 0,Activity,Age,Age_range,Children,Education,Family,IsAlone,Longevity,Pet,Sex,Social_status,Sport
0,725,73,3,0,3,1,False,0,1,female,9,0
1,7128,75,4,0,1,1,False,1,1,male,4,1
2,793,73,3,0,3,0,True,1,0,male,9,0
3,5310,74,3,0,1,1,False,1,1,male,3,0
4,805,74,3,0,3,0,True,0,0,female,9,0


In [4]:
df['Sex_int'] = (df['Sex'] == 'male').astype('int')
df['IsAlone_int'] = df['IsAlone'].astype('int')

In [5]:
features = df.drop(['Longevity', 'Sex', 'IsAlone', ], axis = 1)
target = df['Longevity']

Фиксируем псевдослучайности, чтобы можно было повторить эксперементы.

In [6]:
random_sts = [0, 12345, 13579, 24680, 54321, 192837]

Делим выборку на обучающую валидационную и тестовую в соотношении 60:20:20.

In [7]:
features_train, features_, target_train, target_ = train_test_split(
    features, target, test_size = 0.4, random_state = 12345
)

features_valid, features_test, target_valid, target_test = train_test_split(
    features_, target_, test_size = 0.5, random_state = 12345
)

In [8]:
print("Размер обучающей выборки:", features_train.shape[0])
print("Ее доля от исходных данных: {:.0%}".format(features_train.shape[0] / 
                                                  features.shape[0]))
print()
print("Размер валидационной выборки:", features_valid.shape[0])
print("Ее доля от исходных данных: {:.0%}".format(features_valid.shape[0] / 
                                                  features.shape[0]))
print()
print("Размер тестовой выборки:", features_test.shape[0])
print("Ее доля от исходных данных: {:.0%}".format(features_test.shape[0] / 
                                                  features.shape[0]))

Размер обучающей выборки: 458
Ее доля от исходных данных: 60%

Размер валидационной выборки: 153
Ее доля от исходных данных: 20%

Размер тестовой выборки: 153
Ее доля от исходных данных: 20%


### Модель Машины опорных векторов

In [9]:
svc_model = None
svc_accuracy = 0

for st in random_sts:
    for c in range(1, 11):
        model = SVC(random_state=st, C=(c/10))
        model.fit(features_train, target_train)
        predictions = model.predict(features_valid)
        accuracy = accuracy_score(target_valid, predictions)
        
        if accuracy > svc_accuracy:
            svc_model = model
            svc_accuracy = accuracy

In [10]:
print("Гиперпараметры лучшей модели опорных векторов на валидационной выборке:")
print("C:", svc_model.C)
print("random_state:", svc_model.random_state)
print("accuracy:", svc_accuracy)

Гиперпараметры лучшей модели опорных векторов на валидационной выборке:
C: 1.0
random_state: 0
accuracy: 0.5751633986928104


In [11]:
svc_accuracy_test = accuracy_score(target_test, svc_model.predict(features_test))
print("Точность на тестовой выборке:", svc_accuracy_test)

Точность на тестовой выборке: 0.5816993464052288


In [12]:
acc_valid.append(svc_accuracy)
acc_test.append(svc_accuracy_test)
acc_train.append(accuracy_score(target_train, svc_model.predict(features_train)))

### Модель Линейной машины опорных векторов

In [13]:
# TODO Убрать Warning-и

svc_linear_model = None
svc_linear_accuracy = 0

model = LinearSVC(dual = True, C=(6/10), random_state=192837, max_iter = 105000)
model.fit(features_train, target_train)
predictions = model.predict(features_valid)
accuracy = accuracy_score(target_valid, predictions)
if accuracy > svc_linear_accuracy:
    svc_linear_model = model
    svc_linear_accuracy = accuracy



In [14]:
print("Гиперпараметры лучшей модели линейной машины опорных векторов на валидационной выборке:")
print("C:", svc_linear_model.C)
print("random_state:", svc_linear_model.random_state)
print("accuracy:", svc_linear_accuracy)
# При max_iter = 105000 точность максимальна для данного датасета
# Условия вычислены перебором, записаны прямо для экономии времени при запуске

Гиперпараметры лучшей модели линейной машины опорных векторов на валидационной выборке:
C: 0.6
random_state: 192837
accuracy: 0.7908496732026143


In [15]:
svc_accuracy_test = accuracy_score(target_test, svc_linear_model.predict(features_test))
print("Точность на тестовой выборке:", svc_accuracy_test)

Точность на тестовой выборке: 0.8104575163398693


In [16]:
acc_valid.append(svc_linear_accuracy)
acc_test.append(svc_accuracy_test)
acc_train.append(accuracy_score(target_train, svc_linear_model.predict(features_train)))

### Модель k-Ближайших соседей

In [17]:
# Коэффициенты для каждого объекта, слишком большое евклидово расстояние
# Другая конфигурация данных
kn_model = None
kn_accuracy = 0

for n in range(1, 16):
    for alg in ['ball_tree', 'kd_tree', 'brute']:
        for p in range(1, 5):
            model = KNeighborsClassifier(n_neighbors=n, algorithm=alg, p=p)
            model.fit(features_train, target_train)
            predictions = model.predict(features_valid)
            accuracy = accuracy_score(target_valid, predictions)
        
            if accuracy > kn_accuracy:
                kn_model = model
                kn_accuracy = accuracy

In [18]:
print("Гиперпараметры лучшей модели k-Ближайших соседей на валидационной выборке:")
print("n_neighbors:", kn_model.n_neighbors)
print("algorithm:", kn_model.algorithm)
print("accuracy:", kn_accuracy)

Гиперпараметры лучшей модели k-Ближайших соседей на валидационной выборке:
n_neighbors: 9
algorithm: brute
accuracy: 0.6862745098039216


In [19]:
kn_accuracy_test = accuracy_score(target_test, kn_model.predict(features_test))
print("Точность на тестовой выборке:", kn_accuracy_test)

Точность на тестовой выборке: 0.6601307189542484


In [20]:
acc_valid.append(kn_accuracy)
acc_test.append(kn_accuracy_test)
acc_train.append(accuracy_score(target_train, kn_model.predict(features_train)))

### Модель Наивного байесовского классификатора 

In [21]:
gnb_model = GaussianNB()
gnb_model.fit(features_train, target_train)
predictions = model.predict(features_valid)
gnb_accuracy = accuracy_score(target_valid, predictions)

In [22]:
print("Точность модели НБК на валидационной выборке:", gnb_accuracy)

Точность модели НБК на валидационной выборке: 0.6470588235294118


In [23]:
gnb_accuracy_test = accuracy_score(target_test, gnb_model.predict(features_test))
print("Точность на тестовой выборке:", gnb_accuracy_test)

Точность на тестовой выборке: 0.8235294117647058


In [24]:
acc_valid.append(gnb_accuracy)
acc_test.append(gnb_accuracy_test)
acc_train.append(accuracy_score(target_train, gnb_model.predict(features_train)))

### Модель Персептрона

In [25]:
# Равномерно представленные объекты больше подходят для Персептрона

pcp_model = None
pcp_accuracy = 0

for st in random_sts:
    model = Perceptron(random_state=st, class_weight='balanced', shuffle=True)
    model.fit(features_train, target_train)
    predictions = model.predict(features_valid)
    accuracy = accuracy_score(target_valid, predictions)
        
    if accuracy > pcp_accuracy:
        pcp_model = model
        pcp_accuracy = accuracy

In [26]:
print("Гиперпараметры лучшей модели персептрона на валидационной выборке:")
print("random_state:", pcp_model.random_state)
print("accuracy:", pcp_accuracy)

Гиперпараметры лучшей модели персептрона на валидационной выборке:
random_state: 13579
accuracy: 0.6274509803921569


In [27]:
pcp_accuracy_test = accuracy_score(target_test, pcp_model.predict(features_test))
print("Точность на тестовой выборке:", pcp_accuracy_test)

Точность на тестовой выборке: 0.5882352941176471


In [28]:
acc_valid.append(pcp_accuracy)
acc_test.append(pcp_accuracy_test)
acc_train.append(accuracy_score(target_train, pcp_model.predict(features_train)))

### Классификатор на основе метода стохастического градиентного спуска

In [29]:
sgd_model = None
sgd_accuracy = 0

for st in random_sts:
    for l in ['hinge','log','modified_huber','squared_hinge','perceptron',
              'squared_loss','huber','epsilon_insensitive','squared_epsilon_insensitive']:
        for p in ['l2', 'l1', 'elasticnet']:
            model = SGDClassifier(random_state=st, loss=l, penalty=p, learning_rate = 'adaptive', eta0=0.5)
            model.fit(features_train, target_train)
            predictions = model.predict(features_valid)
            accuracy = accuracy_score(target_valid, predictions)
        
            if accuracy > sgd_accuracy:
                sgd_model = model
                sgd_accuracy = accuracy

In [30]:
print("Гиперпараметры лучшей модели SGD на валидационной выборке:")
print("random_state:", sgd_model.random_state)
print("loss:", sgd_model.loss)
print("penalty:", sgd_model.penalty)
print("accuracy:", sgd_accuracy)

Гиперпараметры лучшей модели SGD на валидационной выборке:
random_state: 13579
loss: squared_epsilon_insensitive
penalty: l2
accuracy: 0.6143790849673203


In [31]:
sgd_accuracy_test = accuracy_score(target_test, sgd_model.predict(features_test))
print("Точность на тестовой выборке:", sgd_accuracy_test)

Точность на тестовой выборке: 0.6666666666666666


In [32]:
acc_valid.append(sgd_accuracy)
acc_test.append(sgd_accuracy_test)
acc_train.append(accuracy_score(target_train, sgd_model.predict(features_train)))

### Модель дерева решений

In [33]:
# В верхнем узле необходимо подобрать максимально определяющее условие, делящее выборку на два класса
# Визуализировать графом через pandas

tree_model = None
tree_accuracy = 0

for st in random_sts:
    for depth in range(1, 15):
        model = DecisionTreeClassifier(random_state=st, max_depth=depth)
        model.fit(features_train, target_train)
        predictions = model.predict(features_valid)
        accuracy = accuracy_score(target_valid, predictions)
        
        if accuracy > tree_accuracy:
            tree_model = model
            tree_accuracy = accuracy

In [34]:
print("Гиперпараметры лучшей модели дерева решений на валидационной выборке")
print("random_state:", tree_model.random_state)
print("max_depth:", tree_model.max_depth)
print("accuracy:", tree_accuracy)

Гиперпараметры лучшей модели дерева решений на валидационной выборке
random_state: 0
max_depth: 6
accuracy: 0.8104575163398693


In [35]:
tree_accuracy_test = accuracy_score(target_test, tree_model.predict(features_test))
print("Точность на тестовой выборке:", tree_accuracy_test)

Точность на тестовой выборке: 0.8366013071895425


In [36]:
acc_valid.append(tree_accuracy)
acc_test.append(tree_accuracy_test)
acc_train.append(accuracy_score(target_train, tree_model.predict(features_train)))

### Модель случайного леса

In [37]:
forest_model = None
forest_accuracy = 0

for st in random_sts:
    for depth in range(1, 16):
        for tree_cnt in range(1, 16):
            model = RandomForestClassifier(random_state=st, max_depth=depth, 
                                           n_estimators = tree_cnt)
            model.fit(features_train, target_train)
            predictions = model.predict(features_valid)
            accuracy = accuracy_score(target_valid, predictions)
        
            if accuracy > forest_accuracy:
                forest_model = model
                forest_accuracy = accuracy

In [38]:
print("Гиперпараметры лучшей модели случайного леса на валидационной выборке")
print("random_state:", forest_model.random_state)
print("max_depth:", forest_model.max_depth)
print("n_estimators:", forest_model.n_estimators)
print("accuracy:", forest_accuracy)

Гиперпараметры лучшей модели случайного леса на валидационной выборке
random_state: 13579
max_depth: 4
n_estimators: 5
accuracy: 0.8366013071895425


In [39]:
forest_accuracy_test = accuracy_score(target_test, forest_model.predict(features_test))
print("Точность на тестовой выборке:", forest_accuracy_test)

Точность на тестовой выборке: 0.8431372549019608


In [40]:
acc_valid.append(forest_accuracy)
acc_test.append(forest_accuracy_test)
acc_train.append(accuracy_score(target_train, forest_model.predict(features_train)))

Сведем результаты в таблицу.

In [41]:
data ={"model":["SVC","LinearSVC","KNeighborsClassifier","GaussianNB","Perceptron", "SGDClassifier", "DecisionTreeClassifier",
                "RandomForestClassifier"],
       "train_accuracy" : acc_train, "valid_accuracy" : acc_valid, "test_accuracy": acc_test, 
      "mean_accuracy": [(acc_train[i] + acc_valid[i] + acc_test[i]) / 3 for i in range(8)]} 
results = pd.DataFrame(data)

In [42]:
results

Unnamed: 0,model,train_accuracy,valid_accuracy,test_accuracy,mean_accuracy
0,SVC,0.912664,0.575163,0.581699,0.689842
1,LinearSVC,0.78821,0.79085,0.810458,0.796506
2,KNeighborsClassifier,0.713974,0.686275,0.660131,0.686793
3,GaussianNB,0.779476,0.647059,0.823529,0.750021
4,Perceptron,0.576419,0.627451,0.588235,0.597368
5,SGDClassifier,0.615721,0.614379,0.666667,0.632255
6,DecisionTreeClassifier,0.879913,0.810458,0.836601,0.842324
7,RandomForestClassifier,0.864629,0.836601,0.843137,0.848122


In [43]:
results.sort_values(by=['train_accuracy'], ascending=False)

Unnamed: 0,model,train_accuracy,valid_accuracy,test_accuracy,mean_accuracy
0,SVC,0.912664,0.575163,0.581699,0.689842
6,DecisionTreeClassifier,0.879913,0.810458,0.836601,0.842324
7,RandomForestClassifier,0.864629,0.836601,0.843137,0.848122
1,LinearSVC,0.78821,0.79085,0.810458,0.796506
3,GaussianNB,0.779476,0.647059,0.823529,0.750021
2,KNeighborsClassifier,0.713974,0.686275,0.660131,0.686793
5,SGDClassifier,0.615721,0.614379,0.666667,0.632255
4,Perceptron,0.576419,0.627451,0.588235,0.597368


In [44]:
results.sort_values(by=['valid_accuracy'], ascending=False)

Unnamed: 0,model,train_accuracy,valid_accuracy,test_accuracy,mean_accuracy
7,RandomForestClassifier,0.864629,0.836601,0.843137,0.848122
6,DecisionTreeClassifier,0.879913,0.810458,0.836601,0.842324
1,LinearSVC,0.78821,0.79085,0.810458,0.796506
2,KNeighborsClassifier,0.713974,0.686275,0.660131,0.686793
3,GaussianNB,0.779476,0.647059,0.823529,0.750021
4,Perceptron,0.576419,0.627451,0.588235,0.597368
5,SGDClassifier,0.615721,0.614379,0.666667,0.632255
0,SVC,0.912664,0.575163,0.581699,0.689842


In [45]:
results.sort_values(by=['test_accuracy'], ascending=False)

Unnamed: 0,model,train_accuracy,valid_accuracy,test_accuracy,mean_accuracy
7,RandomForestClassifier,0.864629,0.836601,0.843137,0.848122
6,DecisionTreeClassifier,0.879913,0.810458,0.836601,0.842324
3,GaussianNB,0.779476,0.647059,0.823529,0.750021
1,LinearSVC,0.78821,0.79085,0.810458,0.796506
5,SGDClassifier,0.615721,0.614379,0.666667,0.632255
2,KNeighborsClassifier,0.713974,0.686275,0.660131,0.686793
4,Perceptron,0.576419,0.627451,0.588235,0.597368
0,SVC,0.912664,0.575163,0.581699,0.689842


In [46]:
results.sort_values(by=['mean_accuracy'], ascending=False)

Unnamed: 0,model,train_accuracy,valid_accuracy,test_accuracy,mean_accuracy
7,RandomForestClassifier,0.864629,0.836601,0.843137,0.848122
6,DecisionTreeClassifier,0.879913,0.810458,0.836601,0.842324
1,LinearSVC,0.78821,0.79085,0.810458,0.796506
3,GaussianNB,0.779476,0.647059,0.823529,0.750021
0,SVC,0.912664,0.575163,0.581699,0.689842
2,KNeighborsClassifier,0.713974,0.686275,0.660131,0.686793
5,SGDClassifier,0.615721,0.614379,0.666667,0.632255
4,Perceptron,0.576419,0.627451,0.588235,0.597368


### Вывод

Наиболее эффективными по точности на небольшом количестве значений оказались модели ***Дерева решений***, ***Случайного леса***, ***Линейной машины опорных векторов*** и ***Наивного байесовского классификатора*** 