In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score, make_scorer

In [34]:
# Labelled training data, and the reserved data that predictions are made for.
df_train, X_test = (
    pd.read_csv(csv_name)
    for csv_name in ('persons_pics_train.csv', 'persons_pics_reserved.csv')
)

In [35]:
# Separate the target column from the feature columns.
y_train = df_train['label']
X_train = df_train.drop(columns=['label'])

In [9]:
# Stratified 80/20 hold-out split of the labelled data, with a fixed seed.
X_tr, X_te, y_tr, y_te = train_test_split(
    df_train.drop(columns=['label']),
    df_train['label'],
    test_size=0.2,
    random_state=0,
    stratify=df_train['label'],
)

In [11]:
# Baseline: RBF-kernel SVC with hand-picked C / gamma and balanced class weights,
# fitted on the full labelled set (X_train, y_train).
model = SVC(
    C=1000,
    kernel='rbf',
    gamma=0.0001,
    class_weight='balanced',
    random_state=0,
)
model.fit(X_train, y_train)

SVC(C=1000, class_weight='balanced', gamma=0.0001, random_state=0)

In [13]:
# Predict a label for every row of X_test with the model fitted above;
# the returned array is displayed as the cell output.
model.predict(X_test)

array(['George W Bush', 'Ariel Sharon', 'Jacques Chirac',
       'Jacques Chirac', 'Tony Blair', 'Colin Powell', 'Donald Rumsfeld',
       'Colin Powell', 'Tony Blair', 'Gerhard Schroeder',
       'Donald Rumsfeld', 'Hugo Chavez', 'George W Bush', 'Hugo Chavez',
       'Colin Powell', 'George W Bush', 'Ariel Sharon', 'Colin Powell',
       'John Ashcroft', 'Gerhard Schroeder', 'Ariel Sharon',
       'George W Bush', 'Colin Powell', 'George W Bush', 'George W Bush',
       'Donald Rumsfeld', 'Donald Rumsfeld', 'Tony Blair',
       'Serena Williams', 'Jean Chretien', 'George W Bush',
       'George W Bush', 'George W Bush', 'George W Bush', 'Tony Blair',
       'Colin Powell', 'Donald Rumsfeld', 'Jacques Chirac',
       'George W Bush', 'Gerhard Schroeder', 'Colin Powell',
       'Donald Rumsfeld', 'Gerhard Schroeder', 'George W Bush',
       'Tony Blair', 'George W Bush', 'George W Bush', 'Tony Blair',
       'Colin Powell', 'Tony Blair', 'Serena Williams', 'Jean Chretien',
       'Tony

### Score получился низким — попробуем подобрать гиперпараметры

In [36]:
# !pip install hyperopt
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials      # для оптимизации гиперпараметров
from functools import partial 

In [86]:
# scorer = make_scorer(f1_score)
# # Создаем свою функцию для вычисления F1-score
# def my_f1_scoring(y_true,y_pred):
#     return f1_score(y_true, y_pred) 

# my_scorer = make_scorer(my_f1_scoring, greater_is_better=True)

In [103]:
# функции для подбора гиперпараметров
def objective(params, pipeline, X_train, y_train, n_splits=5, test_size=0.2):
    """
    Evaluate one hyper-parameter combination on repeated stratified hold-out splits.

    The estimator is configured with ``params`` once and then refitted on
    ``n_splits`` stratified train/validation splits, seeded 0 .. n_splits - 1,
    so every combination is scored on exactly the same splits. Each split is
    scored with the weighted F1 measure.

    :param params: dict of hyper-parameters, passed to ``pipeline.set_params``
    :param pipeline: estimator with ``set_params`` / ``fit`` / ``predict``;
        it is reconfigured and refitted in place
    :param X_train: feature matrix
    :param y_train: label vector (also used to stratify the splits)
    :param n_splits: number of hold-out splits to average over (default 5)
    :param test_size: share of the data held out in every split (default 0.2)
    :return: dict stored by hyperopt in ``Trials()``: ``loss`` (negated mean
        score), ``score`` (mean), ``std``, ``params`` and ``status``
    :raises ValueError: if ``n_splits`` is smaller than 1
    """
    if n_splits < 1:
        # An empty score list would silently produce a NaN loss.
        raise ValueError("n_splits must be at least 1")

    # The parameters are identical for every split, so apply them only once.
    pipeline.set_params(**params)

    scores = []
    for seed in range(n_splits):
        X_tr, X_te, y_tr, y_te = train_test_split(
            X_train, y_train, test_size=test_size, random_state=seed, stratify=y_train
        )
        pipeline.fit(X_tr, y_tr)
        scores.append(f1_score(y_te, pipeline.predict(X_te), average='weighted'))

    mean_score, std_score = np.mean(scores), np.std(scores)

    # hyperopt minimises 'loss', hence the negated mean score.
    return {'loss': -mean_score, 'score': mean_score, 'std': std_score, 'params': params, 'status': STATUS_OK}


def get_hp_results(pipeline, pipeline_space, max_evals, obj_func=objective, X=X_train, y=y_train,
                   random_state=0):
    '''
    Search for good hyper-parameters with hyperopt (TPE) and display the top results.

    NOTE: the defaults of ``X`` and ``y`` are evaluated once, when this
    definition is executed, so they refer to the ``X_train`` / ``y_train``
    objects that existed at that moment. Pass ``X`` / ``y`` explicitly to
    search on other data.

    pipeline - estimator handed to ``obj_func``
    pipeline_space - hyperopt search space of the hyper-parameters
    max_evals - number of search iterations
    obj_func - objective called as ``obj_func(params, pipeline=..., X_train=..., y_train=...)``;
        it must return a dict with 'loss', 'score', 'params' and 'status' keys
    X, y - feature matrix and labels forwarded to ``obj_func``
    random_state - seed for the search's random generator (default 0)
    return: (best hyper-parameters, search history as a DataFrame sorted by
        descending score)
    '''
    trials = Trials()  # the search history is collected here

    # The return value of fmin is deliberately ignored: the best parameters are
    # read from the trial results below, which hold actual parameter values
    # (for hp.choice dimensions fmin reports option indices instead).
    fmin(
        fn=partial(obj_func, pipeline=pipeline, X_train=X, y_train=y),  # function to minimise
        space=pipeline_space,    # hyper-parameter search space
        algo=tpe.suggest,        # search algorithm
        max_evals=max_evals,     # number of iterations
        trials=trials,           # where the history is stored
        rstate=np.random.default_rng(random_state),
        show_progressbar=True,
    )

    # One row per trial; the parameter dict is also expanded into separate columns.
    results = (
        pd.DataFrame([{**x, **x['params']} for x in trials.results])
        .drop(labels=['status', 'loss'], axis=1)
        .sort_values(by=['score'], ascending=False)
    )
    display(results.head())

    # Parameters of the best-scoring trial.
    best_params = results.iloc[0]['params']

    return best_params, results

In [104]:
# Hyper-parameter search space for the SVC model.
param_space = dict(
    # exp(uniform(low, high)): C is drawn log-uniformly between 1e-2 and 1e4
    C=hp.loguniform('C', low=-2*np.log(10), high=4*np.log(10)),
    kernel=hp.choice('kernel', options=['linear', 'poly', 'rbf', 'sigmoid']),
    gamma=hp.choice('gamma', options=[1e-1, 1e-2, 1e-3, 1e-4, 1e-5]),
    class_weight=hp.choice('class_weight', options=[None, 'balanced']),
)

In [105]:
# Run 20 search iterations on a fresh SVC; keep the best parameter set and the full history.
model = SVC(random_state=0)
best_params, history = get_hp_results(
    pipeline=model,
    pipeline_space=param_space,
    max_evals=20,
)

100%|███████████████████████████████████████████████| 20/20 [04:44<00:00, 14.23s/trial, best loss: -0.7968074662770281]


Unnamed: 0,score,std,params,C,class_weight,gamma,kernel
19,0.796807,0.029439,"{'C': 703.9355487403767, 'class_weight': None,...",703.935549,,0.001,linear
3,0.796807,0.029439,"{'C': 2390.6049172910034, 'class_weight': None...",2390.604917,,1e-05,linear
4,0.796807,0.029439,"{'C': 1.7652539117873987, 'class_weight': 'bal...",1.765254,balanced,0.001,linear
8,0.796807,0.029439,"{'C': 1464.2191293386638, 'class_weight': 'bal...",1464.219129,balanced,0.001,linear
11,0.782743,0.01961,"{'C': 113.4868517733314, 'class_weight': 'bala...",113.486852,balanced,0.001,poly


In [96]:
# objective({'C':1000, 'class_weight':'balanced', 'gamma':0.0001}, model,  X_train, y_train)

In [106]:
# New SVC configured with the hyper-parameters selected by the search.
best_model = SVC(**best_params, random_state=0)

In [107]:
# Fit the tuned model on the full labelled set (X_train, y_train).
best_model.fit(X_train, y_train)

SVC(C=703.9355487403767, gamma=0.001, kernel='linear', random_state=0)

In [108]:
# Predict a label for every row of X_test with the tuned model;
# the returned array is displayed as the cell output.
best_model.predict(X_test)

array(['George W Bush', 'Ariel Sharon', 'Jacques Chirac',
       'Jacques Chirac', 'Tony Blair', 'Colin Powell', 'Donald Rumsfeld',
       'Colin Powell', 'Tony Blair', 'George W Bush', 'Donald Rumsfeld',
       'Hugo Chavez', 'George W Bush', 'Hugo Chavez', 'Colin Powell',
       'George W Bush', 'Ariel Sharon', 'Colin Powell', 'John Ashcroft',
       'Gerhard Schroeder', 'Ariel Sharon', 'George W Bush',
       'Colin Powell', 'George W Bush', 'George W Bush',
       'Donald Rumsfeld', 'Donald Rumsfeld', 'Tony Blair',
       'Serena Williams', 'Jean Chretien', 'George W Bush',
       'George W Bush', 'George W Bush', 'George W Bush', 'George W Bush',
       'Colin Powell', 'Donald Rumsfeld', 'Jacques Chirac',
       'George W Bush', 'Gerhard Schroeder', 'Colin Powell',
       'Donald Rumsfeld', 'Gerhard Schroeder', 'George W Bush',
       'Tony Blair', 'George W Bush', 'George W Bush', 'Tony Blair',
       'Colin Powell', 'Tony Blair', 'Serena Williams', 'Jean Chretien',
       'Tony 