In [36]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import LinearSVC, SVC
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from sklearn.metrics import classification_report, f1_score

In [43]:
# All competition files live in the same Drive folder; keep that location in
# one place so it only has to be changed once when the data moves.
DATA_DIR = '/content/drive/MyDrive/ColabNotebooks/synthetic_classification'

# `usecols` skips column 0 of train/test (presumably a saved row index — TODO
# confirm against the CSVs): 16 columns are kept for train, 15 for test.
train = pd.read_csv(f'{DATA_DIR}/train.csv', usecols=range(1, 17))
test = pd.read_csv(f'{DATA_DIR}/test.csv', usecols=range(1, 16))
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [8]:
# Sanity check: print the (rows, columns) tuple, then preview the first five rows.
print(train.shape)
train.head(n=5)

(1125, 16)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,label
0,0.165202,-0.47852,-2.257107,-0.553981,1.765257,1.299021,-0.046894,0.666681,0.68069,-1.671282,-0.153746,-0.195847,-0.220067,2.566026,-1.570657,2
1,-0.224334,-0.767142,-0.240347,-0.051352,0.305125,-1.721982,-2.465773,-1.101852,0.897459,-1.20632,0.729931,0.403281,-0.005451,-1.3958,-1.634428,0
2,0.089218,0.399756,0.947436,0.319036,0.688324,1.496136,0.191788,0.007435,0.163002,-1.375127,0.187362,-0.016887,0.766921,-1.150965,0.379291,4
3,-0.195872,0.507818,-1.799445,-1.169515,-0.923603,0.183832,-0.424816,0.592742,-0.075206,-0.84536,-0.927943,-0.328885,-0.701013,-0.630973,0.405728,1
4,-0.24367,0.079381,-0.780691,-2.072413,-1.380842,-0.140338,0.497398,-1.527575,-0.03495,-1.53909,-0.581833,0.441491,-0.140839,0.120535,0.10558,2


In [10]:
# Inspect the sample submission to see the layout the output file has to follow.
print(submission.shape)
submission.head(n=5)

(375, 1)


Unnamed: 0,y
0,4
1,4
2,1
3,4
4,1


# Визуализация

In [None]:
!pip install umap-learn

In [28]:
import umap

# Project the 15 feature columns (named '0'..'14') down to 2-D for plotting.
# The estimator gets its own name: assigning it to `umap` would rebind the
# module name to the instance, so any later `umap.<something>` would break.
feature_columns = [str(i) for i in range(15)]
reducer = umap.UMAP(n_neighbors=9, min_dist=0.7)
transformed_features = reducer.fit_transform(train[feature_columns])

In [29]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook
# Configure bokeh so that figures shown later are rendered inside the notebook.
output_notebook()

def draw_vectors(x, y, radius=10, alpha=0.25, color='blue',
                 width=600, height=400, show=True, **kwargs):
    """Draw an interactive scatter plot with auxiliary info shown on hover.

    Args:
        x, y: point coordinates; stored as the 'x' and 'y' data columns.
        radius: marker size passed to ``fig.scatter``.
        alpha: marker opacity passed to ``fig.scatter``.
        color: either one color per point, or a single color name that is
            applied to every point.
        width, height: figure dimensions passed to ``pl.figure``.
        show: when truthy, display the figure immediately via ``pl.show``.
        **kwargs: extra per-point data columns; each key also becomes a
            hover tooltip entry.

    Returns:
        The figure object that was created.
    """
    if isinstance(color, str):
        # Every data-source column must be a per-point sequence, so a single
        # color name (including the default) is repeated once per point.
        color = [color] * len(x)

    data_source = bm.ColumnDataSource({ 'x' : x, 'y' : y, 'color': color, **kwargs })

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    fig.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=data_source)

    # One tooltip row per extra column: "<name>: <value of that column>".
    fig.add_tools(bm.HoverTool(tooltips=[(key, "@" + key) for key in kwargs]))
    if show:
        pl.show(fig)
    return fig

In [30]:
# Color each projected point by its class: the integer label indexes the palette.
palette = ["red", "blue", "green", "yellow", "gray"]
point_colors = [palette[label] for label in train['label']]
draw_vectors(transformed_features[:, 0], transformed_features[:, 1], color=point_colors)

In [53]:
# 5-fold grid search over the SVC kernel and the regularization strength C,
# fitted on the feature columns named '0'..'14'.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score, while a later cell reports macro F1 — confirm
# that this mismatch is intended.
feature_columns = [str(i) for i in range(15)]
parameters = {
    'kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
    'C': [0.001, 0.01, 0.1, 1., 10., 100.],
}
svc = SVC(random_state=123)
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train[feature_columns], train['label'])

# List the available result keys, then the winning parameter combination.
pprint(sorted(clf.cv_results_))
best_params = clf.best_params_
pprint(best_params)

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_C',
 'param_kernel',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']
{'C': 0.1, 'kernel': 'linear'}


In [54]:
# Display the feature part of the training frame (columns '0'..'14', no label).
feature_columns = [str(i) for i in range(15)]
train[feature_columns]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.165202,-0.478520,-2.257107,-0.553981,1.765257,1.299021,-0.046894,0.666681,0.680690,-1.671282,-0.153746,-0.195847,-0.220067,2.566026,-1.570657
1,-0.224334,-0.767142,-0.240347,-0.051352,0.305125,-1.721982,-2.465773,-1.101852,0.897459,-1.206320,0.729931,0.403281,-0.005451,-1.395800,-1.634428
2,0.089218,0.399756,0.947436,0.319036,0.688324,1.496136,0.191788,0.007435,0.163002,-1.375127,0.187362,-0.016887,0.766921,-1.150965,0.379291
3,-0.195872,0.507818,-1.799445,-1.169515,-0.923603,0.183832,-0.424816,0.592742,-0.075206,-0.845360,-0.927943,-0.328885,-0.701013,-0.630973,0.405728
4,-0.243670,0.079381,-0.780691,-2.072413,-1.380842,-0.140338,0.497398,-1.527575,-0.034950,-1.539090,-0.581833,0.441491,-0.140839,0.120535,0.105580
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120,-0.921957,-0.287242,0.081046,-1.439224,1.819679,0.331452,0.019260,0.779340,-0.767966,1.236125,-0.412199,-0.042225,-1.143249,0.013138,-1.352734
1121,-0.949648,0.412262,1.359029,0.478389,-0.833918,0.325484,0.339922,0.268013,0.861272,0.939260,0.358845,0.871524,1.052642,-0.506099,-0.398804
1122,-0.159522,-0.944065,-1.905598,-0.557243,-1.317737,0.439101,1.409078,-0.479984,1.524758,0.379914,-1.313705,0.589540,-0.408532,-1.348533,0.286601
1123,0.247274,-0.191863,-1.120644,1.036783,-0.611240,-1.256217,0.190050,0.193292,-1.173136,-1.425713,0.367132,1.724667,0.053174,2.378303,-0.523210


In [55]:
# Fit an SVC with the hyper-parameters selected by the grid search and report
# macro-averaged F1 on the training data itself (this is not a held-out score).
feature_columns = [str(i) for i in range(15)]
X_train, y_train = train[feature_columns], train['label']

model = SVC(random_state=123).set_params(**best_params)
model.fit(X_train, y_train)
# f1_score takes the ground truth first and the predictions second.
print("Train f1-score - ", f1_score(y_train, model.predict(X_train), average='macro'))

Train f1-score -  0.38902507273483794


In [56]:
# Predict labels for the test features and save them as a single 'y' column,
# without writing the DataFrame index.
feature_columns = [str(i) for i in range(15)]
test_predictions = model.predict(test[feature_columns])
pd.DataFrame({'y': test_predictions}).to_csv('submission.csv', index=False)