In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
%matplotlib inline

In [17]:
data_train = pd.read_csv('invasion.csv')

In [18]:
data_train.head()

Unnamed: 0,class,g_reflection,i_reflection,speed,brightness,time_of_observance,volume
0,transport,2.190672,6.716633,62.168208,0.347465,158221,44.932446
1,transport,3.453276,8.995909,62.994707,0.590094,385972,41.5683
2,transport,2.432994,6.938691,62.245807,0.329288,446482,40.123467
3,fighter,6.083763,3.019459,18.474555,0.174738,210125,11.384865
4,fighter,12.876769,2.45295,195.805771,0.150446,23109,11.328806


In [27]:
data_train.shape

(500, 7)

In [23]:
X_train = data_train.drop('class', axis=1)
y_train = data_train['class']

In [25]:
clf_rf = RandomForestClassifier()

In [26]:
# исследуемые гиперпараметры в предполагаемом диапазоне
parametrs = {'n_estimators':range(10,51,10), 'max_depth':range(1,11), \
             'min_samples_leaf':range(1,8), 'min_samples_split':range(2,10,2)}

In [31]:
# подбор оптимальных гиперпараметров
grid_search_cv_clf = GridSearchCV(clf_rf, parametrs, cv=3, n_jobs=-1 )

In [32]:
# обучение модели на тренировочном датасете
grid_search_cv_clf.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [35]:
# определяем лучшего 
best_clf = grid_search_cv_clf.best_estimator_

In [36]:
data_test = pd.read_csv('operative_information.csv')

In [38]:
X_test = data_test

In [39]:
# предсказание классов кораблей
y_pred = best_clf.predict(X_test)

In [61]:
# формирование и подсчет по типу корабля
y_test = pd.Series(y_pred).value_counts()

In [62]:
y_test

fighter      675
transport    595
cruiser      230
dtype: int64

In [77]:
# выявление самой влиятельной фичи
feature_importance = best_clf.feature_importances_

In [72]:
# оборачиваем в датафрейм
feature_importance_df = pd.DataFrame({'features': list(X_test), 'importance':feature_importance})
feature_importance_df

Unnamed: 0,features,importance
0,g_reflection,0.16445
1,i_reflection,0.055158
2,speed,0.157368
3,brightness,0.287326
4,time_of_observance,0.0
5,volume,0.335697


In [73]:
# сортируем по возрастанию
feature_importance_df.sort_values('importance', ascending=False)

Unnamed: 0,features,importance
5,volume,0.335697
3,brightness,0.287326
0,g_reflection,0.16445
2,speed,0.157368
1,i_reflection,0.055158
4,time_of_observance,0.0


выявить опасные регионы космоса, где могут находиться жукеры.

In [78]:
data_space_can_be_a_dangerous_place = pd.read_csv('space_can_be_a_dangerous_place.csv')

In [86]:
data_space_can_be_a_dangerous_place.head()

Unnamed: 0,r,phi,peradventure_index,dustiness,black_hole_is_near,buggers_were_noticed,nearby_system_has_planemo,dangerous
0,169.1,138.0,22.3212,0.706285,0,1,1,1
1,11.1,148.0,1.4652,-0.410512,1,1,1,1
2,274.6,201.0,36.2472,0.756457,1,1,1,1
3,172.8,173.0,22.8096,0.035221,1,1,1,1
4,223.3,222.0,29.4756,0.197271,0,1,1,1


In [96]:
# проверка корреляции buggers_were_noticed по отношению к другим фичам.
data_space_can_be_a_dangerous_place.corr()

Unnamed: 0,r,phi,peradventure_index,dustiness,black_hole_is_near,buggers_were_noticed,nearby_system_has_planemo,dangerous
r,1.0,0.001023,1.0,-0.003336,0.002337,0.001797,0.000326,0.005788
phi,0.001023,1.0,0.001023,0.001921,0.002875,-0.004726,-0.000803,0.003035
peradventure_index,1.0,0.001023,1.0,-0.003336,0.002337,0.001797,0.000326,0.005788
dustiness,-0.003336,0.001921,-0.003336,1.0,-0.004691,-0.00129,0.00337,-0.000499
black_hole_is_near,0.002337,0.002875,0.002337,-0.004691,1.0,-0.001874,0.002661,0.052968
buggers_were_noticed,0.001797,-0.004726,0.001797,-0.00129,-0.001874,1.0,-0.003031,0.344469
nearby_system_has_planemo,0.000326,-0.000803,0.000326,0.00337,0.002661,-0.003031,1.0,0.299508
dangerous,0.005788,0.003035,0.005788,-0.000499,0.052968,0.344469,0.299508,1.0
