### Задание

1. Обучить модель на понравившихся данных
2. Создать rest api сервис, к которому можно будет обращаться для получения прогнозов

### Примерные темы

1. Определение токсичности комментария
2. Определение стоимости жилья от разных факторов: площадь жилья, удаленность от центра, etc (здесь же куча идей прогнозирования стоимости чего-либо по его описанию)
3. Темы новости по ее тексту
4. Классификатор рукописных цифр
5. Классификатор картинок (например, определение города и страны по снимку улицы)
6. Определение вероятности наличия сердечно-сосудистых заболеваний по данным первичного осмотра (или что-то подобное)
7. Многое другое (придумайте сами)
8. Прогнозирование рейтинга вопроса на stackoverflow - https://www.kaggle.com/imoore/60k-stack-overflow-questions-with-quality-rate
9. Обзор статистики самоубийств за 1985–2016 годы - https://www.kaggle.com/russellyates88/suicide-rates-overview-1985-to-2016
10. Тематическое моделирование статей на arxiv - https://www.kaggle.com/Cornell-University/arxiv

### Откуда брать данные

1. kaggle (https://www.kaggle.com/datasets)
2. спарсить самостоятельно, но в этом случае вам может понадобиться разметка (если у вас обучение с учителем)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import dill

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score, classification_report, precision_recall_curve


In [2]:
class Config:
    """Notebook-wide settings: where the pulsar CSV files live and the random seed."""

    # Seed intended for reproducible randomness.
    seed: int = 21
    # Directory prefix (with trailing slash) prepended to the dataset file names.
    root_dir: str = "../datasets/kaggle_datasets/pulsar_data/"


class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that pulls one column out of a data frame by name.

    The column is returned through plain ``X[key]`` indexing so that later
    steps can transform it in isolation.
    """
    def __init__(self, key):
        # Name of the column extracted in ``transform``.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.key]``."""
        column_name = self.key
        return X[column_name]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that selects one numeric column of a data frame.

    The key is wrapped in a list before indexing (``X[[key]]``), which for a
    pandas DataFrame yields a one-column frame rather than a Series.
    """
    def __init__(self, key):
        # Name of the numeric column extracted in ``transform``.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[[self.key]]``."""
        selected_columns = [self.key]
        return X[selected_columns]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    """
    One-hot encoder built on ``pd.get_dummies`` with a fixed output layout.

    ``fit`` records the dummy columns produced for the training data, and
    ``transform`` always returns exactly those columns, in that order.
    """
    def __init__(self, key):
        # Prefix passed to ``pd.get_dummies`` for the generated column names.
        self.key = key
        # Dummy column names recorded by ``fit``; empty until fitted.
        self.columns = []

    def fit(self, X, y=None):
        """Record the dummy columns that ``X`` produces and return ``self``."""
        self.columns = list(pd.get_dummies(X, prefix=self.key).columns)
        return self

    def transform(self, X):
        """
        One-hot encode ``X`` and align the result to the fitted columns.

        Categories recorded by ``fit`` but absent from ``X`` become all-zero
        columns; categories that ``fit`` never saw are dropped.
        """
        dummies = pd.get_dummies(X, prefix=self.key)
        # The earlier loop zero-filled columns *unknown* to fit and then
        # indexed by self.columns, which raised KeyError whenever a fitted
        # category was missing from X. reindex adds the missing columns
        # (filled with 0) and drops the unseen ones in a single step.
        return dummies.reindex(columns=self.columns, fill_value=0)
    

def get_metrics(y_test, probs, fstr=True):
    """
    Pick the probability threshold that maximises F1 and report metrics at it.

    Objects whose predicted probability exceeds the chosen threshold are
    treated as class 1, the rest as class 0.

    Args:
        y_test: True class labels.
        probs: Predicted scores/probabilities for the positive class.
        fstr (bool, optional): When True return a formatted report string,
            otherwise return the raw numbers. Defaults to True.

    Returns:
        If ``fstr`` is True, a multi-line string with the best threshold and
        the F1, ROC AUC, precision and recall expressed in percent.
        Otherwise the tuple
        ``(threshold, f1, roc_auc, precision, recall)`` as raw fractions.
    """
    precision, recall, thresholds = precision_recall_curve(y_test, probs)
    roc = roc_auc_score(y_test, probs)

    # F1 is 0/0 wherever precision and recall are both zero. Define it as 0
    # there: a NaN would otherwise be selected by argmax and yield a
    # meaningless "best" threshold.
    denominator = precision + recall
    fscore = np.divide(
        2 * precision * recall,
        denominator,
        out=np.zeros_like(denominator, dtype=float),
        where=denominator > 0,
    )
    # Per the scikit-learn docs the final precision/recall pair has no
    # matching threshold, so it is excluded from the search to keep ``ix``
    # a valid index into ``thresholds``.
    ix = int(np.argmax(fscore[:-1]))

    if fstr:
        return(f'Best_Threshold:\t{thresholds[ix]:.3f},\n'
               f'F1_Score:\t{(fscore[ix]*100.0):.3f}%,\n'
               f'Roc_AUC:\t{(roc*100.0):.3f}%,\n'
               f'Precision:\t{(precision[ix]*100.0):.3f}%,\n'
               f'Recall: \t{(recall[ix]*100.0):.3f}%')
    return thresholds[ix], fscore[ix], roc, precision[ix], recall[ix]
 

In [3]:
# Read both CSV files from the configured dataset directory.
train_path = f"{Config.root_dir}pulsar_data_train.csv"
test_path = f"{Config.root_dir}pulsar_data_test.csv"
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)
# The label is taken to be the last column of the training frame.
target = train_df.columns[-1]

In [4]:
# Display the raw column names of both frames.
train_df.columns, test_df.columns

(Index([' Mean of the integrated profile',
        ' Standard deviation of the integrated profile',
        ' Excess kurtosis of the integrated profile',
        ' Skewness of the integrated profile', ' Mean of the DM-SNR curve',
        ' Standard deviation of the DM-SNR curve',
        ' Excess kurtosis of the DM-SNR curve', ' Skewness of the DM-SNR curve',
        'target_class'],
       dtype='object'),
 Index([' Mean of the integrated profile',
        ' Standard deviation of the integrated profile',
        ' Excess kurtosis of the integrated profile',
        ' Skewness of the integrated profile', ' Mean of the DM-SNR curve',
        ' Standard deviation of the DM-SNR curve',
        ' Excess kurtosis of the DM-SNR curve', ' Skewness of the DM-SNR curve',
        'target_class'],
       dtype='object'))

#### Наблюдаются неудобные пробелы перед именами колонок

In [5]:
# Drop the leading/trailing whitespace from every column name.
train_df.columns = train_df.columns.str.strip()
test_df.columns = test_df.columns.str.strip()

In [6]:
# Display the column names again to confirm the whitespace is gone.
train_df.columns, test_df.columns

(Index(['Mean of the integrated profile',
        'Standard deviation of the integrated profile',
        'Excess kurtosis of the integrated profile',
        'Skewness of the integrated profile', 'Mean of the DM-SNR curve',
        'Standard deviation of the DM-SNR curve',
        'Excess kurtosis of the DM-SNR curve', 'Skewness of the DM-SNR curve',
        'target_class'],
       dtype='object'),
 Index(['Mean of the integrated profile',
        'Standard deviation of the integrated profile',
        'Excess kurtosis of the integrated profile',
        'Skewness of the integrated profile', 'Mean of the DM-SNR curve',
        'Standard deviation of the DM-SNR curve',
        'Excess kurtosis of the DM-SNR curve', 'Skewness of the DM-SNR curve',
        'target_class'],
       dtype='object'))

### Проверка данных

In [7]:
# Summarise dtypes and non-null counts of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12528 entries, 0 to 12527
Data columns (total 9 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Mean of the integrated profile                12528 non-null  float64
 1   Standard deviation of the integrated profile  12528 non-null  float64
 2   Excess kurtosis of the integrated profile     10793 non-null  float64
 3   Skewness of the integrated profile            12528 non-null  float64
 4   Mean of the DM-SNR curve                      12528 non-null  float64
 5   Standard deviation of the DM-SNR curve        11350 non-null  float64
 6   Excess kurtosis of the DM-SNR curve           12528 non-null  float64
 7   Skewness of the DM-SNR curve                  11903 non-null  float64
 8   target_class                                  12528 non-null  float64
dtypes: float64(9)
memory usage: 881.0 KB


#### Проверка наличия пропусков в одинаковых столбцах

In [8]:
# Per-column "has any missing value" flags; the last (label) column is ignored.
train_null_flags = train_df.isnull().any().to_list()
test_null_flags = test_df.isnull().any().to_list()
train_null_flags[:-1] == test_null_flags[:-1]

True

In [9]:
# Number of missing values in each training column.
train_df.isna().sum()

Mean of the integrated profile                     0
Standard deviation of the integrated profile       0
Excess kurtosis of the integrated profile       1735
Skewness of the integrated profile                 0
Mean of the DM-SNR curve                           0
Standard deviation of the DM-SNR curve          1178
Excess kurtosis of the DM-SNR curve                0
Skewness of the DM-SNR curve                     625
target_class                                       0
dtype: int64

Заполнение пропущенных значений средними значениями по столбцам

In [10]:
# Impute gaps in the feature columns only (every column except the last,
# which holds the label), and use the *training* means for both frames:
# filling the test frame with its own means would apply a different
# transformation from the one the model is trained on.
# NOTE(review): the means are computed before the hold-out split made later
# in the notebook, so hold-out rows still contribute to them.
feature_columns = train_df.columns[:-1]
train_feature_means = train_df[feature_columns].mean(axis=0)
train_df[feature_columns] = train_df[feature_columns].fillna(train_feature_means)
test_df[feature_columns] = test_df[feature_columns].fillna(train_feature_means)

In [11]:
# Missing-value counts per column after imputation, for both frames.
train_df.isna().sum(), test_df.isna().sum()

(Mean of the integrated profile                  0
 Standard deviation of the integrated profile    0
 Excess kurtosis of the integrated profile       0
 Skewness of the integrated profile              0
 Mean of the DM-SNR curve                        0
 Standard deviation of the DM-SNR curve          0
 Excess kurtosis of the DM-SNR curve             0
 Skewness of the DM-SNR curve                    0
 target_class                                    0
 dtype: int64,
 Mean of the integrated profile                     0
 Standard deviation of the integrated profile       0
 Excess kurtosis of the integrated profile          0
 Skewness of the integrated profile                 0
 Mean of the DM-SNR curve                           0
 Standard deviation of the DM-SNR curve             0
 Excess kurtosis of the DM-SNR curve                0
 Skewness of the DM-SNR curve                       0
 target_class                                    5370
 dtype: int64)

In [12]:
# Display the name of the label column.
target

'target_class'

In [13]:
# Every column except the last one is used as a model input.
features = list(train_df.columns[:-1])
features

['Mean of the integrated profile',
 'Standard deviation of the integrated profile',
 'Excess kurtosis of the integrated profile',
 'Skewness of the integrated profile',
 'Mean of the DM-SNR curve',
 'Standard deviation of the DM-SNR curve',
 'Excess kurtosis of the DM-SNR curve',
 'Skewness of the DM-SNR curve']

In [14]:
# Hold out 30% of the labelled data for evaluation. The seed comes from
# Config instead of a duplicated literal so there is one place to change it.
X_train, X_test, y_train, y_test = train_test_split(
    train_df[features],
    train_df[target],
    test_size=0.3,
    random_state=Config.seed,
)

In [15]:
# Each input column gets its own select-then-scale branch; the branches are
# concatenated side by side by the FeatureUnion.
continuous_columns = list(features)
final_transformers = [
    (
        column,
        Pipeline([
            ('selector', NumberSelector(key=column)),
            ('scaler', StandardScaler()),
        ]),
    )
    for column in continuous_columns
]

feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

In [16]:
# Feature preprocessing followed by an XGBoost classifier with default settings.
classifier = xgb.XGBClassifier()
pipeline = Pipeline(steps=[('features', feats), ('classifier', classifier)])

In [17]:
# Fit the preprocessing steps and the classifier on the training split.
pipeline.fit(X_train, y_train)





Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Mean of the integrated '
                                                 'profile',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='Mean '
                                                                                     'of '
                                                                                     'the '
                                                                                     'integrated '
                                                                                     'profile')),
                                                                 ('scaler',
                                                                  StandardScaler())])),
                                                ('Standard deviation of the '
                                           

In [18]:
# Predicted class probabilities for the hold-out split.
y_pred_proba = pipeline.predict_proba(X_test)

In [19]:
# Display the columns of the hold-out feature frame.
X_test.columns

Index(['Mean of the integrated profile',
       'Standard deviation of the integrated profile',
       'Excess kurtosis of the integrated profile',
       'Skewness of the integrated profile', 'Mean of the DM-SNR curve',
       'Standard deviation of the DM-SNR curve',
       'Excess kurtosis of the DM-SNR curve', 'Skewness of the DM-SNR curve'],
      dtype='object')

In [20]:
# Report threshold-tuned metrics; column 1 of the probability matrix is
# passed as the score (presumably the probability of class 1 — confirm
# against the classifier's classes_ ordering).
print(get_metrics(y_test, y_pred_proba[:, 1]))

Best_Threshold:	0.200,
F1_Score:	86.707%,
Roc_AUC:	97.076%,
Precision:	88.037%,
Recall: 	85.417%


In [21]:
# Pair the classifier's importances with the column names (this relies on the
# union emitting one output per column, in the order of continuous_columns)
# and keep the three highest-ranked features.
importances = pipeline.named_steps['classifier'].feature_importances_
xgb_features = pd.DataFrame(
    data=importances,
    index=continuous_columns,
    columns=['coefficient'],
)
ranked_features = xgb_features.sort_values(by='coefficient', ascending=False)
features_important = ranked_features.head(3)
features_important

Unnamed: 0,coefficient
Excess kurtosis of the integrated profile,0.566222
Skewness of the integrated profile,0.140101
Mean of the integrated profile,0.063153


In [22]:
# Names of the three selected features, as a plain list.
features_important.index.to_list()

['Excess kurtosis of the integrated profile',
 'Skewness of the integrated profile',
 'Mean of the integrated profile']

In [23]:
# Rebuild the preprocessing union using only the three selected features;
# each one again gets its own select-then-scale branch.
continuous_columns = list(features_important.index)
final_transformers = [
    (
        column,
        Pipeline([
            ('selector', NumberSelector(key=column)),
            ('scaler', StandardScaler()),
        ]),
    )
    for column in continuous_columns
]

feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

In [24]:
# Same model definition as before, now on top of the reduced feature union.
classifier = xgb.XGBClassifier()
pipeline = Pipeline(steps=[('features', feats), ('classifier', classifier)])

In [25]:
# Refit on the training split; the selectors pick their columns by name, so
# the full feature frame can be passed unchanged.
pipeline.fit(X_train, y_train)





Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Excess kurtosis of the '
                                                 'integrated profile',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='Excess '
                                                                                     'kurtosis '
                                                                                     'of '
                                                                                     'the '
                                                                                     'integrated '
                                                                                     'profile')),
                                                                 ('scaler',
                                                                  StandardScaler())])),
           

In [26]:
# Hold-out probabilities from the reduced-feature pipeline.
y_pred_proba = pipeline.predict_proba(X_test)

In [27]:
# Same metric report as before, for the reduced-feature pipeline.
print(get_metrics(y_test, y_pred_proba[:, 1]))

Best_Threshold:	0.350,
F1_Score:	85.231%,
Roc_AUC:	95.875%,
Precision:	88.217%,
Recall: 	82.440%


In [30]:
# Serialise the fitted pipeline (preprocessing + classifier) to disk with dill.
with open("pulsar_model.dill", "wb") as f:
    dill.dump(pipeline, f)

In [31]:
# Read the serialised pipeline back from disk.
# NOTE: dill, like pickle, can execute arbitrary code while loading — only
# load files from a trusted source.
with open("pulsar_model.dill", "rb") as model_file:
    model = dill.load(model_file)

In [44]:
# Feature values of one example observation, used below to try out the
# reloaded model.
Excess_kurtosis_of_the_integrated_profile = -0.234571
Skewness_of_the_integrated_profile = -0.699648
Mean_of_the_integrated_profile = 140.562500

In [45]:
# Wrap the single observation in a one-row frame keyed by the training column
# names, then ask the reloaded model for class probabilities.
sample = pd.DataFrame({
    "Excess kurtosis of the integrated profile": [Excess_kurtosis_of_the_integrated_profile],
    "Skewness of the integrated profile": [Skewness_of_the_integrated_profile],
    "Mean of the integrated profile": [Mean_of_the_integrated_profile],
})
preds = model.predict_proba(sample)

In [46]:
# Display the predicted probabilities for the sample row.
preds

array([[9.994132e-01, 5.867855e-04]], dtype=float32)

In [29]:
from time import strftime

# Print a timestamp-prefixed log line with three placeholder field values.
description = company_profile = benefits = 1
dt = strftime("[%Y-%b-%d %H:%M:%S]")
print(f'{dt} Data: description={description}, company_profile={company_profile}, benefits={benefits}')

[2021-May-30 13:57:58] Data: description=1, company_profile=1, benefits=1
