### Обучение пайплайна

1. Загрузим данные https://www.kaggle.com/shivamb/real-or-fake-fake-jobposting-prediction
2. Соберем пайплайн с простейшим препроцессингом (tfidf) на текстовых данных
3. Обучим логистическую регрессию и сохраним на диск предобученный пайплайн

In [9]:
import pandas as pd
import dill
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
#from sklearn.metrics import roc_auc_score,roc_curve,scorer
from sklearn.metrics import f1_score
#working with text
from sklearn.feature_extraction.text import TfidfVectorizer
#normalizing data
from sklearn.preprocessing import StandardScaler
#pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
#from sklearn.metrics import precision_score,recall_score
#imputer
from sklearn.impute import SimpleImputer

import sklearn.datasets

Загрузим данные

In [4]:
df = pd.read_csv("C:\\Users\\Vera\\Desktop\\Lection9\\news.csv")
df.head(3)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL


Разделим данные на train/test и сохраним тестовую выборку на диск (здесь мы ее касаться уже не будем)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df, 
                                                    df['label'], test_size=0.33, random_state=42)
#save test
X_test.to_csv("X_test.csv", index=None)
y_test.to_csv("y_test.csv", index=None)
#save train
X_train.to_csv("X_train.csv", index=None)
y_train.to_csv("y_train.csv", index=None)

In [6]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class TextImputer(BaseEstimator, TransformerMixin):
    def __init__(self, key, value):
        self.key = key
        self.value = value
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X[self.key] = X[self.key].fillna(self.value)
        return X

In [7]:
features = ['title', 'text']
target = 'label'

Соберем кусок, ответственный за feature engineering

In [10]:
#combine
title = Pipeline([
                ('imputer', TextImputer('title', '')),
                ('selector', ColumnSelector(key='title')),
                ('tfidf', TfidfVectorizer(max_df=0.9, min_df=10))
            ])
text = Pipeline([
                ('imputer', TextImputer('text', '')),
                ('selector', ColumnSelector(key='text')),
                ('tfidf', TfidfVectorizer(max_df=0.9, min_df=10))
            ])


feats = FeatureUnion([('title', title),
                      ('text', text)])

Добавим простейший классификатор

In [11]:
%%time

pipeline = Pipeline([
    ('features',feats),
    ('classifier', LogisticRegression()),
])

pipeline.fit(X_train, y_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[self.key] = X[self.key].fillna(self.value)


Wall time: 1.92 s


Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('title',
                                                 Pipeline(steps=[('imputer',
                                                                  TextImputer(key='title',
                                                                              value='')),
                                                                 ('selector',
                                                                  ColumnSelector(key='title')),
                                                                 ('tfidf',
                                                                  TfidfVectorizer(max_df=0.9,
                                                                                  min_df=10))])),
                                                ('text',
                                                 Pipeline(steps=[('imputer',
                                                                  TextImputer(ke

Посмотрим, как выглядит наш pipeline

In [12]:
pipeline.steps

[('features',
  FeatureUnion(transformer_list=[('title',
                                  Pipeline(steps=[('imputer',
                                                   TextImputer(key='title',
                                                               value='')),
                                                  ('selector',
                                                   ColumnSelector(key='title')),
                                                  ('tfidf',
                                                   TfidfVectorizer(max_df=0.9,
                                                                   min_df=10))])),
                                 ('text',
                                  Pipeline(steps=[('imputer',
                                                   TextImputer(key='text',
                                                               value='')),
                                                  ('selector',
                                               

Сохраним модель (пайплайн)

In [13]:
with open("logreg_pipeline.dill", "wb") as f:
    dill.dump(pipeline, f)