### Обучение пайплайна
1. Загрузим данные 
	"https://www.kaggle.com/datatattle/covid-19-nlp-text-classification"
2. Соберем пайплайн с простейшим препроцессингом (tfidf) на текстовых данных
3. Обучим логистическую регрессию и сохраним на диск предобученный пайплайн

In [1]:
import numpy as np
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import pickle
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Home\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Home\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
import pandas as pd
df = pd.read_csv('./data/Corona_NLP_train.csv', encoding='ISO-8859-1')
df['OriginalTweet'].head(1)

0    @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...
Name: OriginalTweet, dtype: object

In [3]:
X_trein = df['OriginalTweet']
Y_train = df['Sentiment']

In [4]:
df['Sentiment'].unique()

array(['Neutral', 'Positive', 'Extremely Negative', 'Negative',
       'Extremely Positive'], dtype=object)

In [5]:
Sentiment_dictionary = {'Neutral': 0, 'Positive': 1,
                        'Extremely Negative': 2, 'Negative': 3,
                        'Extremely Positive': 4}
df['target'] = df['Sentiment'].apply(lambda x: Sentiment_dictionary[x])
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,target
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,0
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive,1
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive,1
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive,1
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative,2


In [6]:
from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i,'',input_txt)
    return input_txt
df['Tweet'] = np.vectorize(remove_pattern)(df['OriginalTweet'], '@[\w]*')
df['Tweet'] = df['Tweet'].apply(lambda x: re.split('https:\/\/.*', str(x))[0])
df['Tweet'] = df['Tweet'].str.replace('[^a-zA-Z#]+',' ')
df['Tweet'] = [stemmer.lemmatize(word) for word in df['Tweet']]
df['Tweet'] = df['Tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 2]))
df['Tweet'] = df['Tweet'].apply(lambda x: x.split())
df['Tweet'] = df['Tweet'].apply(lambda x: ''.join(w+" " for w in x))


  df['Tweet'] = df['Tweet'].str.replace('[^a-zA-Z#]+',' ')


In [7]:
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,target,Tweet
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,0,
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive,1,advice Talk your neighbours family exchange ph...
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive,1,Coronavirus Australia Woolworths give elderly ...
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive,1,food stock not the only one which empty PLEASE...
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative,2,ready supermarket during the #COVID outbreak N...


In [8]:
df["Tweet"].describe()

count     41157
unique    40664
top            
freq        192
Name: Tweet, dtype: object

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df, df['target'], test_size=0.2, random_state=0)


In [10]:
from sklearn.base import BaseEstimator, TransformerMixin
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class TextImputer(BaseEstimator, TransformerMixin):
    def __init__(self, key, value):
        self.key = key
        self.value = value
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X[self.key] = X[self.key].fillna(self.value)
        return X
        


In [11]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
tweet = Pipeline([
                ('imputer', TextImputer('Tweet', '')),
                ('selector', ColumnSelector(key='Tweet')),
                ('tfidf', TfidfVectorizer(max_df=0.9, min_df=10))
            ])

feats = FeatureUnion([('tweet', tweet)])

In [12]:
df['Tweet']

0                                                         
1        advice Talk your neighbours family exchange ph...
2        Coronavirus Australia Woolworths give elderly ...
3        food stock not the only one which empty PLEASE...
4        ready supermarket during the #COVID outbreak N...
                               ...                        
41152    Airline pilots offering stock supermarket shel...
41153    Response complaint not provided citing COVID r...
41154    You know getting tough when rationing toilet p...
41155    wrong that the smell hand sanitizer starting t...
41156    Well new used Rift are going for Amazon althou...
Name: Tweet, Length: 41157, dtype: object

In [16]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
pipeline = Pipeline([
    ('features',feats),
    ('classifier',SGDClassifier(random_state=42)),
])

pipeline.fit(X_train, y_train)

Wall time: 1.09 s


Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('tweet',
                                                 Pipeline(steps=[('imputer',
                                                                  TextImputer(key='Tweet',
                                                                              value='')),
                                                                 ('selector',
                                                                  ColumnSelector(key='Tweet')),
                                                                 ('tfidf',
                                                                  TfidfVectorizer(max_df=0.9,
                                                                                  min_df=10))]))])),
                ('classifier', SGDClassifier(random_state=42))])

In [17]:
import dill
with open("SGDClassifier_pipeline.dill", "wb") as f:
    dill.dump(pipeline, f)