In [None]:
import pandas as pd
import numpy as np

from nltk.stem import SnowballStemmer

from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the training frame; later cells read its 'sentence' and 'label' columns.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
df = pd.read_pickle('../../data/train.pkl')
# Show a short preview rather than rendering the whole frame as cell output.
df.head()

In [None]:
# Hold out part of the data for final evaluation. random_state is pinned so the
# shuffle (and therefore the partition) is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['sentence'].values,
    df['label'].values,
    random_state=0,
)

In [None]:
# Single shared stemmer instance, reused by every analyzer call.
sb_stemmer = SnowballStemmer(language='english')


class Snowball_CountVectorizer(CountVectorizer):
    """CountVectorizer that Snowball-stems the features produced by its analyzer.

    The parent analyzer runs first (preprocessing, tokenization, stop-word
    filtering and n-gram generation as configured on the vectorizer); stemming
    is then applied to what it emits. All constructor parameters are those of
    CountVectorizer, so the class can be tuned through the usual
    ``min_df`` / ``max_df`` / ``ngram_range`` / ... grid-search keys.
    """

    def build_analyzer(self):
        """Return a callable mapping one raw document to a list of stemmed features."""
        analyzer = super(Snowball_CountVectorizer, self).build_analyzer()

        def stem_feature(feature):
            # With ngram_range > (1, 1) the word analyzer emits n-grams as
            # space-joined tokens. Stem each token on its own; stemming the
            # joined string would treat the whole n-gram as a single word.
            return ' '.join(sb_stemmer.stem(token) for token in feature.split(' '))

        return lambda doc: [stem_feature(feature) for feature in analyzer(doc)]

In [None]:
# Pipeline: a feature-extraction stage followed by a logistic-regression
# classifier. The FeatureUnion currently holds a single transformer; the step
# names ('features', 'counts', 'classifier') are what grid-search parameter
# keys such as 'features__counts__min_df' refer to.
pipeline = Pipeline(steps=[
    (
        'features',
        FeatureUnion(transformer_list=[
            ('counts', Snowball_CountVectorizer()),
        ]),
    ),
    ('classifier', LogisticRegression(solver='liblinear')),
])

In [None]:
# Grid of vectorizer settings to search. Each keyword is a
# '<step>__<transformer>__<param>' path into the pipeline.
parameters = dict(
    # Integer min_df / max_df values are absolute document counts, not
    # proportions. NOTE(review): max_df <= 40 drops every term that occurs in
    # more than 40 documents -- confirm this is intended for the corpus size.
    features__counts__min_df=[1, 2, 3, 5],
    features__counts__max_df=[10, 20, 25, 30, 40],
    # Either the built-in English stop-word list or no stop-word filtering.
    features__counts__stop_words=['english', None],
    # Unigrams only, up to bigrams, up to trigrams.
    features__counts__ngram_range=[(1, 1), (1, 2), (1, 3)],
    features__counts__lowercase=[True, False],
)

In [None]:
# Exhaustive search over `parameters` with 5-fold cross-validation, using 6
# parallel workers.
clf = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
    n_jobs=6,
)

In [None]:
# Run the grid search on the training split only; the test split stays unseen.
clf.fit(X=X_train, y=y_train)

In [None]:
# Show every parameter (including nested step parameters) of the best pipeline
# found by the search.
clf.best_estimator_.get_params(deep=True)

In [None]:
# Mean cross-validated score of the best parameter combination. Use the
# attribute GridSearchCV exposes for this rather than max() over
# cv_results_['mean_test_score']: Python's max() gives NaN or an
# order-dependent answer if any candidate's fit failed and was scored as NaN.
clf.best_score_

In [None]:
# Persist the best pipeline's parameters as plain text.
# NOTE(review): despite the file name, this writes the full get_params() dict
# of the best estimator, not just clf.best_params_ -- confirm that is intended.
with open('best_params_logreg.txt', 'w') as out_file:
    out_file.write(str(clf.best_estimator_.get_params()))