## 0. Feature Engineering
    A. Tokenization
    B. punctuation & stopwords
    C. Lemmatization
    
    D. Missing Value drop
    E. Unbalance - Subsampling
    


## 1. Text to Features
    A. LDA Topic Modeling
    B. Word Embedding (Word2Vec/GloVe)
    C. TF-IDF
    D. Ensemble: 
        a. NER + LDA
        b. Word Embedding + LDA + NER
        
    
    
## 2. Classification
    A. ensemble methods(rf/boosting)
    B. XG Boost
    C. SVM
    D. NN

## Test
get rid of < > (using split )

In [1]:
import pandas as pd
import numpy as np
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.data import load
from nltk.corpus import wordnet


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('tagsets')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Data Import 

In [4]:
rand_state = 48

In [5]:
df = pd.read_csv("train1.csv",index_col = 0)
df.head()

Unnamed: 0_level_0,TITLE,TOPIC
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,RITE AID CORP <RAD> SETS DIVIDEND,0
1,DEL E. WEBB INVESTMENT <DWPA> 4TH QTR NET,0
2,GENERAL HOST CORP <GH> SETS QUARTERLY,0
3,PROFESSOR LIFTS BANC TEXAS <BTX> PREFERRED STAKE,1
4,WINCHELL'S DONUT <WDH> SETS INITIAL QUARTERLY,0


In [6]:
#check if it's unbalance
df.groupby("TOPIC").count()

Unnamed: 0_level_0,TITLE
TOPIC,Unnamed: 1_level_1
0,3107
1,2406
2,2404


In [7]:
#check missing value
df[df["TITLE"].isna()]

Unnamed: 0_level_0,TITLE,TOPIC
ID,Unnamed: 1_level_1,Unnamed: 2_level_1


# Feature Engineering

In [8]:
stop = set(stopwords.words("english"))
punc = set(string.punctuation)
punc.update(["--","jan","january","feb","february","mar","march","apr","april","may","jun","june","jul","july",
             "aug","august","sept","september","oct","october","nov","november","dec","december"])
lemma = WordNetLemmatizer()
treetags = load('help/tagsets/upenn_tagset.pickle').keys()

posdic = dict(map(lambda x: (x, wordnet.ADJ) if x.startswith('J')
 else((x, wordnet.ADV) if x.startswith('R')
    else((x, wordnet.VERB) if x.startswith('V')
         else(x, wordnet.NOUN))), treetags))


In [9]:
def clean(sent):
    sent = sent.lower().replace("<"," ").replace(">", " ").split()
    sent = [lemma.lemmatize(word, posdic[pos]) for (word,pos) in pos_tag(sent) 
            if (word.isalpha()) & (word not in stop) & (word not in punc) ]  
    return " ".join(word for word in sent)

In [10]:
df["FEATURE"] = [clean(row) for row in df["TITLE"]]

# Seperate Modeling

### TFIDF & Topic Modeling

In [11]:
import time
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation as LDA


In [11]:
vectorizer  = CountVectorizer(min_df=2,ngram_range=(1,3))
n_gram = vectorizer.fit_transform(df["FEATURE"])
feature_names = vectorizer.get_feature_names()

In [12]:
n_gram

<7917x10091 sparse matrix of type '<class 'numpy.int64'>'
	with 63471 stored elements in Compressed Sparse Row format>

In [13]:
tfidf_gram = TfidfTransformer(norm = "l1").fit_transform(n_gram)

In [14]:
tfidf_gram

<7917x10091 sparse matrix of type '<class 'numpy.float64'>'
	with 63471 stored elements in Compressed Sparse Row format>

for i in range(2,7):
    lda = LDA(n_components=i,learning_method='batch', max_iter=100,random_state=rand_state).fit(n_gram)
    perplexity = lda.perplexity(n_gram)
    score = lda.score(n_gram)
    print('\nPerplexity of %d topics: ' %i, perplexity)
    print('score: ', score)

In [15]:
lda = LDA(n_components=3,learning_method='batch',random_state=rand_state).fit(n_gram)
lda_data = lda.transform(n_gram)

In [16]:
topics = lda.components_
topics

array([[2.1784386 , 0.33701549, 2.32817181, ..., 0.38196381, 0.41092557,
        2.31784797],
       [0.33486781, 2.32906653, 0.33368499, ..., 3.28391195, 2.15963721,
        0.3484502 ],
       [0.48669359, 0.33391798, 0.33814319, ..., 0.33412424, 0.42943723,
        0.33370182]])

In [17]:
num_words  = 10
for _, topic in enumerate(topics):
    print (",".join([feature_names[i] for i in topic.argsort()[:-num_words - 1:-1]]))

qtr,net,inc,qtr net,corp,loss,year,co,qtr loss,dividend
say,oil,bank,raise,price,unit,rate,pct,buy,set
pct,market,money,mln,stock,rise,see,japan,set,buy


# Training

In [74]:
param_grid = {
        'C': [0.1,1,10,50],
        'kernel': ['linear', 'rbf'],
        'gamma': [0.01, 0.1, 1]     
    }

In [75]:
grid = GridSearchCV(estimator = svm.SVC(), cv=5, n_jobs=-1, param_grid=param_grid, scoring = "accuracy",verbose = 20)

In [None]:
grid.fit(lda_data, y_train)

In [None]:
print_results(clf)

### Pipeline

In [None]:
def print_results(grid):
    print("best score: ", grid.best_score_)
    print("best parameters: ", grid.best_params_)

### Random Forest

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost

In [14]:
X_train = df["FEATURE"]
y_train = df["TOPIC"]

In [15]:
estimators = [('vectorizer', CountVectorizer()), 
              ('tfidf',TfidfTransformer()),
#                ('lda', LDA(learning_method='batch',max_iter=10,random_state=rand_state)), 
               ('rf', RandomForestClassifier())]

In [16]:
pipe = Pipeline(memory=None,steps = estimators)

In [17]:
param_grid = [
    {
        'vectorizer__ngram_range': [(1,4),(1,3)],
        'vectorizer__min_df': [3, 4],
#         'lda__n_components': [5,7,9],
        'tfidf__norm': ['l1','l2'],
        'tfidf__use_idf': ['True','False'],
        'rf__n_estimators': [1000,2000,2500],
        'rf__max_features': ["sqrt","log2"],
        'rf__max_depth':  [50,70],
        'rf__min_samples_split': [20,30],
        'rf__min_samples_leaf': [2, 3],
        'rf__bootstrap': [True]
        
    }
]

In [18]:
grid_rf = GridSearchCV(pipe, cv=5, n_jobs=-1, param_grid=param_grid, scoring = "accuracy",verbose=10)

In [None]:
grid_rf.fit(X_train, y_train)

In [103]:
print_results(grid_rf)

best score:  0.8505747126436781
best parameters:  {'rf__n_estimators': 2000, 'tfidf__norm': 'l2', 'rf__bootstrap': True, 'vectorizer__min_df': 3, 'vectorizer__ngram_range': (1, 3), 'rf__min_samples_leaf': 2, 'rf__max_features': 'log2', 'rf__max_depth': 50, 'rf__min_samples_split': 20}


best score:  0.8505747126436781
best parameters:  {'rf__n_estimators': 2000, 'tfidf__norm': 'l2', 'rf__bootstrap': True, 'vectorizer__min_df': 3, 
        'vectorizer__ngram_range': (1, 3), 'rf__min_samples_leaf': 2, 'rf__max_features': 'log2', 
                   'rf__max_depth': 50, 'rf__min_samples_split': 20}

### Logistic Regression

In [83]:
estimators = [('vectorizer', CountVectorizer()), 
              ('tfidf',TfidfTransformer()),
#                ('lda', LDA(learning_method='batch',random_state=rand_state)), 
               ('logit', LogisticRegression())]

In [84]:
pipe = Pipeline(memory=None,steps = estimators)

In [85]:
param_grid = [
    {
        'vectorizer__ngram_range': [(1,5),(1,4),(1,3)],
        'vectorizer__min_df': [3,4],
        'tfidf__norm': ['l1','l2'],
        'tfidf__use_idf': ['True','False'],
#         'lda__n_components': [8,10,12],
        'logit__penalty': ['l1', 'l2'], 
        'logit__C': [0.1,1,10]      
    }
]

In [86]:
grid_logit = GridSearchCV(pipe, cv=5, n_jobs=-1, param_grid=param_grid, scoring = "accuracy", verbose=10)

In [None]:
grid_logit.fit(X_train, y_train)

In [88]:
print_results(grid_logit)

best score:  0.8880889225716811
best parameters:  {'vectorizer__ngram_range': (1, 3), 'vectorizer__min_df': 3, 'tfidf__norm': 'l2', 'logit__penalty': 'l2', 'logit__C': 10}


### SVM-SVC

In [20]:
estimators = [('vectorizer', CountVectorizer()), 
              ('tfidf',TfidfTransformer()),
#                ('lda', LDA(learning_method='batch',random_state=rand_state)), 
               ('svc', svm.SVC())]

pipe = Pipeline(memory=None,steps = estimators)

In [24]:
param_grid = [
    {
        'vectorizer__ngram_range': [(1,2),(1,3),(1,4)],
        'vectorizer__min_df': [2,3],
        'tfidf__norm': ['l1','l2'],
        'tfidf__use_idf': ['True','False'],
#         'lda__n_components': [4,7,10],
        'svc__C': [0.1,1,10],
        'svc__kernel': ['rbf'],
        'svc__degree': [2,4,6],
        'svc__gamma': [0.5, 1]
    
    }
]

In [25]:
grid_svc = GridSearchCV(pipe, cv=5, n_jobs=-1, param_grid=param_grid, scoring = "accuracy",verbose=10)

In [26]:
grid_svc.fit(X_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 2) 
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 2) 
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 2) 
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 2) 
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 2) 
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3) 
[

[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   25.7s


[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3), score=0.8149084017687934, total=  20.7s
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 2), score=0.815540113708149, total=  18.8s
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3), score=0.46430827542640557, total=  20.1s
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__us

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   30.8s


[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3), score=0.4810606060606061, total=  22.7s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3) 
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4), score=0.6614024005053696, total=  20.0s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_r

[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:   32.8s


[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 2), score=0.8325963360707518, total=  20.1s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3), score=0.6708780795957043, total=  22.3s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_r

[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:   44.6s


[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 2), score=0.8154235145385588, total=  20.4s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3), score=0.8168035375868604, total=  20.3s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3), score=0.8084702907711757, total=  17.8s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=

[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:  1.0min


[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4), score=0.8035375868603917, total=  23.5s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 2), score=0.7231352718078382, total=  20.5s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4) 
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3), score=0.7161820480404552, total=  20.1s
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=Tru

[Parallel(n_jobs=-1)]: Done  85 tasks      | elapsed:  1.1min


[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4), score=0.713653603034134, total=  27.9s
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4) 
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4), score=0.7176247631080227, total=  23.7s
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4) 
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 2), score=0.713653603034134, total=  26.9s
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=Tr

[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:  1.5min


[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3), score=0.7915350600126342, total=  25.7s
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 2) 
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4), score=0.7909033480732786, total=  23.7s
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4), score=0.7964601769911505, total=  25.1s
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=

[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed:  1.6min


[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 3), score=0.4674668351231838, total=  23.2s
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 2), score=0.6227129337539432, total=  27.3s
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__n

[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.9min


[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4), score=0.6696146557169931, total=  21.5s
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4), score=0.6649810366624526, total=  21.3s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4) 
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 2) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4), score=0.6496212121212122, total=  22.5s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=

[Parallel(n_jobs=-1)]: Done 169 tasks      | elapsed:  2.1min


[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 2), score=0.807570977917981, total=  22.4s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4), score=0.7854889589905363, total=  27.3s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4), score=0.7927984838913456, total=  25.2s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=T

[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.4min


[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 2), score=0.6952681388012618, total=  19.4s
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2) 
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4), score=0.7037271004421983, total=  25.5s
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2) 
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4), score=0.6883280757097792, total=  29.7s
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=

[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 3), score=0.8232323232323232, total=  22.5s
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3) 


[Parallel(n_jobs=-1)]: Done 217 tasks      | elapsed:  2.6min


[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 2), score=0.8294377763739734, total=  22.7s
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4) 
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4), score=0.7789008212255212, total=  19.7s
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4) 
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 3), score=0.8059418457648546, total=  22.0s
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=T

[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2) 
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4), score=0.7983565107458913, total=  24.4s
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2) 


[Parallel(n_jobs=-1)]: Done 242 tasks      | elapsed:  2.9min


[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3), score=0.46430827542640557, total=  18.4s
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4), score=0.450063211125158, total=  16.9s
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3), score=0.4810606060606061, total=  22.3s
[CV] svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__us

[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3), score=0.6683512318382817, total=  23.6s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4), score=0.46464646464646464, total=  28.0s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_

[Parallel(n_jobs=-1)]: Done 269 tasks      | elapsed:  3.2min


[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3), score=0.8207070707070707, total=  19.3s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4), score=0.6614024005053696, total=  23.8s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4), score=0.6496212121212122, total=  23.2s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=

[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4), score=0.8035375868603917, total=  18.2s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4) 
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2), score=0.8275426405559065, total=  23.2s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4) 


[Parallel(n_jobs=-1)]: Done 296 tasks      | elapsed:  3.5min


[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3), score=0.8079595704358813, total=  23.5s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4) 
[CV]  svc__C=0.1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4), score=0.8018927444794953, total=  22.8s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2) 
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2), score=0.7049905243209097, total=  20.1s
[CV] svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=Tr

[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4), score=0.7176247631080227, total=  19.9s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3) 
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4), score=0.711489898989899, total=  20.5s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1

[Parallel(n_jobs=-1)]: Done 325 tasks      | elapsed:  3.8min


[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4) 
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3), score=0.7024636765634871, total=  23.5s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4) 
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3), score=0.7161820480404552, total=  23.4s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 2) 
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(

[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3) 
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3), score=0.8232323232323232, total=  27.3s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3), score=0.7929292929292929, total=  21.1s
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(

[Parallel(n_jobs=-1)]: Done 354 tasks      | elapsed:  4.1min


[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 2), score=0.8704990524320909, total=  16.3s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4) 
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4), score=0.7909033480732786, total=  24.8s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4) 
[CV]  svc__C=0.1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4), score=0.8041692987997473, total=  18.5s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, 

[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4), score=0.8482932996207333, total=  18.8s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3), score=0.8737373737373737, total=  13.8s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3),

[Parallel(n_jobs=-1)]: Done 385 tasks      | elapsed:  4.3min


[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4), score=0.8618296529968454, total=  15.8s
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4), score=0.8660770688566014, total=  15.7s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4) 
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4) 
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4), score=0.8724747474747475, total=  15.5s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__d

[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3), score=0.8964646464646465, total=  13.2s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 2), score=0.8801261829652997, total=  15.0s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 2), score=0.8856601389766267, total=  14.1s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__d

[Parallel(n_jobs=-1)]: Done 416 tasks      | elapsed:  4.5min


[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 2), score=0.8818698673404928, total=  10.1s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4) 
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4), score=0.9027163613392293, total=  18.7s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 2) 
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4), score=0.8964646464646465, total=  14.9s
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__

[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4), score=0.8719242902208202, total=  11.5s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 3) 
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4), score=0.8819444444444444, total=  17.6s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4)

[Parallel(n_jobs=-1)]: Done 449 tasks      | elapsed:  4.8min


[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4), score=0.8787113076437144, total=  14.9s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 2) 
[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 2), score=0.9015151515151515, total=  12.8s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 2) 
[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4), score=0.8902208201892744, total=  16.2s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__d

[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4), score=0.886435331230284, total=  16.8s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4) 
[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 4), score=0.901452937460518, total=  15.1s
[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 2), score=0.9027163613392293, total=  17.5s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4) 
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__deg

[Parallel(n_jobs=-1)]: Done 482 tasks      | elapsed:  5.0min


[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=2, vectorizer__ngram_range=(1, 3), score=0.9020846493998737, total=  18.1s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 2), score=0.861198738170347, total=  12.1s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 2), score=0.8717624763108023, total=   8.4s
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, sv

[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 2), score=0.8820189274447949, total=  13.5s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4) 
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 3), score=0.8711307643714467, total=  15.2s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 2) 
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4), score=0.8618296529968454, total=  16.4s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__d

[Parallel(n_jobs=-1)]: Done 517 tasks      | elapsed:  5.3min


[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 3), score=0.9002525252525253, total=  16.3s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4) 
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4), score=0.8813880126182966, total=  10.3s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4) 
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 2), score=0.8801261829652997, total=  14.4s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__d

[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 2), score=0.865615141955836, total=  14.0s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 2), score=0.8742893240682249, total=  16.8s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 3), score=0.8761844598862919, total=  13.5s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4

[Parallel(n_jobs=-1)]: Done 552 tasks      | elapsed:  5.6min


[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4), score=0.8787113076437144, total=  12.1s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2) 
[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 3), score=0.8888888888888888, total=  13.7s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2) 
[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4), score=0.8749210360075805, total=  16.8s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__d

[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 2), score=0.9027163613392293, total=  18.0s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 2), score=0.8912768647281921, total=  17.9s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4) 
[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 4), score=0.8902208201892744, total=  23.0s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__d

[Parallel(n_jobs=-1)]: Done 589 tasks      | elapsed:  5.9min


[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 3), score=0.8906447534766119, total=  15.9s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 3), score=0.903348073278585, total=  26.0s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=4, vectorizer__ngram_range=(1, 2), score=0.9015151515151515, total=  24.6s
[CV] svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__de

[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2) 
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4), score=0.8630050505050505, total=  19.9s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2) 
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2), score=0.8660770688566014, total=  16.6s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2) 
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4),

[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3), score=0.865992414664981, total=  18.3s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4) 
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2), score=0.8906447534766119, total=  16.4s
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2), score=0.9020846493998737, total=  16.6s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__de

[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2) 
[CV]  svc__C=1, svc__gamma=0.5, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4), score=0.9046114971572963, total=  15.8s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2) 
[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2), score=0.8742893240682249, total=  15.0s
[CV] svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3) 
[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l1, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3), sco

[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3), score=0.8883280757097792, total=  19.2s
[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3), score=0.898989898989899, total=  18.7s
[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3), score=0.8926089703095389, total=  17.8s
[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3), score=0.8893805309734514, total=  17.5s


[Parallel(n_jobs=-1)]: Done 698 out of 720 | elapsed:  6.8min remaining:   12.8s


[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 3), score=0.903348073278585, total=  17.7s
[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2), score=0.8912768647281921, total=  10.0s
[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2), score=0.8858044164037855, total=  12.8s
[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=2, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 4), score=0.8902208201892744, total=  14.6s
[CV]  svc__C=1, svc__gamma=1, svc__kernel=rbf, tfidf__norm=l2, vectorizer__min_df=3, tfidf__use_idf=True, svc__degree=6, vectorizer__ngram_range=(1, 2), score=0.9015151515151515, total=  11.7s
[CV]  svc__C=1, svc__gamma=1, svc__k

[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:  6.8min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'svc__C': [0.1, 1], 'tfidf__norm': ['l1', 'l2'], 'svc__degree': [2, 4, 6], 'svc__kernel': ['rbf'], 'vectorizer__ngram_range': [(1, 2), (1, 3), (1, 4)], 'svc__gamma': [0.5, 1], 'tfidf__use_idf': ['True'], 'vectorizer__min_df': [2, 3]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=10)

In [27]:
print_results(grid_svc)

best score:  0.8947833775419982
best parameters:  {'svc__C': 1, 'svc__gamma': 1, 'svc__kernel': 'rbf', 'tfidf__norm': 'l2', 'vectorizer__ngram_range': (1, 2), 'vectorizer__min_df': 3, 'svc__degree': 2, 'tfidf__use_idf': 'True'}


### Stochastic Gradient Descent

In [138]:
estimators = [('vectorizer', CountVectorizer()), 
              ('tfidf',TfidfTransformer()),
#                ('lda', LDA(learning_method='batch',random_state=rand_state)), 
               ('sgd', SGDClassifier())]

pipe = Pipeline(memory=None,steps = estimators)

In [148]:
param_grid = [
    {
        'vectorizer__ngram_range': [(1,1),(1,2),(1,3)],
        'vectorizer__min_df': [3,4],
        'tfidf__norm': ['l1','l2'],
        'sgd__loss': ['hinge', 'log'],
        'sgd__penalty': ['l1','l2','elasticnet'],
        'sgd__alpha': [0.0001, 0.001, 0.01, 0.1]
    
    }
]

In [150]:
grid_sgd = GridSearchCV(pipe, cv=5, n_jobs=-1, param_grid=param_grid, scoring = "accuracy",verbose=10)

In [None]:
grid_sgd.fit(X_train, y_train)

In [152]:
print_results(grid_sgd)

best score:  0.8890994063407857
best parameters:  {'tfidf__norm': 'l2', 'sgd__alpha': 0.0001, 'vectorizer__min_df': 3, 'vectorizer__ngram_range': (1, 3), 'sgd__loss': 'hinge', 'sgd__penalty': 'l2'}


### KNN

In [223]:
estimators = [('vectorizer', CountVectorizer()), 
              ('tfidf',TfidfTransformer()),
#                ('lda', LDA(learning_method='batch',random_state=rand_state)), 
               ('knn', KNeighborsClassifier())]

pipe = Pipeline(memory=None,steps = estimators)

In [224]:
param_grid = [
    {
        'vectorizer__ngram_range': [(1,2),(1,3)],
        'vectorizer__min_df': [2,3],
        'tfidf__norm': ['l1','l2'],
        'knn__n_neighbors': [10,25,40],
        'knn__weights': ['uniform','distance'],
        'knn__algorithm':['brute'],
#         'knn__leaf_size':[10,30,50],
        'knn__p':[1,2]
    }
]

In [225]:
grid_knn = GridSearchCV(pipe, cv=5, n_jobs=-1, param_grid=param_grid, scoring = "accuracy",verbose=10)

In [None]:
grid_knn.fit(X_train, y_train)

In [186]:
print_results(grid_knn)

best score:  0.8556271314892004
best parameters:  {'tfidf__norm': 'l2', 'knn__weights': 'distance', 'knn__n_neighbors': 25, 'vectorizer__ngram_range': (1, 2), 'vectorizer__min_df': 3, 'knn__algorithm': 'ball_tree'}


### Naive Bayes

In [54]:
estimators = [('vectorizer', CountVectorizer()), 
              ('tfidf',TfidfTransformer()),
#                ('lda', LDA(learning_method='batch',random_state=rand_state)), 
               ('mnb', MultinomialNB())]

pipe = Pipeline(memory=None,steps = estimators)

In [55]:
param_grid = [
    {
        'vectorizer__ngram_range': [(1,2),(1,3),(1,4)],
        'vectorizer__min_df': [1,2,3],
        'tfidf__norm': ['l1','l2'],
        'tfidf__use_idf': ['True','False'],
#         'lda__n_components': [4,7,10],
        'mnb__alpha': [0.1,0.5,1,10],
        'mnb__fit_prior': [True, False]
    
    }
]

In [56]:
grid_mnb = GridSearchCV(pipe, cv=5, n_jobs=-1, param_grid=param_grid, scoring = "accuracy", verbose=10)

In [None]:
grid_mnb.fit(X_train, y_train)

In [222]:
print_results(grid_mnb)

best score:  0.8769736011115321
best parameters:  {'vectorizer__ngram_range': (1, 4), 'tfidf__norm': 'l2', 'mnb__alpha': 0.5, 'mnb__fit_prior': False, 'vectorizer__min_df': 1, 'tfidf__use_idf': 'True'}


### XGBoost

In [63]:
from xgboost import XGBClassifier

In [72]:
estimators = [('vectorizer', CountVectorizer()), 
              ('tfidf',TfidfTransformer()),
#                ('lda', LDA(learning_method='batch',random_state=rand_state)), 
               ('xgb', XGBClassifier(n_jobs=22, objective='multi:softmax', eval_metric = 'merror'))]
pipe = Pipeline(memory=None, steps = estimators)

In [73]:
param_grid = [
    {
#         'vectorizer__ngram_range': [(1,2),(1,3)],
#         'vectorizer__min_df': [2, 3, 4],
#         'tfidf__norm': ['l1','l2'],
#         'tfidf__use_idf': ['True','False'],
        'xgb__max_depth': [8,9,10],
        'xgb__learning_rate': [0.1],
        'xgb__n_estimators':[300,350,400],
        'xgb__gamma': [0.1],
        'xgb__subsample': [0.5,0.6]
    
    }
]

In [74]:
grid_xgb= GridSearchCV(pipe, cv=5, n_jobs=-1, param_grid=param_grid, scoring = "accuracy", verbose=10)

In [None]:
grid_xgb.fit(X_train, y_train)

In [46]:
print_results(grid_xgb)

best score:  0.8676266262473159
best parameters:  {'xgb__booster': 'gbtree', 'xgb__gamma': 0.1, 'xgb__n_estimators': 300, 'xgb__max_depth': 8, 'xgb__learning_rate': 0.1}


# Prediction 

In [31]:
df2 = pd.read_csv("test1.csv",index_col = 0)
df2.head()

Unnamed: 0_level_0,TITLE,TOPIC
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,INDONESIAN COFFEE PRODUCTION MAY FALL THIS YEAR,
1,INTERNATIONAL BUSINESS MACHINES CORP <IBM> NET,
2,WESTERN TELE-COMMUNICATIONS <WTLCA> 4TH QTR NET,
3,NACCO INDUSTRIES <NC> TO REPORT 2ND QTR GAIN,
4,WALKER TELECOMMUNICATIONS CORP <WTEL> 4TH QTR,


In [32]:
X_pred = [clean(row) for row in df2["TITLE"]]

In [28]:
estimators = [('vectorizer', CountVectorizer(ngram_range=(1,2),min_df=3)), 
              ('tfidf',TfidfTransformer(use_idf=True, norm='l2')),
               ('svc', svm.SVC(C=1, gamma=1, kernel ='rbf',degree=2))]

pipe = Pipeline(memory=None,steps = estimators)

In [29]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
       ...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [30]:
pipe.score(X_train, y_train)

0.9820639130983958

In [34]:
prediction = pipe.predict(X_pred)

In [37]:
df2["TOPIC"] = prediction

In [39]:
df2.groupby("TOPIC").count()

Unnamed: 0_level_0,TITLE
TOPIC,Unnamed: 1_level_1
0,1203
1,1095
2,1095


In [None]:
df.to_csv()