In [1]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix

In [2]:
# Experiment parameters.
# sample_size is the total number of training rows to draw; the sampling
# step treats -1 as "use len(train)".
sample_size = 36_000

# Every candidate input, listed in the order the flag dictionary exposes them.
_feature_names = (
    'd_chall_score',
    'aoa_mean',
    'aoa_min',
    'aoa_max',
    'conc_rating_mean',
    'conc_rating_min',
    'conc_rating_max',
    'num_lemmas',
    'tf-idf',
)

# Names that are switched on for this run; everything else is switched off.
_enabled_features = {'d_chall_score', 'aoa_max', 'conc_rating_max', 'num_lemmas'}

# Maps each candidate name to True (use it) or False (skip it).
features = {name: name in _enabled_features for name in _feature_names}

In [3]:
# Load the tab-separated training and testing sets; the first column of each
# file becomes the DataFrame index.
train = pd.read_csv('assets/Training_set.csv', sep='\t', index_col=0)
test = pd.read_csv('assets/Testing_set.csv', sep='\t', index_col=0)

# Numeric inputs: every enabled entry of `features`, except the 'tf-idf' key.
num_features = [
    name
    for name, enabled in features.items()
    if enabled and name != 'tf-idf'
]

# Name of the column routed to the text branch of the preprocessor.
txt_features = 'lemmatized_text'

In [4]:
# Numeric branch: mean-impute missing values, standardise, then discretise
# each column into 4 bins. Step names are kept stable because parameter grids
# address steps by name.
num_steps = [
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
    ('binner', KBinsDiscretizer(n_bins=4)),
]
num_pipe = Pipeline(steps=num_steps)

# Text branch: a single tf-idf vectorizer with default settings.
text_steps = [('tfidf', TfidfVectorizer())]
text_pipe = Pipeline(steps=text_steps)

# Route the numeric columns and the text column to their respective branches.
branches = [
    ('num', num_pipe, num_features),
    ('text', text_pipe, txt_features),
]
preprocessor = ColumnTransformer(transformers=branches)

# Full model: preprocessing followed by a support-vector classifier.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('svc', SVC()),
])

In [5]:

# Draw a label-balanced sample of the training data.
# A sample_size of -1 means "use the size of the whole training set" as budget.
n = len(train) if sample_size == -1 else sample_size

# Half of the budget is taken from each label group (the split by 2 mirrors
# the original code, which assumes two label values).
rows_per_label = n // 2

# Sample every label group separately, re-seeding with 99 for each group, and
# stack the pieces. Compared with groupby(...).apply(...):
#   * the 'label' column is always present in the result, independent of how
#     the installed pandas version treats grouping columns inside apply();
#   * the result keeps the original row index instead of a (label, index)
#     MultiIndex in which 'label' is both an index level and a column;
#   * a group smaller than the per-label quota contributes all of its rows
#     instead of raising "Cannot take a larger sample than population".
sample_df = pd.concat([
    grp.sample(min(rows_per_label, len(grp)), random_state=99)
    for _, grp in train.groupby('label')
])

In [6]:
# Grid search over the full pipeline.
# Grid keys are "<path to step>__<parameter>": the first prefix reaches the
# tf-idf vectorizer nested in the preprocessor, the second the SVC step.
tfidf_prefix = 'preprocessor__text__tfidf__'
svc_prefix = 'svc__'

param_grid = {
    # An integer min_df is an absolute document count and must be >= 1;
    # recent scikit-learn versions reject the integer 0. A value of 1 applies
    # no lower cut-off, which is what 0 was being used for.
    tfidf_prefix + 'min_df': [1],
    # Integer max_df is also an absolute document count; using the sample size
    # means no term can exceed it, i.e. no upper cut-off.
    tfidf_prefix + 'max_df': [len(sample_df)],
    tfidf_prefix + 'ngram_range': [(1, 3)],
    # 4 values of C x 4 values of gamma = 16 candidates, all with an RBF kernel.
    svc_prefix + 'C': [0.1, 1, 10, 100],
    svc_prefix + 'gamma': [1, 0.1, 0.01, 0.001],
    svc_prefix + 'kernel': ['rbf'],
    svc_prefix + 'random_state': [99],
}

# 10-fold stratified cross-validation scored by accuracy, run on all cores.
# refit=False: only the cross-validation results are kept; no final model is
# refit on the whole sample.
# The whole DataFrame is passed as X; the preprocessor picks the columns it
# was configured with.
grid = (
    GridSearchCV(
        clf,
        param_grid=param_grid,
        scoring='accuracy',
        cv=StratifiedKFold(n_splits=10),
        refit=False,
        verbose=3,
        return_train_score=True,
        n_jobs=-1,
    )
    .fit(sample_df, sample_df['label'])
)

Fitting 10 folds for each of 16 candidates, totalling 160 fits


: 

In [71]:


# Build the final feature matrices from X_train / X_test.
# NOTE(review): X_train and X_test are not created in the cells visible here;
# they must already exist in the kernel — confirm where they come from.
text_col = 'lemmatized_text'

if features['tf-idf']:
    # Vectorize the text: fit the vocabulary on the training split only and
    # reuse it for the test split.
    tfidf = TfidfVectorizer(min_df=25, stop_words='english', ngram_range=(1,3))
    X_train_vec = tfidf.fit_transform(X_train[text_col])
    X_test_vec = tfidf.transform(X_test[text_col])

    # Remaining columns: everything except the first and the last column.
    # NOTE(review): positional slicing — presumably skips the text column and
    # one trailing column; verify against the actual column order.
    X_train_mads = csr_matrix(X_train.iloc[:, 1:-1])
    X_test_mads = csr_matrix(X_test.iloc[:, 1:-1])

    # Replace the DataFrames with sparse matrices: tf-idf columns followed by
    # the remaining columns. After this, X_train / X_test are no longer
    # DataFrames, so re-running this cell requires rebuilding them first.
    X_train = hstack((X_train_vec, X_train_mads))
    X_test = hstack((X_test_vec, X_test_mads))
else:
    # Without tf-idf, remove the raw text column in place from each split.
    # The explicit checks replace a bare `except: pass`: a split that is not
    # a DataFrame or no longer has the column is skipped on purpose, while
    # genuine errors (e.g. X_train not defined) are no longer hidden.
    for split in (X_train, X_test):
        if isinstance(split, pd.DataFrame) and text_col in split.columns:
            del split[text_col]

In [72]:
# Sanity check: is `train` a pandas DataFrame? The cell displays the boolean.
isinstance(train, pd.DataFrame)

True

In [73]:
# Baseline model: scale without centring (with_mean=False), impute with the
# imputer's default settings, then classify with a default SVC.
# NOTE(review): with_mean=False is presumably there because the input is a
# sparse tf-idf matrix — confirm.
pipe = Pipeline(steps=[
    ('scale', StandardScaler(with_mean=False)),
    ('impute', SimpleImputer()),
    ('svc', SVC()),
])

# Fit on the vectorized training split and display the score obtained on the
# vectorized test split (fit returns the fitted pipeline itself).
pipe.fit(X_train_vec, y_train).score(X_test_vec, y_test)


0.5018599590049851

In [74]:
import pickle

# Serialize only the 'svc' step of the pipeline to an in-memory bytes object;
# the 'scale' and 'impute' steps are not part of the pickled payload.
svc_step = pipe.named_steps['svc']
s = pickle.dumps(svc_step)