In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import chi2, SelectKBest, f_classif, f_regression
from sklearn import model_selection
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline


In [5]:
from Scripts import loading as dl

# Load the sampled Amazon phone reviews; per the loader's log message the
# second argument is the number of reviews drawn per class.
amazon_link = '../Data/amazon_phone.pkl'
df = dl.load_sampled(amazon_link, 10000)

target = df.label
text = df.text

# Hold out 25% of the reviews for testing. A fixed random_state makes the
# split identical on every run so downstream scores are reproducible
# (random_state=None reshuffles differently on each execution).
X_train, X_test, y_train, y_test = train_test_split(
    text, target, test_size=0.25, random_state=0)
# # unigrams
# vect = CountVectorizer()


10000 reviews per class from ../Data/amazon_phone.pkl loaded


In [None]:
# Grid search over a TF-IDF + logistic-regression pipeline.
tfidf = TfidfVectorizer()

# The solver is pinned per sub-grid because no single default solver supports
# every penalty / multi_class combination: liblinear cannot fit a multinomial
# model and lbfgs cannot use an L1 penalty. Leaving the solver implicit makes
# part of the grid fail. Candidate count stays at 24 (12 per sub-grid).
param_grid = [
    # One-vs-rest: liblinear supports both L1 and L2 penalties.
    {'vect__ngram_range': [(1, 2), (1, 3)],
     'clf__solver': ['liblinear'],
     'clf__penalty': ['l1', 'l2'],
     'clf__C': [0.1, 1.0, 10.0],
     'clf__multi_class': ['ovr']},
    # Multinomial (softmax): saga supports both L1 and L2 penalties.
    # If saga emits ConvergenceWarning, raise max_iter on the estimator.
    {'vect__ngram_range': [(1, 2), (1, 3)],
     'clf__solver': ['saga'],
     'clf__penalty': ['l1', 'l2'],
     'clf__C': [0.1, 1.0, 10.0],
     'clf__multi_class': ['multinomial']},
]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0))])

# 5-fold cross-validated accuracy for every candidate, using all CPU cores.
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy',
                           cv=5, verbose=1, n_jobs=-1)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  8.5min


In [13]:
# Show the parameter combination selected by the grid search.
# NOTE(review): the recorded output lists 'vect__ngram_range': (1, 1), which is
# not among the ngram ranges in param_grid -- it looks like output from an
# earlier run; re-run the search cell to refresh it.
gs_lr_tfidf.best_params_

{'clf__C': 1.0,
 'clf__multi_class': 'ovr',
 'clf__penalty': 'l2',
 'vect__ngram_range': (1, 1)}

In [None]:
# feature occurrence: binary=True records only whether an n-gram is present
vect = CountVectorizer(ngram_range=(1,3), binary=True)
# feature counts: binary=False keeps how many times each n-gram appears
count = CountVectorizer(ngram_range=(1,3), binary=False)

In [None]:
using text_prep

features
unigram (1,1)
bigram
trigram
uni + bi
bi + tri
binary 1-3
count 1-3
tfidf 1-3

max_df 0.8, 0.9
min_df 2, 5
vocabulary size

In [None]:
# Compare four classifiers on n-gram count features using 5-fold
# cross-validation, then draw a box plot of the per-fold scores.
seed = 7

models = [
    ('MNB', MultinomialNB()),
    # The solver is named explicitly: liblinear cannot fit a multinomial model.
    ('LR', LogisticRegression(dual=False, random_state=10, solver='lbfgs',
                              multi_class='multinomial')),
    # Seeded so repeated runs give the same scores.
    ('SGD', SGDClassifier(random_state=seed)),
    ('SVM', SVC()),
]
results = []
names = []
# scoring = 'f1_macro'
scoring = 'accuracy'

vectorizer = CountVectorizer(min_df=2, max_df=0.8, ngram_range=(1, 3))
# vectorizer_chi2 = SelectKBest(chi2,k=3000)
# Despite the variable name, features are scored with f_classif, not chi2.
vectorizer_chi2 = SelectKBest(score_func=f_classif, k=3500)

# shuffle=True is required for random_state to take effect; stratification
# keeps the class proportions the same in every fold. The splitter does not
# depend on the model, so it is built once outside the loop.
kfold = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

for name, model in models:
    # Vectorisation and the supervised feature selection are pipeline steps,
    # so they are re-fitted on the training part of each fold only. Fitting
    # them on the full data set beforehand would leak held-out labels into the
    # selected features. cross_val_score clones the pipeline for every fold,
    # so sharing the step instances across models is safe.
    pipeline = Pipeline([('vect', vectorizer),
                         ('select', vectorizer_chi2),
                         ('clf', model)])
    cv_results = model_selection.cross_val_score(pipeline, text, target, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# One box per model showing the spread of its fold scores.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
ax.set_ylabel(scoring)
plt.show()