In [1]:
import sklearn as sk
import numpy as np
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
#https://medium.com/all-things-ai/in-depth-parameter-tuning-for-svc-758215394769
#https://towardsdatascience.com/random-forest-hyperparameters-and-how-to-fine-tune-them-17aee785ee0d
#https://medium.com/@siyao_sui/nlp-with-the-20-newsgroups-dataset-ab35cd0ea902
#https://gist.github.com/SuyashLakhotia/f26d249d5cbb7a3784b64c20bea5a460
#https://towardsdatascience.com/svm-hyperparameters-explained-with-visualizations-143e48cb701b

In [7]:
# Load the 'train' and 'test' subsets of 20 Newsgroups, both shuffled with the
# same fixed seed so the document order is reproducible across runs.
twenty_train, twenty_test = (
    fetch_20newsgroups(subset=split, shuffle=True, random_state=42)
    for split in ('train', 'test')
)

In [3]:
# Template estimators. They are never fitted directly: every pipeline below
# receives its own unfitted copy (see _make_text_clf), so fitting one pipeline
# cannot overwrite the classifier that a sibling pipeline relies on.
# random_state is fixed so the random-forest scores are reproducible.
# Settings that were previously tried (and left commented out):
#   RandomForestClassifier: n_estimators=150, max_features='sqrt'
#   SVC: C=5, gamma=0.75
RF = RandomForestClassifier(random_state=42)
SV = SVC()


def _make_text_clf(classifier, use_idf=None):
    """Build a text-classification pipeline around a copy of ``classifier``.

    Parameters
    ----------
    classifier : estimator
        Template estimator. It is cloned, so the returned pipeline owns an
        independent, unfitted classifier with the same parameters.
    use_idf : bool or None, default None
        None  -> no 'tfidf' step; the CountVectorizer output feeds the
                 classifier directly.
        True  -> add ``TfidfTransformer()``.
        False -> add ``TfidfTransformer(use_idf=False)``.

    Returns
    -------
    Pipeline
        Steps are 'vect', optionally 'tfidf', then 'clf'.
    """
    steps = [('vect', CountVectorizer())]
    if use_idf is not None:
        tfidf = TfidfTransformer() if use_idf else TfidfTransformer(use_idf=False)
        steps.append(('tfidf', tfidf))
    # clone() returns a new unfitted estimator with identical parameters.
    steps.append(('clf', sk.base.clone(classifier)))
    return Pipeline(steps)


# Multinomial Naive Bayes: tf-idf, tf without idf, raw counts.
text_clf1 = _make_text_clf(MultinomialNB(), use_idf=True)
text_clf2 = _make_text_clf(MultinomialNB(), use_idf=False)
text_clf3 = _make_text_clf(MultinomialNB())

# Random forest: tf-idf, tf without idf, raw counts.
text_clf4 = _make_text_clf(RF, use_idf=True)
text_clf5 = _make_text_clf(RF, use_idf=False)
text_clf6 = _make_text_clf(RF)

# Support vector classifier: tf-idf, tf without idf, raw counts.
text_clf7 = _make_text_clf(SV, use_idf=True)
text_clf8 = _make_text_clf(SV, use_idf=False)
text_clf9 = _make_text_clf(SV)

# Evaluation order: classifier family first, then feature weighting.
model_list = [
    text_clf1, text_clf2, text_clf3,
    text_clf4, text_clf5, text_clf6,
    text_clf7, text_clf8, text_clf9,
]

In [None]:
# Per-model scores, appended in the same order as model_list.
accuracy, f1, precision, recall = [], [], [], []

# The held-out documents are the same for every model.
docs_test = twenty_test.data

for text_clf in model_list:
    # Train on the training split, then predict the held-out split.
    text_clf.fit(twenty_train.data, twenty_train.target)
    predicted = text_clf.predict(docs_test)

    # Accuracy is the fraction of exact label matches; the other three
    # scores are macro-averaged over the classes.
    acc = np.mean(predicted == twenty_test.target)
    prec, rec, f = (
        scorer(twenty_test.target, predicted, average = 'macro')
        for scorer in (metrics.precision_score,
                       metrics.recall_score,
                       metrics.f1_score)
    )

    accuracy.append(acc)
    f1.append(f)
    precision.append(prec)
    recall.append(rec)

    report = [
        'Model: ', text_clf,
        '\nAccuracy: ', acc,
        '\nF1: ', f,
        '\nPrecision: ', prec,
        '\nRecall: ', rec, '\n',
    ]
    print(*report)

Model:  Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())]) 
Accuracy:  0.7738980350504514 
F1:  0.7557542971333199 
Precision:  0.8255310124210137 
Recall:  0.756525006352595 

Model:  Pipeline(steps=[('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer(use_idf=False)),
                ('clf', MultinomialNB())]) 
Accuracy:  0.7052575677110993 
F1:  0.6727826639341477 
Precision:  0.7924314057319584 
Recall:  0.6821951093902918 

Model:  Pipeline(steps=[('vect', CountVectorizer()), ('clf', MultinomialNB())]) 
Accuracy:  0.7728359001593202 
F1:  0.745098233005215 
Precision:  0.7621626411174734 
Recall:  0.7636463041415988 

Model:  Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier())]) 
Accuracy:  0.7642060541688794 
F1:  0.750452273998716 
Precision:  0.7733809556191514 
Recall:  0.7519407104952398 

Model:  Pipeline(steps=[('ve

In [6]:
# SVC pipeline with a restricted vocabulary: English stop words removed and
# at most 50000 features kept by the vectorizer.
# The classifier is a clone of SV (same parameters, unfitted) rather than SV
# itself, so fitting this pipeline does not overwrite the classifier object
# used by the other SVC pipelines.
text_clf_params = Pipeline([
    ('vect', CountVectorizer(stop_words='english', max_features=50000)),
    ('tfidf', TfidfTransformer()),
    ('clf', sk.base.clone(SV)),
])

# Train on the training split and predict the held-out split.
text_clf_params.fit(twenty_train.data, twenty_train.target)
docs_test = twenty_test.data
predicted = text_clf_params.predict(docs_test)

# Accuracy is the fraction of exact label matches; precision, recall and F1
# are macro-averaged over the classes.
acc = np.mean(predicted == twenty_test.target)
prec = metrics.precision_score(twenty_test.target, predicted, average = 'macro')
rec = metrics.recall_score(twenty_test.target, predicted, average = 'macro')
f = metrics.f1_score(twenty_test.target, predicted, average = 'macro')

print('Model: ', text_clf_params,
      '\nAccuracy: ', acc,
      '\nF1: ', f,
      '\nPrecision: ', prec,
      '\nRecall: ', rec, '\n')

SyntaxError: invalid syntax (587259541.py, line 7)