# Code Assignment 1 Text Mining

The following code was used to perform the analyses from assignment one. First, nine different models were tested using the 20newsgroups data. Next, the best performing model was used to investigate different parameters for the CountVectorizer function. 

In [1]:
import sklearn as sk
import numpy as np
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [2]:
# Load train and test data
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)

In [3]:
# Define 9 models
RF = RandomForestClassifier()
SV = SVC()

text_clf1 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf2 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf = False)),
    ('clf', MultinomialNB()),
])

text_clf3 = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

text_clf4 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RF),
])

text_clf5 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf = False)),
    ('clf', RF),
])

text_clf6 = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', RF),
])

text_clf7 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SV),
])
text_clf8 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf = False)),
    ('clf', SV),
])

text_clf9 = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', SV),
])

model_list = [text_clf1, text_clf2, text_clf3, text_clf4, text_clf5, text_clf6, text_clf7, text_clf8, text_clf9]

In [7]:
# Calculate performance statistics for each model
accuracy = []
f1 = []
precision = []
recall = []

for text_clf in model_list:
    
    text_clf.fit(twenty_train.data, twenty_train.target)
    docs_test = twenty_test.data
    predicted = text_clf.predict(docs_test)
    
    acc = np.mean(predicted == twenty_test.target)
    prec = metrics.precision_score(twenty_test.target, predicted, average = 'macro', zero_division = 0)
    rec = metrics.recall_score(twenty_test.target, predicted, average = 'macro', zero_division = 0)
    f = metrics.f1_score(twenty_test.target, predicted, average = 'macro', zero_division = 0)
    
    accuracy.append(acc)
    f1.append(f)
    precision.append(prec)
    recall.append(rec)
    
    print('Model: ', text_clf, 
          '\nAccuracy: ', acc, 
          '\nF1: ', f,
          '\nPrecision: ', prec,
          '\nRecall: ', rec, '\n')

Model:  Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())]) 
Accuracy:  0.7738980350504514 
F1:  0.7557542971333199 
Precision:  0.8255310124210137 
Recall:  0.756525006352595 

Model:  Pipeline(steps=[('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer(use_idf=False)),
                ('clf', MultinomialNB())]) 
Accuracy:  0.7052575677110993 
F1:  0.6727826639341477 
Precision:  0.7924314057319584 
Recall:  0.6821951093902918 

Model:  Pipeline(steps=[('vect', CountVectorizer()), ('clf', MultinomialNB())]) 
Accuracy:  0.7728359001593202 
F1:  0.745098233005215 
Precision:  0.7621626411174734 
Recall:  0.7636463041415988 

Model:  Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier())]) 
Accuracy:  0.7608868826340945 
F1:  0.7474646528349933 
Precision:  0.7736778511628659 
Recall:  0.7491164438873836 

Model:  Pipeline(steps=[('v

From these results, we can see that model 7 performed the best. Using this model, we will try different parameters for the CountVectorizer function.

In [4]:
# Define models with different parameters
text_clf_params1 = Pipeline([
    ('vect', CountVectorizer(lowercase=False)),
    ('tfidf', TfidfTransformer()),
    ('clf', SV),
])

text_clf_params2 = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', SV),
])

text_clf_params3 = Pipeline([
    ('vect', CountVectorizer(stop_words='english', ngram_range=(1,2), analyzer='word')),
    ('tfidf', TfidfTransformer()),
    ('clf', SV),
])

text_clf_params4 = Pipeline([
    ('vect', CountVectorizer(stop_words='english', ngram_range=(1,5), analyzer='word')),
    ('tfidf', TfidfTransformer()),
    ('clf', SV),
])

text_clf_params5 = Pipeline([
    ('vect', CountVectorizer(stop_words='english',max_features=10000)),
    ('tfidf', TfidfTransformer()),
    ('clf', SV),
])

text_clf_params6 = Pipeline([
    ('vect', CountVectorizer(stop_words='english',max_features=100000)),
    ('tfidf', TfidfTransformer()),
    ('clf', SV),
])

text_clf_params7 = Pipeline([
    ('vect', CountVectorizer(stop_words='english',max_features=50000)),
    ('tfidf', TfidfTransformer()),
    ('clf', SV),
])

# Calculate performance statistics of models with different parameters
model_list_params = [text_clf_params1, text_clf_params2, text_clf_params3, text_clf_params4, text_clf_params5, text_clf_params6, text_clf_params7]

accuracy = []
f1 = []
precision = []
recall = []

for text_clf_params in model_list_params:

    text_clf_params.fit(twenty_train.data, twenty_train.target)
    docs_test = twenty_test.data
    predicted = text_clf_params.predict(docs_test)

    acc = np.mean(predicted == twenty_test.target)
    prec = metrics.precision_score(twenty_test.target, predicted, average = 'macro')
    rec = metrics.recall_score(twenty_test.target, predicted, average = 'macro')
    f = metrics.f1_score(twenty_test.target, predicted, average = 'macro')

    accuracy.append(acc)
    f1.append(f)
    precision.append(prec)
    recall.append(rec)

    print('Model: ', text_clf_params,
          '\nAccuracy: ', acc,
          '\nF1: ', f,
          '\nPrecision: ', prec,
          '\nRecall: ', rec, '\n')

Model:  Pipeline(steps=[('vect', CountVectorizer(lowercase=False)),
                ('tfidf', TfidfTransformer()), ('clf', SVC())]) 
Accuracy:  0.8081518852894317 
F1:  0.8043136314083984 
Precision:  0.8233577678553944 
Recall:  0.7996708177363485 

Model:  Pipeline(steps=[('vect', CountVectorizer(stop_words='english')),
                ('tfidf', TfidfTransformer()), ('clf', SVC())]) 
Accuracy:  0.8252788104089219 
F1:  0.8204625116789279 
Precision:  0.8354113241937207 
Recall:  0.8166153362210606 

Model:  Pipeline(steps=[('vect',
                 CountVectorizer(ngram_range=(1, 2), stop_words='english')),
                ('tfidf', TfidfTransformer()), ('clf', SVC())]) 
Accuracy:  0.8143919277748274 
F1:  0.8101391685030425 
Precision:  0.8271853993368288 
Recall:  0.8060083626135501 

Model:  Pipeline(steps=[('vect',
                 CountVectorizer(ngram_range=(1, 5), stop_words='english')),
                ('tfidf', TfidfTransformer()), ('clf', SVC())]) 
Accuracy:  0.793547530536

From these performance scores, we can see that setting the max_features parameter to 50000 and the stop_words to 'english' yields the best results.