# Group 27

# Text Classification

# Dataset-1 

fetch_20newgroups


multiclassification

# Load the datasets

In [776]:
#Working with the first data set

from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), random_state=21)

newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), random_state=21)

len(newsgroups_train.data), len(newsgroups_test.data)

(11314, 7532)

In [647]:
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

# Data Splitting

In [777]:
from sklearn.model_selection import train_test_split
import re, string

x_train, x_val, y_train, y_val = train_test_split(newsgroups_train.data, newsgroups_train.target, train_size=0.8, test_size=0.2)
print(len(x_train))

x_test = newsgroups_test.data
y_test = newsgroups_test.target

9051


# Preprocessing

In [780]:
def preprocessing(file):
    text_content = []
    exclude = string.punctuation
    exclude = exclude.replace("-", "")
    pattern = r"[{}]".format(exclude)

    for data in file :
        text = re.sub(r"(<br\s*/><br\s*/>)", " ", str(data))
        text = re.sub(pattern, "", str(text))
        text_content.append(text.lower())
    return text_content

In [779]:
x_train = preprocessing(x_train)
x_val = preprocessing(x_val)
y_train = preprocessing(y_train)
y_val = preprocessing(y_val)

x_test = preprocessing(x_test)
y_test = preprocessing(y_test)
len(x_test), len(y_test)

(7532, 7532)

# Vectorization

In [782]:
#CountVectorizer (Not Necessary)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer='word', stop_words=None, ngram_range=(1,1), binary=True)
#print(vectorizer)

train_set = vectorizer.fit_transform(x_train)
val_set = vectorizer.transform(x_val)
test_set = vectorizer.transform(x_test)

print(train_set.shape)
print(test_set.shape)

(9051, 99866)
(7532, 99866)


In [784]:
#TFIDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf_vectorizer = TfidfVectorizer()
train_idf = tf_idf_vectorizer.fit_transform(x_train)
val_idf = tf_idf_vectorizer.transform(x_val)
test_idf = tf_idf_vectorizer.transform(x_test)

print(train_idf.shape)
print(val_idf.shape)
print(test_idf.shape)

(9051, 99866)
(2263, 99866)
(7532, 99866)


In [786]:
#Normalization
from sklearn.preprocessing import Normalizer

normalizer_train = Normalizer().fit(X=vectors_train_idf)
train_norm = normalizer_train.transform(train_idf)
val_norm = normalizer_train.transform(val_idf)
test_norm = normalizer_train.transform(test_idf)

print(train_norm.shape)
print(val_norm.shape)
print(test_norm.shape)

(9051, 99866)
(2263, 99866)
(7532, 99866)


# GridSearch for Logistic Regression

In [791]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import numpy as np
import time

clf = LogisticRegression(multi_class = 'multinomial', solver='saga', max_iter=4000)

tuned_parameters={"C":np.logspace(-3,3,7)}

scores = ['precision', 'recall']

#Tried for the following as well for parameters, L2 was better in this case
#"penalty":["l1","l2"]}#, "dual": ["True", "False"], "max_iter": np.power(10.0, np.arange(-10, 10))}

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf_log=GridSearchCV(clf,tuned_parameters,cv=10)
    clf_log.fit(train_norm, y_train)

    t0=time.time()
    print ("training time:", round(time.time()-t0, 3), "s")
    print("Best parameters set found on development set:")
    print()
    print(clf_log.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf_log.cv_results_['mean_test_score']
    stds = clf_log.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf_log.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    t1=time.time()
    y_pred = clf_log.predict(val_norm)
    print ("predict time:", round(time.time()-t1, 3), "s")
    print(metrics.classification_report(y_val, y_pred, digits = 5))

# Tuning hyper-parameters for precision

training time: 0.0 s
Best parameters set found on development set:

{'C': 100.0}

Grid scores on development set:

0.055 (+/-0.001) for {'C': 0.001}
0.359 (+/-0.017) for {'C': 0.01}
0.591 (+/-0.025) for {'C': 0.1}
0.712 (+/-0.012) for {'C': 1.0}
0.733 (+/-0.020) for {'C': 10.0}
0.733 (+/-0.022) for {'C': 100.0}
0.733 (+/-0.022) for {'C': 1000.0}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

predict time: 0.018 s
              precision    recall  f1-score   support

           0    0.69474   0.70213   0.69841        94
           1    0.68293   0.65116   0.66667       129
          10    0.86325   0.84167   0.85232       120
          11    0.86139   0.72500   0.78733       120
          12    0.71304   0.74545   0.72889       110
          13    0.86139   0.75652   0.80556       115
          14    0.79710   0.76923   0.78292       143
          15    0.7

In [792]:
print("tuned hyperparameters :(best parameters) ",clf_log.best_params_)
print("accurac y :",clf_log.best_score_)

tuned hyperparameters :(best parameters)  {'C': 100.0}
accurac y : 0.7333996243509004


In [804]:
#Optimal Solution
#Val set
clf = LogisticRegression(C=100,penalty="l2", multi_class = 'multinomial', solver='saga', max_iter=4000)
clf.fit(train_norm, y_train)
print ("training time:", round(time.time()-t0, 3), "s")
t2=time.time()
y_pred = clf.predict(val_norm)
print ("predict time:", round(time.time()-t1, 3), "s")
#print(metrics.classification_report(y_val, y_pred, digits = 5)

training time: 855.665 s
predict time: 855.685 s


In [807]:
print("accuracy:")
print(metrics.accuracy_score(y_val, y_pred))
print(metrics.classification_report(y_val, y_pred))

accuracy:
0.7211665930181176
              precision    recall  f1-score   support

           0       0.69      0.70      0.69        94
           1       0.68      0.65      0.67       129
          10       0.86      0.84      0.85       120
          11       0.86      0.72      0.79       120
          12       0.71      0.75      0.73       110
          13       0.84      0.76      0.80       115
          14       0.80      0.76      0.78       143
          15       0.76      0.77      0.76       146
          16       0.75      0.82      0.78        90
          17       0.75      0.77      0.76       114
          18       0.68      0.64      0.66        96
          19       0.50      0.36      0.42        80
           2       0.61      0.69      0.65        95
           3       0.68      0.61      0.64       107
           4       0.66      0.69      0.68        97
           5       0.79      0.74      0.76       124
           6       0.81      0.74      0.77       12

In [808]:
#Optimal Solution
#Test set
t2=time.time()
y_pred = clf.predict(test_norm)
print ("predict time for test:", round(time.time()-t2, 3), "s")
print("accuracy:")
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

predict time for test: 0.048 s
accuracy:
0.6623738714816781
              precision    recall  f1-score   support

           0       0.47      0.48      0.48       319
           1       0.64      0.66      0.65       389
          10       0.87      0.84      0.86       399
          11       0.79      0.68      0.73       396
          12       0.60      0.60      0.60       393
          13       0.73      0.73      0.73       396
          14       0.72      0.70      0.71       394
          15       0.65      0.72      0.68       398
          16       0.56      0.64      0.60       364
          17       0.79      0.69      0.74       376
          18       0.51      0.44      0.47       310
          19       0.41      0.35      0.38       251
           2       0.59      0.61      0.60       394
           3       0.61      0.61      0.61       392
           4       0.68      0.69      0.68       385
           5       0.81      0.63      0.71       395
           6       0.

# GridSearch for Multinomial Naive Bayes

In [810]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

n_folds = 10
tuned_parameters = {"alpha":np.logspace(-3,3,7), "fit_prior":["true","false"]}

scores = ['precision', 'recall']

clf2 = MultinomialNB()

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf_nb=GridSearchCV(clf2,tuned_parameters,cv=10)
    clf_nb.fit(train_norm, y_train)

    t0=time.time()
    print ("training time:", round(time.time()-t0, 3), "s")
    print("Best parameters set found on development set:")
    print()
    print(clf_nb.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf_nb.cv_results_['mean_test_score']
    stds = clf_nb.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf_nb.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    t1=time.time()
    y_pred = clf_nb.predict(val_norm)
    print ("predict time:", round(time.time()-t1, 3), "s")
    print(metrics.classification_report(y_val, y_pred, digits = 5))

print("tuned hyperparameters :(best parameters) ",clf_nb.best_params_)
print("accurac y :",clf_nb.best_score_)

# Tuning hyper-parameters for precision

training time: 0.0 s
Best parameters set found on development set:

{'alpha': 0.01, 'fit_prior': 'true'}

Grid scores on development set:

0.743 (+/-0.029) for {'alpha': 0.001, 'fit_prior': 'true'}
0.743 (+/-0.029) for {'alpha': 0.001, 'fit_prior': 'false'}
0.752 (+/-0.021) for {'alpha': 0.01, 'fit_prior': 'true'}
0.752 (+/-0.021) for {'alpha': 0.01, 'fit_prior': 'false'}
0.728 (+/-0.027) for {'alpha': 0.1, 'fit_prior': 'true'}
0.728 (+/-0.027) for {'alpha': 0.1, 'fit_prior': 'false'}
0.648 (+/-0.024) for {'alpha': 1.0, 'fit_prior': 'true'}
0.648 (+/-0.024) for {'alpha': 1.0, 'fit_prior': 'false'}
0.463 (+/-0.026) for {'alpha': 10.0, 'fit_prior': 'true'}
0.463 (+/-0.026) for {'alpha': 10.0, 'fit_prior': 'false'}
0.303 (+/-0.024) for {'alpha': 100.0, 'fit_prior': 'true'}
0.303 (+/-0.024) for {'alpha': 100.0, 'fit_prior': 'false'}
0.055 (+/-0.001) for {'alpha': 1000.0, 'fit_prior': 'true'}
0.055 (+/-0.001) for {'alpha': 1000.0, 'fit_prior': 'false

In [813]:
#Optimal Solution
#Validation set
MultinomialNB(alpha="0.01")
clf_nb.fit(train_norm, y_train)
print ("training time:", round(time.time()-t0, 3), "s")
t2=time.time()
y_pred2 = clf_nb.predict(val_norm)
print ("predict time:", round(time.time()-t1, 3), "s")
print("accuracy:")
print(metrics.accuracy_score(y_val, y_pred2))
print(metrics.classification_report(y_val, y_pred2))

training time: 459.855 s
predict time: 459.867 s
accuracy:
0.755634114007954
              precision    recall  f1-score   support

           0       0.71      0.64      0.67        94
           1       0.69      0.64      0.67       129
          10       0.91      0.92      0.91       120
          11       0.76      0.81      0.78       120
          12       0.80      0.75      0.77       110
          13       0.87      0.84      0.86       115
          14       0.88      0.79      0.83       143
          15       0.72      0.92      0.80       146
          16       0.66      0.87      0.75        90
          17       0.79      0.86      0.82       114
          18       0.72      0.71      0.72        96
          19       0.81      0.26      0.40        80
           2       0.42      0.62      0.50        95
           3       0.61      0.70      0.66       107
           4       0.72      0.75      0.74        97
           5       0.80      0.83      0.82       124
    

In [814]:
#Optimal Solution
#Test set
t2=time.time()
y_pred2 = clf_nb.predict(test_norm)
print ("predict time for test:", round(time.time()-t2, 3), "s")
print("accuracy:")
print(metrics.accuracy_score(y_test, y_pred2))
print(metrics.classification_report(y_test, y_pred2))

predict time for test: 0.041 s
accuracy:
0.688396176314392
              precision    recall  f1-score   support

           0       0.54      0.44      0.48       319
           1       0.66      0.69      0.67       389
          10       0.92      0.90      0.91       399
          11       0.68      0.75      0.71       396
          12       0.70      0.58      0.63       393
          13       0.82      0.80      0.81       396
          14       0.77      0.77      0.77       394
          15       0.55      0.86      0.67       398
          16       0.54      0.73      0.62       364
          17       0.81      0.77      0.79       376
          18       0.59      0.43      0.50       310
          19       0.48      0.18      0.26       251
           2       0.42      0.54      0.47       394
           3       0.58      0.70      0.63       392
           4       0.69      0.70      0.69       385
           5       0.80      0.73      0.77       395
           6       0.8