# Bag of Words and TF-IDF with Machine Learning Classifiers

In [None]:
# Update Google Colab's default scikit-learn to the latest version.
# The run recorded in this notebook replaced 0.22.2.post1 with 0.24.1.
# NOTE(review): the version is not pinned, so a re-run installs whatever
# release is current at that time and results may differ from the recorded ones.
!pip install --upgrade scikit-learn

Collecting scikit-learn
[?25l  Downloading https://files.pythonhosted.org/packages/f3/74/eb899f41d55f957e2591cde5528e75871f817d9fb46d4732423ecaca736d/scikit_learn-0.24.1-cp37-cp37m-manylinux2010_x86_64.whl (22.3MB)
[K     |████████████████████████████████| 22.3MB 49.4MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading https://files.pythonhosted.org/packages/f7/12/ec3f2e203afa394a149911729357aa48affc59c20e2c1c8297a60f33f133/threadpoolctl-2.1.0-py3-none-any.whl
Installing collected packages: threadpoolctl, scikit-learn
  Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.24.1 threadpoolctl-2.1.0


In [None]:
# Print the Python, scikit-learn and dependency versions of this runtime so
# the environment behind the results is recorded alongside them.
import sklearn
sklearn.show_versions()


System:
    python: 3.7.10 (default, Feb 20 2021, 21:17:23)  [GCC 7.5.0]
executable: /usr/bin/python3
   machine: Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic

Python dependencies:
          pip: 19.3.1
   setuptools: 56.0.0
      sklearn: 0.24.1
        numpy: 1.19.5
        scipy: 1.4.1
       Cython: 0.29.22
       pandas: 1.1.5
   matplotlib: 3.2.2
       joblib: 1.0.1
threadpoolctl: 2.1.0

Built with OpenMP: True


In [None]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline

# Text feature extraction and representation
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Machine Learning Classifier
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.semi_supervised import SelfTrainingClassifier


# Model selection
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

# Evaluation metrics
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report


In [None]:
# Mount the personal Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Change to the project folder on the mounted Drive so that the relative
# './Dataset_preprocessed/...' paths used when loading the data resolve.
# NOTE(review): absolute, user-specific path; adjust when running elsewhere.
import os
os.chdir('/content/drive/MyDrive/FinalYearProject')

## Supervised

In [None]:
# Load the three pre-processed, tab-separated splits (train / validation / test).
train_preprocessed_data, valid_preprocessed_data, test_preprocessed_data = (
    pd.DataFrame(pd.read_csv(tsv_path, sep='\t'))
    for tsv_path in (
        './Dataset_preprocessed/train_preprocessed.tsv',
        './Dataset_preprocessed/valid_preprocessed.tsv',
        './Dataset_preprocessed/test_preprocessed.tsv',
    )
)

In [None]:
#train_preprocessed_data
#valid_preprocessed_data
#test_preprocessed_data

In [None]:
# Merge the training and validation splits into one larger training set.
# Hyper-parameters are tuned later with grid search and cross-validation on
# this merged frame, so a separate fixed validation split is not needed.
cross_validation_data = pd.concat(
    (train_preprocessed_data, valid_preprocessed_data),
    ignore_index=True,
)
cross_validation_data

Unnamed: 0,Label,Text,Text_length,Text_pre_processed,Text_pre_processed_uncased,Text_cleaned_uncased,Text_cleaned_uncased_length,Text_cleaned_uncased_tokens,Text_cleaned_uncased_tokens_length
0,1,Official death toll from #covid19 in the Unite...,216,Official death toll from coronavirus in the Un...,official death toll from coronavirus in the un...,offici death toll coronavirus unit kingdom gre...,159,"['offici', 'death', 'toll', 'coronavirus', 'un...",28
1,1,"Dearest Mr. President @USER 1,169 coronavirus ...",220,Dearest Mr President 1169 coronavirus deaths ...,dearest mr president 1169 coronavirus deaths ...,dearest mr presid 1169 coronavirus death us 24...,146,"['dearest', 'mr', 'presid', '1169', 'coronavir...",24
2,1,Latest Updates March 20 ⚠️5274 new cases and 3...,233,Latest Updates March 20 5274 new cases and 38 ...,latest updates march 20 5274 new cases and 38 ...,latest updat march 20 5274 new case 38 new dea...,174,"['latest', 'updat', 'march', '20', '5274', 'ne...",32
3,1,真把公主不当干部 BREAKING: 21 people on Grand Princess...,205,BREAKING 21 people on Grand Princess cruise s...,breaking 21 people on grand princess cruise s...,break 21 peopl grand princess cruis ship dock...,136,"['break', '21', 'peopl', 'grand', 'princess', ...",28
4,0,OKLAHOMA CITY — The State Department of Educat...,187,OKLAHOMA CITY The State Department of Educati...,oklahoma city the state department of educati...,oklahoma citi state depart educ announc monday...,129,"['oklahoma', 'citi', 'state', 'depart', 'educ'...",23
...,...,...,...,...,...,...,...,...,...
7995,0,Coronavirus took hold in UK earlier than thoug...,135,coronavirus took hold in UK earlier than thoug...,coronavirus took hold in uk earlier than thoug...,coronavirus took hold uk earlier thought data ...,101,"['coronavirus', 'took', 'hold', 'uk', 'earlier...",17
7996,1,I talked with a man who is Rowan County’s seco...,218,I talked with a man who is Rowan Countys secon...,i talked with a man who is rowan countys secon...,talk man rowan counti second confirm case coro...,125,"['talk', 'man', 'rowan', 'counti', 'second', '...",20
7997,0,Governor Wolf delaying enforcement of non-life...,126,Governor Wolf delaying enforcement of nonlife ...,governor wolf delaying enforcement of nonlife ...,governor wolf delay enforc nonlif sustain busi...,86,"['governor', 'wolf', 'delay', 'enforc', 'nonli...",15
7998,0,The Sheriff's Department has reduced the jail ...,220,The Sheriffs Department has reduced the jail p...,the sheriffs department has reduced the jail p...,sheriff depart reduc jail popul 617 sheriff or...,128,"['sheriff', 'depart', 'reduc', 'jail', 'popul'...",24


In [None]:
# Test CountVectorizer() and TfidfTransformer() on the first five documents.
corpus = cross_validation_data['Text_pre_processed'].head()


def _vocabulary_terms(vectorizer):
    """Return the fitted vocabulary of ``vectorizer`` as a plain list.

    ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed in 1.2
    in favour of ``get_feature_names_out``. This notebook upgrades scikit-learn
    to the latest release, so the new accessor is preferred and the old one is
    only a fallback for older installations (such as 0.24.x).
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return vectorizer.get_feature_names_out().tolist()
    return vectorizer.get_feature_names()


def _inspect_vectorizer(vectorizer, documents):
    """Fit ``vectorizer`` on ``documents`` and print what it produces.

    Prints, in order: the vocabulary, the dense count matrix, the shape of the
    count matrix, and the shape of the matrix produced by a count -> TF-IDF
    pipeline built around the same vectorizer.

    Args:
        vectorizer: an unfitted ``CountVectorizer``; it is fitted in place.
        documents: iterable of raw text documents.

    Returns:
        Tuple ``(counts, pipeline)``: the sparse count matrix and the fitted
        count -> TF-IDF pipeline.
    """
    counts = vectorizer.fit_transform(documents)
    dense_counts = counts.toarray()  # densify once; fine for five documents
    print(_vocabulary_terms(vectorizer))
    print(dense_counts)
    print(dense_counts.shape)

    # Each pipeline gets its own TfidfTransformer. Pipeline.fit fits its steps
    # in place, so sharing one transformer would leave the earlier pipelines
    # holding a transformer fitted for a later, larger vocabulary.
    pipeline = Pipeline([('count', vectorizer),
                         ('tfidf', TfidfTransformer())]).fit(documents)
    print(pipeline.transform(documents).shape)
    return counts, pipeline


# Unigrams only.
vectorizer1 = CountVectorizer()
X1, pipe1 = _inspect_vectorizer(vectorizer1, corpus)

# Unigrams + bigrams.
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(1, 2))
X2, pipe2 = _inspect_vectorizer(vectorizer2, corpus)

# Unigrams + bigrams + trigrams.
vectorizer3 = CountVectorizer(analyzer='word', ngram_range=(1, 3))
X3, pipe3 = _inspect_vectorizer(vectorizer3, corpus)

# Keep the name `tfidf` bound to the transformer fitted last (the 1-3 gram
# one), which is the state it ended up in when it was shared by all pipelines.
tfidf = pipe3.named_steps['tfidf']


['100', '1169', '185', '19', '20', '21', '230', '233', '24', '38', '5274', '675', 'above', 'against', 'all', 'an', 'and', 'announced', 'april', 'as', 'at', 'austria', 'authorities', 'breaking', 'california', 'cases', 'china', 'chinahastocompensateall', 'city', 'climb', 'closure', 'coast', 'combined', 'community', 'coronavirus', 'coronaviruscountryus', 'crew', 'crime', 'cruise', 'cuomo', 'dead', 'dearest', 'death', 'deaths', 'department', 'disasster', 'docked', 'education', 'finland', 'for', 'from', 'germany', 'governo', 'governor', 'grand', 'greater', 'greece', 'group', 'grows', 'home', 'hours', 'illinois', 'in', 'including', 'international', 'ireland', 'is', 'issues', 'k12', 'kingdom', 'latest', 'least', 'march', 'members', 'mike', 'million', 'monday', 'mr', 'nature', 'negative', 'new', 'nonessential', 'norway', 'not', 'now', 'number', 'of', 'off', 'official', 'oklahoma', 'on', 'order', 'orders', 'pandemic', 'passengers', 'pence', 'pennssource', 'people', 'please', 'poland', 'portugal

In [None]:
# Model input is the pre-processed text column; the target is the label column.
# Training data comes from the merged train+validation frame, test data from
# the separate test frame.
X_train, y_train = (cross_validation_data['Text_pre_processed'],
                    cross_validation_data['Label'])
X_test, y_test = (test_preprocessed_data['Text_pre_processed'],
                  test_preprocessed_data['Label'])

In [None]:
def printResults(true_labels, predict_labels, target_names):
  """Print accuracy, confusion matrix and classification report.

  Args:
    true_labels: ground-truth labels.
    predict_labels: labels predicted by a model for the same samples.
    target_names: display names forwarded to ``classification_report``.

  Returns:
    None. Everything is written to stdout, followed by blank lines.
  """
  accuracy = accuracy_score(true_labels, predict_labels)
  print("Test Accuracy: %.3f" % accuracy)

  print('Confusion Matrix:')
  matrix = confusion_matrix(true_labels, predict_labels)
  print(matrix)

  print('Classification Report:')
  report = classification_report(true_labels, predict_labels,
                                 target_names=target_names)
  print(report)

  print('\n')

### Results of Machine Learning Classifiers with Default Hyper-parameters

In [None]:
# Baseline estimators with their scikit-learn default hyper-parameters; a fixed
# random_state is passed to the ones constructed with a seed below.
default_classifiers = [
    MultinomialNB(),                         # Naive Bayes
    KNeighborsClassifier(),                  # KNN
    LogisticRegression(random_state=0),      # Logistic Regression
    RandomForestClassifier(random_state=0),  # Random Forest
    svm.SVC(random_state=0),                 # Support Vector Machine
]

In [None]:
# Baseline: unigram counts fed to each default classifier, fitted on the
# training data and reported on the test data.
for classifier in default_classifiers:
    bag_of_word = Pipeline([
        ('count', CountVectorizer()),
        ('classifier', classifier),
    ]).fit(X_train, y_train)
    bag_of_word_result = bag_of_word.predict(X_test)
    print(classifier)
    printResults(y_test, bag_of_word_result, ['Uninformative', 'Informative'])


MultinomialNB()
Test Accuracy: 0.750
Confusion Matrix:
[[834 222]
 [278 666]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.75      0.79      0.77      1056
  Informative       0.75      0.71      0.73       944

     accuracy                           0.75      2000
    macro avg       0.75      0.75      0.75      2000
 weighted avg       0.75      0.75      0.75      2000



KNeighborsClassifier()
Test Accuracy: 0.650
Confusion Matrix:
[[762 294]
 [406 538]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.65      0.72      0.69      1056
  Informative       0.65      0.57      0.61       944

     accuracy                           0.65      2000
    macro avg       0.65      0.65      0.65      2000
 weighted avg       0.65      0.65      0.65      2000





STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=0)
Test Accuracy: 0.760
Confusion Matrix:
[[852 204]
 [276 668]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.76      0.81      0.78      1056
  Informative       0.77      0.71      0.74       944

     accuracy                           0.76      2000
    macro avg       0.76      0.76      0.76      2000
 weighted avg       0.76      0.76      0.76      2000



RandomForestClassifier(random_state=0)
Test Accuracy: 0.755
Confusion Matrix:
[[867 189]
 [300 644]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.74      0.82      0.78      1056
  Informative       0.77      0.68      0.72       944

     accuracy                           0.76      2000
    macro avg       0.76      0.75      0.75      2000
 weighted avg       0.76      0.76      0.75      2000



SVC(random_state=0)
Test Accuracy: 0.756
Confusion Matrix:
[[867 189]
 [298 646]]
Clas

In [None]:
# Baseline: unigram + bigram counts fed to each default classifier.
for classifier in default_classifiers:
    bag_of_ngram = Pipeline([
        ('count', CountVectorizer(ngram_range=(1, 2))),
        ('classifier', classifier),
    ]).fit(X_train, y_train)
    bag_of_ngram_result = bag_of_ngram.predict(X_test)
    print(classifier)
    printResults(y_test, bag_of_ngram_result, ['Uninformative', 'Informative'])

MultinomialNB()
Test Accuracy: 0.770
Confusion Matrix:
[[817 239]
 [220 724]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.79      0.77      0.78      1056
  Informative       0.75      0.77      0.76       944

     accuracy                           0.77      2000
    macro avg       0.77      0.77      0.77      2000
 weighted avg       0.77      0.77      0.77      2000



KNeighborsClassifier()
Test Accuracy: 0.624
Confusion Matrix:
[[706 350]
 [403 541]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.64      0.67      0.65      1056
  Informative       0.61      0.57      0.59       944

     accuracy                           0.62      2000
    macro avg       0.62      0.62      0.62      2000
 weighted avg       0.62      0.62      0.62      2000





STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=0)
Test Accuracy: 0.782
Confusion Matrix:
[[888 168]
 [268 676]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.77      0.84      0.80      1056
  Informative       0.80      0.72      0.76       944

     accuracy                           0.78      2000
    macro avg       0.78      0.78      0.78      2000
 weighted avg       0.78      0.78      0.78      2000



RandomForestClassifier(random_state=0)
Test Accuracy: 0.753
Confusion Matrix:
[[884 172]
 [322 622]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.73      0.84      0.78      1056
  Informative       0.78      0.66      0.72       944

     accuracy                           0.75      2000
    macro avg       0.76      0.75      0.75      2000
 weighted avg       0.76      0.75      0.75      2000



SVC(random_state=0)
Test Accuracy: 0.761
Confusion Matrix:
[[839 217]
 [261 683]]
Clas

In [None]:
# Baseline: unigram + bigram + trigram counts fed to each default classifier.
for classifier in default_classifiers:
    bag_of_ngram = Pipeline([
        ('count', CountVectorizer(ngram_range=(1, 3))),
        ('classifier', classifier),
    ]).fit(X_train, y_train)
    bag_of_ngram_result = bag_of_ngram.predict(X_test)
    print(classifier)
    printResults(y_test, bag_of_ngram_result, ['Uninformative', 'Informative'])

MultinomialNB()
Test Accuracy: 0.777
Confusion Matrix:
[[817 239]
 [207 737]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.80      0.77      0.79      1056
  Informative       0.76      0.78      0.77       944

     accuracy                           0.78      2000
    macro avg       0.78      0.78      0.78      2000
 weighted avg       0.78      0.78      0.78      2000



KNeighborsClassifier()
Test Accuracy: 0.592
Confusion Matrix:
[[565 491]
 [326 618]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.63      0.54      0.58      1056
  Informative       0.56      0.65      0.60       944

     accuracy                           0.59      2000
    macro avg       0.60      0.59      0.59      2000
 weighted avg       0.60      0.59      0.59      2000



LogisticRegression(random_state=0)
Test Accuracy: 0.781
Confusion Matrix:
[[898 158]
 [279 665]]
Classification Report:
 

In [None]:
# Baseline: unigram counts re-weighted with TF-IDF, then each default classifier.
for classifier in default_classifiers:
    tf_idf = Pipeline([
        ('count', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('classifier', classifier),
    ]).fit(X_train, y_train)
    tf_idf_result = tf_idf.predict(X_test)
    print(classifier)
    printResults(y_test, tf_idf_result, ['Uninformative', 'Informative'])

MultinomialNB()
Test Accuracy: 0.744
Confusion Matrix:
[[867 189]
 [323 621]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.73      0.82      0.77      1056
  Informative       0.77      0.66      0.71       944

     accuracy                           0.74      2000
    macro avg       0.75      0.74      0.74      2000
 weighted avg       0.75      0.74      0.74      2000



KNeighborsClassifier()
Test Accuracy: 0.703
Confusion Matrix:
[[658 398]
 [196 748]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.77      0.62      0.69      1056
  Informative       0.65      0.79      0.72       944

     accuracy                           0.70      2000
    macro avg       0.71      0.71      0.70      2000
 weighted avg       0.71      0.70      0.70      2000



LogisticRegression(random_state=0)
Test Accuracy: 0.764
Confusion Matrix:
[[873 183]
 [289 655]]
Classification Report:
 

In [None]:
# Baseline: unigram + bigram counts re-weighted with TF-IDF, then each default
# classifier.
for classifier in default_classifiers:
    ngram_tf_idf = Pipeline([
        ('count', CountVectorizer(ngram_range=(1, 2))),
        ('tfidf', TfidfTransformer()),
        ('classifier', classifier),
    ]).fit(X_train, y_train)
    ngram_tf_idf_result = ngram_tf_idf.predict(X_test)
    print(classifier)
    printResults(y_test, ngram_tf_idf_result, ['Uninformative', 'Informative'])

MultinomialNB()
Test Accuracy: 0.763
Confusion Matrix:
[[856 200]
 [273 671]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.76      0.81      0.78      1056
  Informative       0.77      0.71      0.74       944

     accuracy                           0.76      2000
    macro avg       0.76      0.76      0.76      2000
 weighted avg       0.76      0.76      0.76      2000



KNeighborsClassifier()
Test Accuracy: 0.710
Confusion Matrix:
[[662 394]
 [187 757]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.78      0.63      0.70      1056
  Informative       0.66      0.80      0.72       944

     accuracy                           0.71      2000
    macro avg       0.72      0.71      0.71      2000
 weighted avg       0.72      0.71      0.71      2000



LogisticRegression(random_state=0)
Test Accuracy: 0.772
Confusion Matrix:
[[840 216]
 [240 704]]
Classification Report:
 

In [None]:
# Baseline: unigram + bigram + trigram counts re-weighted with TF-IDF, then each
# default classifier.
for classifier in default_classifiers:
    ngram_tf_idf = Pipeline([
        ('count', CountVectorizer(ngram_range=(1, 3))),
        ('tfidf', TfidfTransformer()),
        ('classifier', classifier),
    ]).fit(X_train, y_train)
    ngram_tf_idf_result = ngram_tf_idf.predict(X_test)
    print(classifier)
    printResults(y_test, ngram_tf_idf_result, ['Uninformative', 'Informative'])

MultinomialNB()
Test Accuracy: 0.771
Confusion Matrix:
[[851 205]
 [252 692]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.77      0.81      0.79      1056
  Informative       0.77      0.73      0.75       944

     accuracy                           0.77      2000
    macro avg       0.77      0.77      0.77      2000
 weighted avg       0.77      0.77      0.77      2000



KNeighborsClassifier()
Test Accuracy: 0.705
Confusion Matrix:
[[666 390]
 [201 743]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.77      0.63      0.69      1056
  Informative       0.66      0.79      0.72       944

     accuracy                           0.70      2000
    macro avg       0.71      0.71      0.70      2000
 weighted avg       0.72      0.70      0.70      2000



LogisticRegression(random_state=0)
Test Accuracy: 0.763
Confusion Matrix:
[[792 264]
 [210 734]]
Classification Report:
 

### Naive Bayes GridSearchCV

In [None]:
# Two Naive Bayes pipelines that differ only in whether the raw counts are
# re-weighted with TF-IDF before classification.
NB_bag_of_word = Pipeline([
    ('count', CountVectorizer()),
    ('nb', MultinomialNB()),
])

NB_tfidf = Pipeline([
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('nb', MultinomialNB()),
])

# Only the n-gram span of the vectorizer is searched.
NB_param_grid = {'count__ngram_range': [(1, 1), (1, 2), (1, 3)]}

# Ten stratified random splits, each holding out 10% of the data; fixed seed.
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

In [None]:
# Grid-search the bag-of-words Naive Bayes pipeline (F1 scoring, all cores),
# then report the best configuration and its predictions on the test set.
NB_bag_of_word_CV = GridSearchCV(
    estimator=NB_bag_of_word,
    param_grid=NB_param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=4,
).fit(X_train, y_train)

print('Bag of Word:')
print(
    NB_bag_of_word_CV.best_score_,
    NB_bag_of_word_CV.best_params_,
    NB_bag_of_word_CV.best_estimator_,
    sep='\n',
)
print('\n')

NB_bag_of_word_CV_result = NB_bag_of_word_CV.predict(X_test)
printResults(y_test, NB_bag_of_word_CV_result, ['Uninformative', 'Informative'])

# Bag of Word:
# 0.8386949097468058
# {'count__ngram_range': (1, 3)}
# Pipeline(steps=[('count', CountVectorizer(ngram_range=(1, 3))),
#                 ('nb', MultinomialNB())])


# Test Accuracy: 0.777
# Confusion Matrix:
# [[817 239]
#  [207 737]]
# Classification Report:
#                precision    recall  f1-score   support

# Uninformative       0.80      0.77      0.79      1056
#   Informative       0.76      0.78      0.77       944

#      accuracy                           0.78      2000
#     macro avg       0.78      0.78      0.78      2000
#  weighted avg       0.78      0.78      0.78      2000

Fitting 10 folds for each of 3 candidates, totalling 30 fits
Bag of Word:
0.8386949097468058
{'count__ngram_range': (1, 3)}
Pipeline(steps=[('count', CountVectorizer(ngram_range=(1, 3))),
                ('nb', MultinomialNB())])


Test Accuracy: 0.777
Confusion Matrix:
[[817 239]
 [207 737]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.80      0.77      0.79      1056
  Informative       0.76      0.78      0.77       944

     accuracy                           0.78      2000
    macro avg       0.78      0.78      0.78      2000
 weighted avg       0.78      0.78      0.78      2000





In [None]:
# Grid-search the TF-IDF Naive Bayes pipeline (F1 scoring, all cores), then
# report the best configuration and its predictions on the test set.
NB_tfidf_CV = GridSearchCV(
    estimator=NB_tfidf,
    param_grid=NB_param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=4,
).fit(X_train, y_train)

print('TF-IDF:')
print(
    NB_tfidf_CV.best_score_,
    NB_tfidf_CV.best_params_,
    NB_tfidf_CV.best_estimator_,
    sep='\n',
)
print('\n')

NB_tfidf_CV_result = NB_tfidf_CV.predict(X_test)
printResults(y_test, NB_tfidf_CV_result, ['Uninformative', 'Informative'])

# TF-IDF:
# 0.8369831957703721
# {'count__ngram_range': (1, 3)}
# Pipeline(steps=[('count', CountVectorizer(ngram_range=(1, 3))),
#                 ('tfidf', TfidfTransformer()), ('nb', MultinomialNB())])


# Test Accuracy: 0.771
# Confusion Matrix:
# [[851 205]
#  [252 692]]
# Classification Report:
#                precision    recall  f1-score   support

# Uninformative       0.77      0.81      0.79      1056
#   Informative       0.77      0.73      0.75       944

#      accuracy                           0.77      2000
#     macro avg       0.77      0.77      0.77      2000
#  weighted avg       0.77      0.77      0.77      2000

Fitting 10 folds for each of 3 candidates, totalling 30 fits
TF-IDF:
0.8369831957703721
{'count__ngram_range': (1, 3)}
Pipeline(steps=[('count', CountVectorizer(ngram_range=(1, 3))),
                ('tfidf', TfidfTransformer()), ('nb', MultinomialNB())])


Test Accuracy: 0.771
Confusion Matrix:
[[851 205]
 [252 692]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.77      0.81      0.79      1056
  Informative       0.77      0.73      0.75       944

     accuracy                           0.77      2000
    macro avg       0.77      0.77      0.77      2000
 weighted avg       0.77      0.77      0.77      2000





### KNN GridSearchCV

In [None]:
# Two KNN pipelines that differ only in whether the raw counts are re-weighted
# with TF-IDF before classification.
KNN_bag_of_word = Pipeline([
    ('count', CountVectorizer()),
    ('knn', KNeighborsClassifier()),
])

KNN_tfidf = Pipeline([
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('knn', KNeighborsClassifier()),
])

# Search the n-gram span and every odd neighbour count from 3 to 101, always
# with distance-weighted voting.
KNN_param_grid = {
    'count__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'knn__n_neighbors': list(range(3, 103, 2)),
    'knn__weights': ['distance'],
}

# Ten stratified random splits, each holding out 10% of the data; fixed seed.
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

In [None]:
# Grid-search the bag-of-words KNN pipeline (F1 scoring, all cores), then
# report the best configuration and its predictions on the test set.
KNN_bag_of_word_CV = GridSearchCV(
    estimator=KNN_bag_of_word,
    param_grid=KNN_param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=4,
).fit(X_train, y_train)

print('Bag of Word:')
print(
    KNN_bag_of_word_CV.best_score_,
    KNN_bag_of_word_CV.best_params_,
    KNN_bag_of_word_CV.best_estimator_,
    sep='\n',
)
print('\n')

KNN_bag_of_word_CV_result = KNN_bag_of_word_CV.predict(X_test)
printResults(y_test, KNN_bag_of_word_CV_result, ['Uninformative', 'Informative'])

# Bag of Word:
# 0.6812581231654369
# {'count__ngram_range': (1, 1), 'knn__n_neighbors': 13, 'knn__weights': 'distance'}
# Pipeline(steps=[('count', CountVectorizer()),
#                 ('knn',
#                  KNeighborsClassifier(n_neighbors=13, weights='distance'))])


# Test Accuracy: 0.664
# Confusion Matrix:
# [[804 252]
#  [420 524]]
# Classification Report:
#                precision    recall  f1-score   support

# Uninformative       0.66      0.76      0.71      1056
#   Informative       0.68      0.56      0.61       944

#      accuracy                           0.66      2000
#     macro avg       0.67      0.66      0.66      2000
#  weighted avg       0.67      0.66      0.66      2000

Fitting 10 folds for each of 150 candidates, totalling 1500 fits
Bag of Word:
0.6812581231654369
{'count__ngram_range': (1, 1), 'knn__n_neighbors': 13, 'knn__weights': 'distance'}
Pipeline(steps=[('count', CountVectorizer()),
                ('knn',
                 KNeighborsClassifier(n_neighbors=13, weights='distance'))])


Test Accuracy: 0.664
Confusion Matrix:
[[804 252]
 [420 524]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.66      0.76      0.71      1056
  Informative       0.68      0.56      0.61       944

     accuracy                           0.66      2000
    macro avg       0.67      0.66      0.66      2000
 weighted avg       0.67      0.66      0.66      2000





In [None]:
# Grid-search the TF-IDF KNN pipeline (F1 scoring, all cores), then report the
# best configuration and its predictions on the test set.
KNN_tfidf_CV = GridSearchCV(
    estimator=KNN_tfidf,
    param_grid=KNN_param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=7,
).fit(X_train, y_train)

print('TF-IDF:')
print(
    KNN_tfidf_CV.best_score_,
    KNN_tfidf_CV.best_params_,
    KNN_tfidf_CV.best_estimator_,
    sep='\n',
)
print('\n')

KNN_tfidf_CV_result = KNN_tfidf_CV.predict(X_test)
printResults(y_test, KNN_tfidf_CV_result, ['Uninformative', 'Informative'])

# TF-IDF:
# 0.8109941941698933
# {'count__ngram_range': (1, 3), 'knn__n_neighbors': 35, 'knn__weights': 'distance'}
# Pipeline(steps=[('count', CountVectorizer(ngram_range=(1, 3))),
#                 ('tfidf', TfidfTransformer()),
#                 ('knn',
#                  KNeighborsClassifier(n_neighbors=35, weights='distance'))])


# Test Accuracy: 0.736
# Confusion Matrix:
# [[671 385]
#  [143 801]]
# Classification Report:
#                precision    recall  f1-score   support

# Uninformative       0.82      0.64      0.72      1056
#   Informative       0.68      0.85      0.75       944

#      accuracy                           0.74      2000
#     macro avg       0.75      0.74      0.73      2000
#  weighted avg       0.75      0.74      0.73      2000



Fitting 10 folds for each of 150 candidates, totalling 1500 fits
TF-IDF:
0.8109941941698933
{'count__ngram_range': (1, 3), 'knn__n_neighbors': 35, 'knn__weights': 'distance'}
Pipeline(steps=[('count', CountVectorizer(ngram_range=(1, 3))),
                ('tfidf', TfidfTransformer()),
                ('knn',
                 KNeighborsClassifier(n_neighbors=35, weights='distance'))])


Test Accuracy: 0.736
Confusion Matrix:
[[671 385]
 [143 801]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.82      0.64      0.72      1056
  Informative       0.68      0.85      0.75       944

     accuracy                           0.74      2000
    macro avg       0.75      0.74      0.73      2000
 weighted avg       0.75      0.74      0.73      2000





### LogisticRegression GridSearchCV

In [None]:
# Two logistic-regression pipelines that differ only in whether the raw counts
# are re-weighted with TF-IDF before classification.
LR_bag_of_word = Pipeline([
    ('count', CountVectorizer()),
    ('lr', LogisticRegression(random_state=0)),
])

LR_tfidf = Pipeline([
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('lr', LogisticRegression(random_state=0)),
])

# Search the n-gram span, two solvers and an iteration cap of 100, 200 or 300.
LR_param_grid = {
    'count__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'lr__solver': ['lbfgs', 'liblinear'],
    'lr__max_iter': range(100, 400, 100),
}

# Ten stratified random splits, each holding out 10% of the data; fixed seed.
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)


In [None]:
# Grid-search the bag-of-words logistic-regression pipeline (F1 scoring, all
# cores), then report the best configuration and its test-set predictions.
LR_bag_of_word_CV = GridSearchCV(
    estimator=LR_bag_of_word,
    param_grid=LR_param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=4,
).fit(X_train, y_train)

print('Bag of Word:')
print(
    LR_bag_of_word_CV.best_score_,
    LR_bag_of_word_CV.best_params_,
    LR_bag_of_word_CV.best_estimator_,
    sep='\n',
)
print('\n')

LR_bag_of_word_CV_result = LR_bag_of_word_CV.predict(X_test)
printResults(y_test, LR_bag_of_word_CV_result, ['Uninformative', 'Informative'])

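# NOTE: the results recorded below appear to come from an earlier run whose
# parameter grid also included 'lr__penalty'; the current run reports a
# slightly different best score (0.8532939662146717).
# Bag of Word: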
# 0.8534109484399744
# {'count__ngram_range': (1, 2), 'lr__max_iter': 100, 'lr__penalty': 'l2', 'lr__solver': 'liblinear'}
# Pipeline(steps=[('count', CountVectorizer(ngram_range=(1, 2))),
#                 ('lr', LogisticRegression(random_state=0, solver='liblinear'))])


# Test Accuracy: 0.779
# Confusion Matrix:
# [[886 170]
#  [272 672]]
# Classification Report:
#                precision    recall  f1-score   support

# Uninformative       0.77      0.84      0.80      1056
#   Informative       0.80      0.71      0.75       944

#      accuracy                           0.78      2000
#     macro avg       0.78      0.78      0.78      2000
#  weighted avg       0.78      0.78      0.78      2000



Fitting 10 folds for each of 18 candidates, totalling 180 fits
Bag of Word:
0.8532939662146717
{'count__ngram_range': (1, 2), 'lr__max_iter': 100, 'lr__solver': 'liblinear'}
Pipeline(steps=[('count', CountVectorizer(ngram_range=(1, 2))),
                ('lr', LogisticRegression(random_state=0, solver='liblinear'))])


Test Accuracy: 0.779
Confusion Matrix:
[[886 170]
 [272 672]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.77      0.84      0.80      1056
  Informative       0.80      0.71      0.75       944

     accuracy                           0.78      2000
    macro avg       0.78      0.78      0.78      2000
 weighted avg       0.78      0.78      0.78      2000





In [None]:
# Grid-search the TF-IDF logistic-regression pipeline (F1 scoring, all cores),
# then report the best configuration and its test-set predictions.
LR_tfidf_CV = GridSearchCV(
    estimator=LR_tfidf,
    param_grid=LR_param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=4,
).fit(X_train, y_train)

print('TF-IDF:')
print(
    LR_tfidf_CV.best_score_,
    LR_tfidf_CV.best_params_,
    LR_tfidf_CV.best_estimator_,
    sep='\n',
)
print('\n')

LR_tfidf_CV_result = LR_tfidf_CV.predict(X_test)
printResults(y_test, LR_tfidf_CV_result, ['Uninformative', 'Informative'])

# TF-IDF:
# 0.8413418528295928
# {'count__ngram_range': (1, 2), 'lr__max_iter': 100, 'lr__penalty': 'l2', 'lr__solver': 'lbfgs'}
# Pipeline(steps=[('count', CountVectorizer(ngram_range=(1, 2))),
#                 ('tfidf', TfidfTransformer()),
#                 ('lr', LogisticRegression(random_state=0))])


# Test Accuracy: 0.772
# Confusion Matrix:
# [[840 216]
#  [240 704]]
# Classification Report:
#                precision    recall  f1-score   support

# Uninformative       0.78      0.80      0.79      1056
#   Informative       0.77      0.75      0.76       944

#      accuracy                           0.77      2000
#     macro avg       0.77      0.77      0.77      2000
#  weighted avg       0.77      0.77      0.77      2000



Fitting 10 folds for each of 18 candidates, totalling 180 fits
TF-IDF:
0.8413418528295928
{'count__ngram_range': (1, 2), 'lr__max_iter': 100, 'lr__solver': 'lbfgs'}
Pipeline(steps=[('count', CountVectorizer(ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('lr', LogisticRegression(random_state=0))])


Test Accuracy: 0.772
Confusion Matrix:
[[840 216]
 [240 704]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.78      0.80      0.79      1056
  Informative       0.77      0.75      0.76       944

     accuracy                           0.77      2000
    macro avg       0.77      0.77      0.77      2000
 weighted avg       0.77      0.77      0.77      2000





### RandomForest GridSearchCV

In [None]:
# Two random-forest pipelines that differ only in the text representation:
# raw token counts (bag of words) versus TF-IDF re-weighted counts.
RF_bag_of_word = Pipeline(steps=[
    ('count', CountVectorizer()),
    ('rf', RandomForestClassifier(random_state=0, n_jobs=-1)),
])

RF_tfidf = Pipeline(steps=[
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('rf', RandomForestClassifier(random_state=0, n_jobs=-1)),
])

# Search space shared by both pipelines:
# 3 n-gram ranges x 3 forest sizes (100, 200, 300 trees) x 2 split criteria = 18 candidates.
RF_param_grid = dict(
    count__ngram_range=[(1, 1), (1, 2), (1, 3)],
    rf__n_estimators=range(100, 400, 100),
    rf__criterion=['gini', 'entropy'],
)

# 10 stratified random splits, each holding out 10% of the data passed to fit();
# the fixed seed keeps the splits identical between runs.
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)


In [None]:
# Grid search over RF_param_grid for the bag-of-words random-forest pipeline,
# ranking candidates by F1 on the `cv` splits.
RF_bag_of_word_CV = GridSearchCV(
    estimator=RF_bag_of_word,
    param_grid=RF_param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=4,
)
RF_bag_of_word_CV.fit(X_train, y_train)

# Best cross-validated score, the winning parameter combination and the
# pipeline refitted with it, followed by a blank separator.
print(
    'Bag of Word:',
    RF_bag_of_word_CV.best_score_,
    RF_bag_of_word_CV.best_params_,
    RF_bag_of_word_CV.best_estimator_,
    '\n',
    sep='\n',
)

# Score the refitted best pipeline on the test split.
RF_bag_of_word_CV_result = RF_bag_of_word_CV.predict(X_test)
printResults(y_test, RF_bag_of_word_CV_result, ['Uninformative', 'Informative'])

# Results pasted from a previous run of this cell (kept for reference):
# Bag of Word:
# 0.8212575719588717
# {'count__ngram_range': (1, 2), 'rf__criterion': 'gini', 'rf__n_estimators': 300}
# Pipeline(steps=[('count', CountVectorizer(ngram_range=(1, 2))),
#                 ('rf',
#                  RandomForestClassifier(n_estimators=300, n_jobs=-1,
#                                         random_state=0))])


# Test Accuracy: 0.755
# Confusion Matrix:
# [[883 173]
#  [317 627]]
# Classification Report:
#                precision    recall  f1-score   support

# Uninformative       0.74      0.84      0.78      1056
#   Informative       0.78      0.66      0.72       944

#      accuracy                           0.76      2000
#     macro avg       0.76      0.75      0.75      2000
#  weighted avg       0.76      0.76      0.75      2000



In [None]:
# Grid search over RF_param_grid for the TF-IDF random-forest pipeline,
# ranking candidates by F1 on the `cv` splits.
RF_tfidf_CV = GridSearchCV(
    estimator=RF_tfidf,
    param_grid=RF_param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=4,
)
RF_tfidf_CV.fit(X_train, y_train)

# Best cross-validated score, the winning parameter combination and the
# pipeline refitted with it, followed by a blank separator.
print(
    'TF-IDF:',
    RF_tfidf_CV.best_score_,
    RF_tfidf_CV.best_params_,
    RF_tfidf_CV.best_estimator_,
    '\n',
    sep='\n',
)

# Score the refitted best pipeline on the test split.
RF_tfidf_CV_result = RF_tfidf_CV.predict(X_test)
printResults(y_test, RF_tfidf_CV_result, ['Uninformative', 'Informative'])

# Results pasted from a previous run of this cell (kept for reference):
# TF-IDF:
# 0.8267672430952194
# {'count__ngram_range': (1, 2), 'rf__criterion': 'entropy', 'rf__n_estimators': 300}
# Pipeline(steps=[('count', CountVectorizer(ngram_range=(1, 2))),
#                 ('tfidf', TfidfTransformer()),
#                 ('rf',
#                  RandomForestClassifier(criterion='entropy', n_estimators=300,
#                                         n_jobs=-1, random_state=0))])


# Test Accuracy: 0.752
# Confusion Matrix:
# [[734 322]
#  [173 771]]
# Classification Report:
#                precision    recall  f1-score   support

# Uninformative       0.81      0.70      0.75      1056
#   Informative       0.71      0.82      0.76       944

#      accuracy                           0.75      2000
#     macro avg       0.76      0.76      0.75      2000
#  weighted avg       0.76      0.75      0.75      2000


### SVM GridSearchCV

In [None]:
# Two RBF-kernel SVM pipelines that differ only in the text representation:
# raw token counts (bag of words) versus TF-IDF re-weighted counts.
SVM_bag_of_word = Pipeline(steps=[
    ('count', CountVectorizer()),
    ('svm', svm.SVC(kernel='rbf', random_state=0)),
])

SVM_tfidf = Pipeline(steps=[
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('svm', svm.SVC(kernel='rbf', random_state=0)),
])

# Search space shared by both pipelines:
# 3 n-gram ranges x 5 values of C x 6 gamma settings = 90 candidates.
SVM_param_grid = dict(
    count__ngram_range=[(1, 1), (1, 2), (1, 3)],
    svm__C=[0.1, 0.5, 1, 3, 5],
    svm__gamma=[0.7, 0.8, 0.9, 1, 'auto', 'scale'],
)

# 10 stratified random splits, each holding out 10% of the data passed to fit();
# the fixed seed keeps the splits identical between runs.
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)


In [None]:
# Grid search over SVM_param_grid for the bag-of-words SVM pipeline,
# ranking candidates by F1 on the `cv` splits.
SVM_bag_of_word_CV = GridSearchCV(
    estimator=SVM_bag_of_word,
    param_grid=SVM_param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=7,
)
SVM_bag_of_word_CV.fit(X_train, y_train)

# Best cross-validated score, the winning parameter combination and the
# pipeline refitted with it, followed by a blank separator.
print(
    'Bag of Word:',
    SVM_bag_of_word_CV.best_score_,
    SVM_bag_of_word_CV.best_params_,
    SVM_bag_of_word_CV.best_estimator_,
    '\n',
    sep='\n',
)

# Score the refitted best pipeline on the test split.
SVM_bag_of_word_CV_result = SVM_bag_of_word_CV.predict(X_test)
printResults(y_test, SVM_bag_of_word_CV_result, ['Uninformative', 'Informative'])

# Results pasted from a previous run of this cell (kept for reference):
# Bag of Word:
# 0.8514456006409846
# {'count__ngram_range': (1, 3), 'svm__C': 5, 'svm__gamma': 'scale'}
# Pipeline(steps=[('count', CountVectorizer(ngram_range=(1, 3))),
#                 ('svm', SVC(C=5, random_state=0))])


# Test Accuracy: 0.771
# Confusion Matrix:
# [[806 250]
#  [207 737]]
# Classification Report:
#                precision    recall  f1-score   support

# Uninformative       0.80      0.76      0.78      1056
#   Informative       0.75      0.78      0.76       944

#      accuracy                           0.77      2000
#     macro avg       0.77      0.77      0.77      2000
#  weighted avg       0.77      0.77      0.77      2000



In [None]:
# Grid search over SVM_param_grid for the TF-IDF SVM pipeline,
# ranking candidates by F1 on the `cv` splits.
SVM_tfidf_CV = GridSearchCV(
    estimator=SVM_tfidf,
    param_grid=SVM_param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=7,
)
SVM_tfidf_CV.fit(X_train, y_train)

# Best cross-validated score, the winning parameter combination and the
# pipeline refitted with it, followed by a blank separator.
print(
    'TF-IDF:',
    SVM_tfidf_CV.best_score_,
    SVM_tfidf_CV.best_params_,
    SVM_tfidf_CV.best_estimator_,
    '\n',
    sep='\n',
)

# Score the refitted best pipeline on the test split.
SVM_tfidf_CV_result = SVM_tfidf_CV.predict(X_test)
printResults(y_test, SVM_tfidf_CV_result, ['Uninformative', 'Informative'])

# Results pasted from a previous run of this cell (kept for reference):
# TF-IDF:
# 0.859673551850527
# {'count__ngram_range': (1, 2), 'svm__C': 3, 'svm__gamma': 0.7}
# Pipeline(steps=[('count', CountVectorizer(ngram_range=(1, 2))),
#                 ('tfidf', TfidfTransformer()),
#                 ('svm', SVC(C=3, gamma=0.7, random_state=0))])


# Test Accuracy: 0.789
# Confusion Matrix:
# [[831 225]
#  [196 748]]
# Classification Report:
#                precision    recall  f1-score   support

# Uninformative       0.81      0.79      0.80      1056
#   Informative       0.77      0.79      0.78       944

#      accuracy                           0.79      2000
#     macro avg       0.79      0.79      0.79      2000
#  weighted avg       0.79      0.79      0.79      2000



### Best Estimators

In [None]:
# One pipeline per model family with its hyper-parameters fixed.
# The LR, RF and SVM settings match the grid-search results recorded above;
# the NB and KNN settings presumably come from searches earlier in the notebook.

# Naive Bayes on 1- to 3-gram counts.
best_NB = Pipeline(steps=[
    ('count', CountVectorizer(ngram_range=(1, 3))),
    ('nb', MultinomialNB()),
])

# Distance-weighted 35-nearest-neighbours on TF-IDF of 1- to 3-grams.
best_KNN = Pipeline(steps=[
    ('count', CountVectorizer(ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer()),
    ('knn', KNeighborsClassifier(n_neighbors=35, weights='distance')),
])

# L2-regularised logistic regression (liblinear) on 1- and 2-gram counts.
best_LR = Pipeline(steps=[
    ('count', CountVectorizer(ngram_range=(1, 2))),
    ('lr', LogisticRegression(solver='liblinear', max_iter=100, penalty='l2', random_state=0)),
])

# 300-tree Gini random forest on 1- and 2-gram counts.
best_RF = Pipeline(steps=[
    ('count', CountVectorizer(ngram_range=(1, 2))),
    ('rf', RandomForestClassifier(n_estimators=300, criterion='gini', random_state=0)),
])

# RBF SVM on TF-IDF of 1- and 2-grams. probability=True enables predict_proba,
# which the soft-voting ensembles later in the notebook rely on.
best_SVM = Pipeline(steps=[
    ('count', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('svm', svm.SVC(kernel='rbf', C=3, gamma=0.7, probability=True, random_state=0)),
])

best_classifiers = [best_NB, best_KNN, best_LR, best_RF, best_SVM]

In [None]:
# Fit every tuned pipeline on the training split, then print the fitted
# pipeline followed by its test-set metrics.
for classifier in best_classifiers:
    # Pipeline.fit returns the pipeline itself, so fit and predict can be chained.
    classifier_result = classifier.fit(X_train, y_train).predict(X_test)
    print(classifier)
    printResults(y_test, classifier_result, ['Uninformative', 'Informative'])

Pipeline(steps=[('count', CountVectorizer(ngram_range=(1, 3))),
                ('nb', MultinomialNB())])
Test Accuracy: 0.777
Confusion Matrix:
[[817 239]
 [207 737]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.80      0.77      0.79      1056
  Informative       0.76      0.78      0.77       944

     accuracy                           0.78      2000
    macro avg       0.78      0.78      0.78      2000
 weighted avg       0.78      0.78      0.78      2000



Pipeline(steps=[('count', CountVectorizer(ngram_range=(1, 3))),
                ('tfidf', TfidfTransformer()),
                ('knn',
                 KNeighborsClassifier(n_neighbors=35, weights='distance'))])
Test Accuracy: 0.736
Confusion Matrix:
[[671 385]
 [143 801]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.82      0.64      0.72      1056
  Informative       0.68      0.85      0.75       944

     accur

### Use all the Best Estimators with a Voting Classifier

In [None]:
# The five tuned pipelines, named as VotingClassifier expects them.
voting_members = [
    ('nb', best_NB),
    ('knn', best_KNN),
    ('lr', best_LR),
    ('rf', best_RF),
    ('svm', best_SVM),
]

# Hard voting takes the majority of predicted labels; soft voting averages the
# predicted class probabilities. Each ensemble receives its own copy of the list.
voting_hard_clf = VotingClassifier(estimators=list(voting_members), voting='hard', n_jobs=-1)
voting_soft_clf = VotingClassifier(estimators=list(voting_members), voting='soft', n_jobs=-1)

# Unweighted hard-voting ensemble: fit, predict, report.
voting_hard_clf.fit(X_train, y_train)
voting_hard_clf_result = voting_hard_clf.predict(X_test)
printResults(y_test, voting_hard_clf_result, ['Uninformative', 'Informative'])

# Unweighted soft-voting ensemble: fit, predict, report.
voting_soft_clf.fit(X_train, y_train)
voting_soft_clf_result = voting_soft_clf.predict(X_test)
printResults(y_test, voting_soft_clf_result, ['Uninformative', 'Informative'])

Test Accuracy: 0.787
Confusion Matrix:
[[847 209]
 [216 728]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.80      0.80      0.80      1056
  Informative       0.78      0.77      0.77       944

     accuracy                           0.79      2000
    macro avg       0.79      0.79      0.79      2000
 weighted avg       0.79      0.79      0.79      2000



Test Accuracy: 0.789
Confusion Matrix:
[[827 229]
 [193 751]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.81      0.78      0.80      1056
  Informative       0.77      0.80      0.78       944

     accuracy                           0.79      2000
    macro avg       0.79      0.79      0.79      2000
 weighted avg       0.79      0.79      0.79      2000





In [None]:
# Weighted hard voting: NB, LR and SVM each count five times as much as KNN and RF.
voting_hard_clf2 = VotingClassifier(
    estimators=[
        ('nb', best_NB),
        ('knn', best_KNN),
        ('lr', best_LR),
        ('rf', best_RF),
        ('svm', best_SVM),
    ],
    voting='hard',
    weights=[5, 1, 5, 1, 5],  # same order as `estimators`
    n_jobs=-1,
)

voting_hard_clf2.fit(X_train, y_train)
voting_hard_clf2_result = voting_hard_clf2.predict(X_test)
printResults(y_test, voting_hard_clf2_result, ['Uninformative', 'Informative'])

Test Accuracy: 0.790
Confusion Matrix:
[[848 208]
 [211 733]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.80      0.80      0.80      1056
  Informative       0.78      0.78      0.78       944

     accuracy                           0.79      2000
    macro avg       0.79      0.79      0.79      2000
 weighted avg       0.79      0.79      0.79      2000





In [None]:
# Weighted soft voting: the class probabilities of NB, LR and SVM are weighted
# five times as heavily as those of KNN and RF.
voting_soft_clf2 = VotingClassifier(
    estimators=[
        ('nb', best_NB),
        ('knn', best_KNN),
        ('lr', best_LR),
        ('rf', best_RF),
        ('svm', best_SVM),
    ],
    voting='soft',
    weights=[5, 1, 5, 1, 5],  # same order as `estimators`
    n_jobs=-1,
)

voting_soft_clf2.fit(X_train, y_train)
voting_soft_clf2_result = voting_soft_clf2.predict(X_test)
printResults(y_test, voting_soft_clf2_result, ['Uninformative', 'Informative'])

# Results pasted from a previous run of this cell (kept for reference):
# Test Accuracy: 0.798
# Confusion Matrix:
# [[841 215]
#  [189 755]]
# Classification Report:
#                precision    recall  f1-score   support

# Uninformative       0.82      0.80      0.81      1056
#   Informative       0.78      0.80      0.79       944

#      accuracy                           0.80      2000
#     macro avg       0.80      0.80      0.80      2000
#  weighted avg       0.80      0.80      0.80      2000

Test Accuracy: 0.798
Confusion Matrix:
[[841 215]
 [189 755]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.82      0.80      0.81      1056
  Informative       0.78      0.80      0.79       944

     accuracy                           0.80      2000
    macro avg       0.80      0.80      0.80      2000
 weighted avg       0.80      0.80      0.80      2000





## Semi-Supervised

In [None]:
# Load the pre-processed, unlabelled tweets (tab-separated).
# read_csv already returns a DataFrame, so no extra wrapping is needed.
unlabeled_test_with_noise_preprocessed = pd.read_csv(
    './Dataset_preprocessed/unlabeled_test_with_noise_preprocessed.tsv',
    sep='\t',
)

In [None]:
# Mark every unlabelled row with -1, the value SelfTrainingClassifier treats
# as "no label".
unlabeled = -np.ones(len(unlabeled_test_with_noise_preprocessed), dtype=np.int8)

# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail on a second run. Overwrite the column in that case so the cell
# is idempotent; otherwise insert it as the first column, as before.
if 'Label' in unlabeled_test_with_noise_preprocessed.columns:
    unlabeled_test_with_noise_preprocessed['Label'] = unlabeled
else:
    unlabeled_test_with_noise_preprocessed.insert(0, 'Label', unlabeled)

unlabeled_test_with_noise_preprocessed

Unnamed: 0,Label,Text,Text_length,Text_pre_processed,Text_pre_processed_uncased,Text_cleaned_uncased,Text_cleaned_uncased_length,Text_cleaned_uncased_tokens,Text_cleaned_uncased_tokens_length
0,-1,Fox Business' Lou Dobbs Self-Quarantines After...,79,Fox Business Lou Dobbs SelfQuarantines After S...,fox business lou dobbs selfquarantines after s...,fox busi lou dobb selfquarantin staffer test p...,54,"['fox', 'busi', 'lou', 'dobb', 'selfquarantin'...",9
1,-1,Results from UVRI showed the sample is positiv...,210,Results from UVRI showed the sample is positiv...,results from uvri showed the sample is positiv...,result uvri show sampl posit coronavirus confi...,128,"['result', 'uvri', 'show', 'sampl', 'posit', '...",27
2,-1,"Today or tomorrow, the number of #COVIDー19 cas...",150,Today or tomorrow the number of coronavirus ca...,today or tomorrow the number of coronavirus ca...,today tomorrow number coronavirus case global ...,99,"['today', 'tomorrow', 'number', 'coronavirus',...",20
3,-1,Ramsey County veterans experiencing negative f...,169,Ramsey County veterans experiencing negative f...,ramsey county veterans experiencing negative f...,ramsey counti veteran experienc negat financi ...,114,"['ramsey', 'counti', 'veteran', 'experienc', '...",17
4,-1,The #Covid19 death rate in New Orleans is 7x h...,177,The coronavirus death rate in New Orleans is 7...,the coronavirus death rate in new orleans is 7...,coronavirus death rate new orlean 7x higher ne...,117,"['coronavirus', 'death', 'rate', 'new', 'orlea...",24
...,...,...,...,...,...,...,...,...,...
11995,-1,COVID-19 in Michigan: Confirmed cases now at 7...,76,coronavirus in Michigan Confirmed cases now at...,coronavirus in michigan confirmed cases now at...,coronavirus michigan confirm case 787 5 death ...,52,"['coronavirus', 'michigan', 'confirm', 'case',...",10
11996,-1,BREAKING: Two patients recovered from Covid-19...,68,BREAKING Two patients recovered from coronavir...,breaking two patients recovered from coronavir...,break two patient recov coronavirus discharg d...,49,"['break', 'two', 'patient', 'recov', 'coronavi...",9
11997,-1,"This is Jains Kences Retreat, Virugambakkam. (...",186,This is Jains Kences Retreat Virugambakkam Yes...,this is jains kences retreat virugambakkam yes...,jain kenc retreat virugambakkam yes apart got ...,111,"['jain', 'kenc', 'retreat', 'virugambakkam', '...",19
11998,-1,@USER The more they address the more the sprea...,210,The more they address the more the spread Ger...,the more they address the more the spread ger...,address spread germani 12300 confirm coronavi...,116,"['address', 'spread', 'germani', '12300', 'con...",23


In [None]:
# Stack the labelled train and validation rows on top of the unlabelled (-1) rows
# and renumber the index from 0.
semi_supervised_frames = [
    train_preprocessed_data,
    valid_preprocessed_data,
    unlabeled_test_with_noise_preprocessed,
]
semi_supervised_data = pd.concat(semi_supervised_frames, axis=0, ignore_index=True)
semi_supervised_data

Unnamed: 0,Label,Text,Text_length,Text_pre_processed,Text_pre_processed_uncased,Text_cleaned_uncased,Text_cleaned_uncased_length,Text_cleaned_uncased_tokens,Text_cleaned_uncased_tokens_length
0,1,Official death toll from #covid19 in the Unite...,216,Official death toll from coronavirus in the Un...,official death toll from coronavirus in the un...,offici death toll coronavirus unit kingdom gre...,159,"['offici', 'death', 'toll', 'coronavirus', 'un...",28
1,1,"Dearest Mr. President @USER 1,169 coronavirus ...",220,Dearest Mr President 1169 coronavirus deaths ...,dearest mr president 1169 coronavirus deaths ...,dearest mr presid 1169 coronavirus death us 24...,146,"['dearest', 'mr', 'presid', '1169', 'coronavir...",24
2,1,Latest Updates March 20 ⚠️5274 new cases and 3...,233,Latest Updates March 20 5274 new cases and 38 ...,latest updates march 20 5274 new cases and 38 ...,latest updat march 20 5274 new case 38 new dea...,174,"['latest', 'updat', 'march', '20', '5274', 'ne...",32
3,1,真把公主不当干部 BREAKING: 21 people on Grand Princess...,205,BREAKING 21 people on Grand Princess cruise s...,breaking 21 people on grand princess cruise s...,break 21 peopl grand princess cruis ship dock...,136,"['break', '21', 'peopl', 'grand', 'princess', ...",28
4,0,OKLAHOMA CITY — The State Department of Educat...,187,OKLAHOMA CITY The State Department of Educati...,oklahoma city the state department of educati...,oklahoma citi state depart educ announc monday...,129,"['oklahoma', 'citi', 'state', 'depart', 'educ'...",23
...,...,...,...,...,...,...,...,...,...
19995,-1,COVID-19 in Michigan: Confirmed cases now at 7...,76,coronavirus in Michigan Confirmed cases now at...,coronavirus in michigan confirmed cases now at...,coronavirus michigan confirm case 787 5 death ...,52,"['coronavirus', 'michigan', 'confirm', 'case',...",10
19996,-1,BREAKING: Two patients recovered from Covid-19...,68,BREAKING Two patients recovered from coronavir...,breaking two patients recovered from coronavir...,break two patient recov coronavirus discharg d...,49,"['break', 'two', 'patient', 'recov', 'coronavi...",9
19997,-1,"This is Jains Kences Retreat, Virugambakkam. (...",186,This is Jains Kences Retreat Virugambakkam Yes...,this is jains kences retreat virugambakkam yes...,jain kenc retreat virugambakkam yes apart got ...,111,"['jain', 'kenc', 'retreat', 'virugambakkam', '...",19
19998,-1,@USER The more they address the more the sprea...,210,The more they address the more the spread Ger...,the more they address the more the spread ger...,address spread germani 12300 confirm coronavi...,116,"['address', 'spread', 'germani', '12300', 'con...",23


In [None]:
# Semi-supervised training set: pre-processed text plus labels, where -1 marks
# the unlabelled rows.
X_semi_train, y_semi_train = (
    semi_supervised_data['Text_pre_processed'],
    semi_supervised_data['Label'],
)

# Labelled test split used to evaluate every semi-supervised model.
X_test, y_test = (
    test_preprocessed_data['Text_pre_processed'],
    test_preprocessed_data['Label'],
)

### Self-Training Pipelines Using the Best Parameters

In [None]:
def _self_training(base_estimator, threshold, max_iter):
    """Wrap `base_estimator` in a verbose SelfTrainingClassifier.

    `threshold` and `max_iter` are forwarded unchanged to SelfTrainingClassifier.
    """
    return SelfTrainingClassifier(base_estimator, threshold=threshold, max_iter=max_iter, verbose=True)


# Same feature extraction and hyper-parameters as the best_* pipelines, with the
# final classifier wrapped for self-training.
semi_NB = Pipeline(steps=[
    ('count', CountVectorizer(ngram_range=(1, 3))),
    ('nb', _self_training(MultinomialNB(), threshold=0.99, max_iter=15)),
])

semi_KNN = Pipeline(steps=[
    ('count', CountVectorizer(ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer()),
    ('knn', _self_training(KNeighborsClassifier(n_neighbors=35, weights='distance'),
                           threshold=0.99, max_iter=15)),
])

semi_LR = Pipeline(steps=[
    ('count', CountVectorizer(ngram_range=(1, 2))),
    ('lr', _self_training(LogisticRegression(solver='liblinear', max_iter=100, penalty='l2', random_state=0),
                          threshold=0.99, max_iter=15)),
])

# The random forest and the SVM use a lower threshold and fewer iterations.
semi_RF = Pipeline(steps=[
    ('count', CountVectorizer(ngram_range=(1, 2))),
    ('rf', _self_training(RandomForestClassifier(n_estimators=300, criterion='gini', random_state=0, n_jobs=-1),
                          threshold=0.9, max_iter=5)),
])

semi_SVM = Pipeline(steps=[
    ('count', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('svm', _self_training(svm.SVC(kernel='rbf', C=3, gamma=0.7, probability=True, random_state=0),
                           threshold=0.97, max_iter=5)),
])


In [None]:
# Self-train the Naive Bayes pipeline on the labelled and unlabelled rows.
semi_NB.fit(X_semi_train, y_semi_train)

# Keep both outputs: hard labels for hard voting, class probabilities for soft voting.
semi_NB_result = semi_NB.predict(X_test)
semi_NB_probablity = semi_NB.predict_proba(X_test)

printResults(y_test, semi_NB_result, ['Uninformative', 'Informative'])

End of iteration 1, added 9386 new labels.
End of iteration 2, added 1301 new labels.
End of iteration 3, added 160 new labels.
End of iteration 4, added 24 new labels.
End of iteration 5, added 3 new labels.
Test Accuracy: 0.769
Confusion Matrix:
[[784 272]
 [189 755]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.81      0.74      0.77      1056
  Informative       0.74      0.80      0.77       944

     accuracy                           0.77      2000
    macro avg       0.77      0.77      0.77      2000
 weighted avg       0.77      0.77      0.77      2000





In [None]:
# Self-train the KNN pipeline on the labelled and unlabelled rows.
semi_KNN.fit(X_semi_train, y_semi_train)

# Keep both outputs: hard labels for hard voting, class probabilities for soft voting.
semi_KNN_result = semi_KNN.predict(X_test)
semi_KNN_probablity = semi_KNN.predict_proba(X_test)

printResults(y_test, semi_KNN_result, ['Uninformative', 'Informative'])

End of iteration 1, added 257 new labels.
End of iteration 2, added 212 new labels.
End of iteration 3, added 120 new labels.
End of iteration 4, added 87 new labels.
End of iteration 5, added 51 new labels.
End of iteration 6, added 36 new labels.
End of iteration 7, added 39 new labels.
End of iteration 8, added 23 new labels.
End of iteration 9, added 4 new labels.
End of iteration 10, added 2 new labels.
End of iteration 11, added 2 new labels.
End of iteration 12, added 2 new labels.
End of iteration 13, added 1 new labels.
Test Accuracy: 0.716
Confusion Matrix:
[[611 445]
 [123 821]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.83      0.58      0.68      1056
  Informative       0.65      0.87      0.74       944

     accuracy                           0.72      2000
    macro avg       0.74      0.72      0.71      2000
 weighted avg       0.75      0.72      0.71      2000





In [None]:
# Self-train the logistic-regression pipeline on the labelled and unlabelled rows.
semi_LR.fit(X_semi_train, y_semi_train)

# Keep both outputs: hard labels for hard voting, class probabilities for soft voting.
semi_LR_result = semi_LR.predict(X_test)
semi_LR_probablity = semi_LR.predict_proba(X_test)

printResults(y_test, semi_LR_result, ['Uninformative', 'Informative'])

End of iteration 1, added 2208 new labels.
End of iteration 2, added 213 new labels.
End of iteration 3, added 51 new labels.
End of iteration 4, added 14 new labels.
End of iteration 5, added 4 new labels.
End of iteration 6, added 1 new labels.
Test Accuracy: 0.783
Confusion Matrix:
[[885 171]
 [263 681]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.77      0.84      0.80      1056
  Informative       0.80      0.72      0.76       944

     accuracy                           0.78      2000
    macro avg       0.79      0.78      0.78      2000
 weighted avg       0.78      0.78      0.78      2000





In [None]:
# Self-train the random-forest pipeline on the labelled and unlabelled rows.
semi_RF.fit(X_semi_train, y_semi_train)

# Keep both outputs: hard labels for hard voting, class probabilities for soft voting.
semi_RF_result = semi_RF.predict(X_test)
semi_RF_probablity = semi_RF.predict_proba(X_test)

printResults(y_test, semi_RF_result, ['Uninformative', 'Informative'])

End of iteration 1, added 22 new labels.
End of iteration 2, added 11 new labels.
End of iteration 3, added 5 new labels.
End of iteration 4, added 7 new labels.
End of iteration 5, added 6 new labels.
Test Accuracy: 0.757
Confusion Matrix:
[[894 162]
 [324 620]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.73      0.85      0.79      1056
  Informative       0.79      0.66      0.72       944

     accuracy                           0.76      2000
    macro avg       0.76      0.75      0.75      2000
 weighted avg       0.76      0.76      0.75      2000





In [None]:
# Self-train the SVM pipeline on the labelled and unlabelled rows.
semi_SVM.fit(X_semi_train, y_semi_train)

# Keep both outputs: hard labels for hard voting, class probabilities for soft voting.
semi_SVM_result = semi_SVM.predict(X_test)
semi_SVM_probablity = semi_SVM.predict_proba(X_test)

printResults(y_test, semi_SVM_result, ['Uninformative', 'Informative'])

End of iteration 1, added 3243 new labels.
End of iteration 2, added 469 new labels.
End of iteration 3, added 131 new labels.
End of iteration 4, added 91 new labels.
End of iteration 5, added 43 new labels.
Test Accuracy: 0.775
Confusion Matrix:
[[909 147]
 [303 641]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.75      0.86      0.80      1056
  Informative       0.81      0.68      0.74       944

     accuracy                           0.78      2000
    macro avg       0.78      0.77      0.77      2000
 weighted avg       0.78      0.78      0.77      2000





### Hard Voting

In [None]:
import os

# One column of test-set predictions per self-trained model.
semi_results = {'semi_NB_result': semi_NB_result,
                'semi_KNN_result': semi_KNN_result,
                'semi_LR_result': semi_LR_result,
                'semi_RF_result': semi_RF_result,
                'semi_SVM_result': semi_SVM_result}

semi_results_df = pd.DataFrame(data=semi_results)

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs('./Semi_Voting_Results', exist_ok=True)
semi_results_df.to_csv('./Semi_Voting_Results/semi_ML_results.tsv', sep='\t', index=False, header=True)

In [None]:
def _majority_label(votes_row):
    """Return the label that occurs most often in one row of model predictions.

    When several labels are equally frequent, the one appearing first in the
    row is returned (the behaviour of ``max`` with a count-based key).
    """
    votes = votes_row.to_list()
    return max(votes, key=votes.count)


# Reload the per-model predictions and add the row-wise majority vote as a new column.
semi_results_data = pd.read_csv('./Semi_Voting_Results/semi_ML_results.tsv', sep='\t')
semi_results_data['semi_hard_voting_clf_result'] = semi_results_data.apply(_majority_label, axis=1)
semi_results_data

Unnamed: 0,semi_NB_result,semi_KNN_result,semi_LR_result,semi_RF_result,semi_SVM_result,semi_hard_voting_clf_result
0,1,1,0,1,1,1
1,1,1,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
...,...,...,...,...,...,...
1995,0,0,0,0,0,0
1996,0,1,0,0,1,0
1997,1,1,1,1,1,1
1998,1,1,1,1,1,1


In [None]:
# Report the test-set metrics of the hard-voting (majority) labels.
semi_hard_voting_column = semi_results_data['semi_hard_voting_clf_result']
semi_hard_voting_clf_result = semi_hard_voting_column.tolist()
printResults(y_test, semi_hard_voting_clf_result, ['Uninformative', 'Informative'])

Test Accuracy: 0.783
Confusion Matrix:
[[868 188]
 [246 698]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.78      0.82      0.80      1056
  Informative       0.79      0.74      0.76       944

     accuracy                           0.78      2000
    macro avg       0.78      0.78      0.78      2000
 weighted avg       0.78      0.78      0.78      2000





### Soft Voting

In [None]:
def _export_class_probabilities(probabilities, tsv_path):
    """Round a two-column probability array to 5 decimals and write it as a TSV.

    Column 0 is stored as 'Uninformative' and column 1 as 'Informative'.
    Returns the column dict and the DataFrame built from it.
    """
    columns = {'Uninformative': np.round(probabilities[:, 0], 5),
               'Informative': np.round(probabilities[:, 1], 5)}
    frame = pd.DataFrame(data=columns)
    frame.to_csv(tsv_path, sep='\t', index=False, header=True)
    return columns, frame


# Persist the test-set class probabilities of each self-trained model.
semi_NB_probablity_dict, semi_NB_probablity_df = _export_class_probabilities(
    semi_NB_probablity, './Semi_Voting_Results/semi_NB_probablity.tsv')

semi_KNN_probablity_dict, semi_KNN_probablity_df = _export_class_probabilities(
    semi_KNN_probablity, './Semi_Voting_Results/semi_KNN_probablity.tsv')

semi_LR_probablity_dict, semi_LR_probablity_df = _export_class_probabilities(
    semi_LR_probablity, './Semi_Voting_Results/semi_LR_probablity.tsv')

semi_RF_probablity_dict, semi_RF_probablity_df = _export_class_probabilities(
    semi_RF_probablity, './Semi_Voting_Results/semi_RF_probablity.tsv')

semi_SVM_probablity_dict, semi_SVM_probablity_df = _export_class_probabilities(
    semi_SVM_probablity, './Semi_Voting_Results/semi_SVM_probablity.tsv')

In [None]:
def _read_probabilities(tsv_path):
    """Load a saved probability TSV and return its values as a NumPy array."""
    return pd.read_csv(tsv_path, sep='\t').to_numpy()


# Reload the saved (rounded) probabilities, replacing the in-memory arrays.
semi_NB_probablity = _read_probabilities('./Semi_Voting_Results/semi_NB_probablity.tsv')
semi_KNN_probablity = _read_probabilities('./Semi_Voting_Results/semi_KNN_probablity.tsv')
semi_LR_probablity = _read_probabilities('./Semi_Voting_Results/semi_LR_probablity.tsv')
semi_RF_probablity = _read_probabilities('./Semi_Voting_Results/semi_RF_probablity.tsv')
semi_SVM_probablity = _read_probabilities('./Semi_Voting_Results/semi_SVM_probablity.tsv')

In [None]:
# Soft voting: unweighted mean of the five models' class probabilities.
semi_probability_sum = (
    semi_NB_probablity
    + semi_KNN_probablity
    + semi_LR_probablity
    + semi_RF_probablity
    + semi_SVM_probablity
)
semi_results_soft_voting_confidence = semi_probability_sum / 5

# Column 0 -> 'Uninformative', column 1 -> 'Informative', rounded to 5 decimals.
semi_results_soft_voting_confidence_dict = {
    'Uninformative': np.round(semi_results_soft_voting_confidence[:, 0], 5),
    'Informative': np.round(semi_results_soft_voting_confidence[:, 1], 5),
}

# Persist the averaged confidences as a TSV.
semi_results_soft_voting_confidence_df = pd.DataFrame(data=semi_results_soft_voting_confidence_dict)
semi_results_soft_voting_confidence_df.to_csv(
    './Semi_Voting_Results/semi_ML_soft_voting_confidence.tsv',
    sep='\t',
    index=False,
    header=True,
)

In [None]:
# Reload the averaged class confidences (read_csv already returns a DataFrame).
semi_results_soft_voting_confidence_data = pd.read_csv(
    './Semi_Voting_Results/semi_ML_soft_voting_confidence.tsv',
    sep='\t',
)

# Soft-voting label: 0 (Uninformative) only when its confidence is strictly
# higher, otherwise 1 (Informative), so ties go to Informative. This is a
# vectorised comparison instead of a Python lambda applied row by row.
semi_results_soft_voting_confidence_data['semi_soft_voting_clf_result'] = np.where(
    semi_results_soft_voting_confidence_data['Uninformative']
    > semi_results_soft_voting_confidence_data['Informative'],
    0,
    1,
)
semi_results_soft_voting_confidence_data

Unnamed: 0,Uninformative,Informative,semi_soft_voting_clf_result
0,0.38526,0.61474,1
1,0.40771,0.59229,1
2,0.87706,0.12294,0
3,0.91279,0.08721,0
4,0.85988,0.14012,0
...,...,...,...
1995,0.89138,0.10862,0
1996,0.54671,0.45329,0
1997,0.06617,0.93383,1
1998,0.26121,0.73879,1


In [None]:
# Report the test-set metrics of the soft-voting labels.
semi_soft_voting_column = semi_results_soft_voting_confidence_data['semi_soft_voting_clf_result']
semi_soft_voting_clf_result = semi_soft_voting_column.tolist()
printResults(y_test, semi_soft_voting_clf_result, ['Uninformative', 'Informative'])

Test Accuracy: 0.792
Confusion Matrix:
[[840 216]
 [199 745]]
Classification Report:
               precision    recall  f1-score   support

Uninformative       0.81      0.80      0.80      1056
  Informative       0.78      0.79      0.78       944

     accuracy                           0.79      2000
    macro avg       0.79      0.79      0.79      2000
 weighted avg       0.79      0.79      0.79      2000



