<a href="https://colab.research.google.com/github/TosigneSamuel/DS_ML_project_23/blob/main/text-classifier-lr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Text classifier Colab**

This Colab notebook allows you to categorise a set of scientific papers into two categories. This is experimental code

**Note**: Name your training file *training.csv*  and test file *testing.csv* (*title* column should be named 'Title' or 'title' and *abstract* column if present should be named 'Abstract' or 'abstract'), and upload it by pressing the upload button on the top left of the left sidebar. The results will appear in a folder named *RESULTS*. RESULTS folder will be automatically created by the code.


In [1]:
#@title Install Python packages { form-width: "20%" }

#@markdown Please execute this cell by pressing the _Play_ button
#@markdown on the left to download and import third-party software
#@markdown in this Colab notebook.

#@markdown This installs the software on the Colab
#@markdown notebook in the cloud and not on your computer.
from IPython.utils import io
try:
  with io.capture_output() as captured:
    # %shell pip install scispacy
    # %shell pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_md-0.5.0.tar.gz
    # %shell pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz
    # %shell pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bionlp13cg_md-0.5.1.tar.gz
    %shell pip install pyLDAvis==2.1.2
    %shell pip install import-ipynb
    %shell pip install pandas
    %shell pip install shutup

except subprocess.CalledProcessError:
  print(captured)
  raise
import shutup
shutup.please()

import os
import numpy as np
import spacy
# import scispacy
import pandas as pd
# from scispacy.abbreviation import AbbreviationDetector

from pathlib import Path
import collections
import csv
import multiprocessing as mp
from multiprocessing import Pool

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier



cpu_count = mp.cpu_count()

pd. set_option('display.max_colwidth', None)

In [None]:
#@title Create train/test datasets from human/animal datasets { form-width: "20%" }
animal = pd.read_csv('excludes_Animal_2200.csv')
human = pd.read_csv('includes_human_2400.csv')

#add target variable
animal['target'] = 0
human['target'] = 1

print(animal.columns)
print(human.columns)

#combine & shuffle the datasets
combined_data = pd.concat([animal, human], axis=0)
shuffled_combined_df = combined_data.sample(frac=1).reset_index(drop=True)

#create a 80-20 split from it
training, testing = train_test_split(shuffled_combined_df, test_size=0.2, random_state=42)


Index(['Title', 'Abstract', 'Primary Author', 'Journal', 'Year', 'Volume',
       'Issue', 'Pages', 'Comments', 'Eppi ID', 'target'],
      dtype='object')
Index(['Title', 'Abstract', 'Primary Author', 'Journal', 'Year', 'Volume',
       'Issue', 'Pages', 'Comments', 'Eppi ID', 'target'],
      dtype='object')


In [None]:
#@title File settings to get started  { form-width: "20%" }

#@markdown Please ensure the training.csv and testing.csv are uploaded and execute this cell by pressing the _Play_ button
#@markdown on the left

#@markdown The training.csv and testing.csv files should have 'title', optional 'abstract' fields. Additionally the file should have a 'target' field
#@markdown which indicates whether the title/abstract is an include (coded as 1) or exclude (coded as 0)
TRAIN_PATH = 'training.csv'
TEST_PATH = 'testing.csv'

results_folder = 'RESULTS'
RESULTS_FOLDER = results_folder     #***user input
if not os.path.isdir(RESULTS_FOLDER):
    os.makedirs(RESULTS_FOLDER)
RESULTS_PATH = Path(RESULTS_FOLDER)

In [None]:
#@title Read in input data as separate training.csv and testing.csv. **Ignore** this block if human/animal data was uploaded above { form-width: "20%" }
try:
    training = pd.read_csv(TRAIN_PATH)
    orig_colnames = training.columns
    print(orig_colnames)

    testing = pd.read_csv(TEST_PATH)

except Exception as e:
    print(e)
    raise

In [None]:
#@title Read in input data { form-width: "20%" }
rename_map = {'Title': 'title', 'Abstract': 'abstract'}
training.rename(columns = rename_map, inplace = True)
testing.rename(columns = rename_map, inplace = True)
print("Number of studies in the training dataset: " + str(training.shape[0]))
print("Number of studies in the training dataset: " + str(testing.shape[0]))

#rename the columns so that the relevant column names are 'title' and 'abstract'

try:
  training['title_orig'] = training['title']
  testing['title_orig'] = testing['title']
except Exception as e:
  print(e)
  print("Error- No title detected! Title is needed!")
  raise

# drop any duplicates based on 'title'
training.drop_duplicates(subset=['title'], inplace=True)
testing.drop_duplicates(subset=['title'], inplace=True)
print("Number of studies in the training dataset after de-dupe: " + str(training.shape[0]))
print("Number of studies in the testing dataset after de-dupe: " + str(testing.shape[0]))

training['titleabstract'] = training['title'] + " " + training['abstract']
training['titleabstract'] = training['titleabstract'].str.lower()

testing['titleabstract'] = testing['title'] + " " + testing['abstract']
testing['titleabstract'] = testing['titleabstract'].str.lower()

Number of studies in the training dataset: 3693
Number of studies in the training dataset: 925
Number of studies in the training dataset after de-dupe: 3693
Number of studies in the testing dataset after de-dupe: 925


In [None]:
#@title Fit logistic regression model (in progress) { form-width: "20%" }

#A sklearn pipeline comprising of tf-idf vectorizer (using tri-gram) and logistic regression model. The parameters for logistic regression
#are taken from prior hyper-parameter tuning.
text_clf = Pipeline([
                ('tfidfvect', TfidfVectorizer(ngram_range = (3,3), stop_words = 'english')),
                ('clf', LogisticRegression(C=100, max_iter = 5000, solver = 'liblinear', penalty = 'l2', class_weight = 'balanced')),
               ])
y_train = training['target']
model = text_clf.fit(training['titleabstract'].astype(str),y_train)



In [None]:
#@title Predict category and evaluate performance (in progress) { form-width: "20%" }

#Using the model that was fit to the training data above, evaluate the model's performance on test data.
data = testing['titleabstract'].astype(str)
y_test = testing['target']
yhat = model.predict(data)
yhat_probs = model.predict_proba(data)[:,1]
yhat_adjusted = np.zeros(data.shape[0], dtype=int)
THRESHOLD = 0.4
yhat_adjusted[yhat_probs >= THRESHOLD] = 1

report_dict = {}
decimal_places = 3
report_dict['Accuracy'] = accuracy_score(y_test, yhat_adjusted).round(decimal_places)
report_dict['Precision'] = precision_score(y_test,yhat_adjusted).round(decimal_places)
report_dict['Recall'] = recall_score(y_test, yhat_adjusted, average = 'binary').round(decimal_places)
report_dict['F1-Score'] = f1_score(y_test, yhat_adjusted).round(decimal_places)
report_dict['ROC_AUC'] = roc_auc_score(y_test, yhat_adjusted).round(decimal_places)
cm = confusion_matrix(y_test, yhat_adjusted)
FP = cm[0][1]
TN = cm[0][0]
FN = cm[1][0]
TP = cm[1][1]
specificity = (TN / (TN+FP)).round(decimal_places)
FPR = (FP/(FP+TN)).round(decimal_places)
FNR = (FN/(FN+TP)).round(decimal_places)
report_dict['FPR'] = FPR
report_dict['FNR'] = FNR
report_dict['Specificity'] = specificity

print('Classification report:\n{}'.format(report_dict))


Classification report:
{'Accuracy': 0.826, 'Precision': 0.764, 'Recall': 0.974, 'F1-Score': 0.856, 'ROC_AUC': 0.816, 'FPR': 0.343, 'FNR': 0.026, 'Specificity': 0.657}


**Data Preprocessing**


In [6]:
#@title Basic Preprocessing { form-width: "20%" }

# Load dataset
animal = pd.read_csv('excludes_Animal_2200.csv')
human = pd.read_csv('includes_human_2400.csv')

# prompt: shape of the df
(animal.shape), (human.shape)

#add target variable
animal['target'] = 0
human['target'] = 1

In [None]:
animal.isna().sum(), human.isna().sum()


(Title                0
 Abstract            61
 Primary Author       4
 Journal              0
 Year                 0
 Volume              30
 Issue              687
 Pages              190
 Comments          2212
 Eppi ID              0
 target               0
 dtype: int64,
 Title                0
 Abstract           410
 Primary Author       0
 Journal              0
 Year                 2
 Volume              25
 Issue              113
 Pages                6
 Comments          2411
 Eppi ID              0
 target               0
 dtype: int64)

In [None]:
animal.columns, human.columns

(Index(['Title', 'Abstract', 'Primary Author', 'Journal', 'Year', 'Volume',
        'Issue', 'Pages', 'Comments', 'Eppi ID', 'target'],
       dtype='object'),
 Index(['Title', 'Abstract', 'Primary Author', 'Journal', 'Year', 'Volume',
        'Issue', 'Pages', 'Comments', 'Eppi ID', 'target'],
       dtype='object'))

Deleting unwanted columns

In [None]:
human.drop(columns=['Primary Author', 'Journal', 'Year', 'Volume',
       'Issue', 'Pages', 'Comments', 'Eppi ID', 'target'], inplace=True)

In [None]:
animal.drop(columns=['Primary Author', 'Journal', 'Year', 'Volume',
       'Issue', 'Pages', 'Comments', 'Eppi ID', 'target'], inplace=True)

In [None]:
animal.columns.shape

(2,)

In [None]:
human.columns

Index(['Title', 'Abstract'], dtype='object')

In [None]:
animal.isna().sum()

Title        0
Abstract    61
dtype: int64

In [None]:
human.isnull().sum()

Title         0
Abstract    410
dtype: int64

In [None]:
# animal.sample(3)

In [None]:
animal.dropna(inplace=True)
animal.isna().sum()

Title       0
Abstract    0
dtype: int64

In [None]:
human.dropna(inplace=True)
human.isna().sum()

Title       0
Abstract    0
dtype: int64

In [None]:
animal['label'] = 0
human['label'] = 1

In [None]:
#Pretty balanced
animal.shape, human.shape

((2151, 3), (2001, 3))

In [None]:
#combine & shuffle the datasets
combined_df = pd.concat([animal, human], axis=0)
shuffled_combined_df = combined_df.sample(frac=1).reset_index(drop=True)

print(shuffled_combined_df.shape)
# shuffled_combined_df.head()

(4152, 3)


**Read in input data**

In [None]:
# rename the columns so that the relevant column names are 'title' and 'abstract'
rename_map = {'Title': 'title', 'Abstract': 'abstract'}
shuffled_combined_df.rename(columns = rename_map, inplace = True)
print("Number of studies in the training dataset: " + str(shuffled_combined_df.shape[0]))


try:
  shuffled_combined_df['title_orig'] = shuffled_combined_df['title']
except Exception as e:
  print(e)
  print("Error- No title detected! Title is needed!")
  raise


# drop any duplicates based on 'title'
shuffled_combined_df.drop_duplicates(subset=['title'], inplace=True)
print("Number of studies in the training dataset after de-dupe: " + str(shuffled_combined_df.shape[0]))

shuffled_combined_df['titleabstract'] = shuffled_combined_df['title'] + " " + shuffled_combined_df['abstract']
shuffled_combined_df['titleabstract'] = shuffled_combined_df['titleabstract'].str.lower()


#sanity check
shuffled_combined_df.columns

Number of studies in the training dataset: 4152
Number of studies in the training dataset after de-dupe: 4148


Index(['title', 'abstract', 'label', 'title_orig', 'titleabstract'], dtype='object')

**Text Preprocessing**

In [None]:
# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_sm")

In [None]:
#lemmatization
def lemmatization(titleabstract):
    doc = nlp(titleabstract)
    lemmalist = [word.lemma_ for word in doc]
    return " ".join(lemmalist)

In [None]:
shuffled_combined_df['lemma'] = shuffled_combined_df['titleabstract'].apply(lemmatization)

In [None]:
# shuffled_combined_df.head()

In [None]:
def remove_stopwords(titleabstract):
    doc = nlp(titleabstract)
    no_stopwords = [word.text for word in doc if not word.is_stop]
    return " ".join(no_stopwords)

In [None]:
shuffled_combined_df['stopwords'] = shuffled_combined_df['lemma'].apply(remove_stopwords)

In [101]:
# shuffled_combined_df.head()

**Train-Test Split: Splits the data into training and test sets.**

In [102]:
# Split the dataset into features (X) and target (y)
X = shuffled_combined_df['stopwords']
y = shuffled_combined_df['label']

# X.head()

# Split the dataset into training and test sets
# create a 80-20 split from it

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)
train_X.shape, test_X.shape

((3318,), (830,))

****

**Initialize the Random Forest classifier**

In [None]:
from math import pi
classifier = Pipeline([
    ('Vectorizer_tfidf', TfidfVectorizer()),
    ('Random Forest', RandomForestClassifier(n_jobs = 1, random_state = 42)) # n_estimators = 100, max_depth = 10, min_samples_split = 2, min_samples_leaf = 1
 ])


# Train the model
classifier.fit(train_X, train_y)

In [None]:
classifier.score(test_X, test_y) * 100

96.02409638554217

**Evaluate the model**

In [108]:
pred = classifier.predict(test_X)
pred[:20]

array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1])

In [107]:
test_y[:20]

149     1
1973    1
238     1
1052    1
308     1
831     0
3897    0
1667    0
70      0
2167    0
2648    0
1906    0
810     0
318     0
179     0
2269    0
3394    1
3355    0
3575    0
3153    1
Name: label, dtype: int64

In [2]:
print(f'Accuracy: {accuracy_score(test_y, pred)}')
print(f'Precision: {precision_score(test_y, pred) *100}')
print(f'Recall: {recall_score(test_y, pred)}')
print(f'F1-Score: {f1_score(test_y, pred)}')
print(f'ROC_AUC: {roc_auc_score(test_y, pred)}')

NameError: name 'test_y' is not defined

In [117]:
print(f'classification Report: {classification_report(test_y, pred)}')

classification Report:               precision    recall  f1-score   support

           0       0.98      0.95      0.96       427
           1       0.94      0.98      0.96       403

    accuracy                           0.96       830
   macro avg       0.96      0.96      0.96       830
weighted avg       0.96      0.96      0.96       830

