### Simple methods

In [None]:
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

In [None]:
# Location of the LitCovid CSV, relative to the notebook's working directory.
litcovid_csv_path = 'drive/MyDrive/Biocreative/Biocreative/litcovid_dataset.csv'
litcovid_dataset = pd.read_csv(litcovid_csv_path)
litcovid_dataset.head()

Unnamed: 0,pmid,journal,title,abstract,keywords,pub_type,authors,doi,label,Case Report,Diagnosis,Epidemic Forecasting,Mechanism,Prevention,Transmission,Treatment
0,32519164,J Thromb Thrombolysis,Potential role for tissue factor in the pathog...,"In December 2019, a new and highly contagious ...",covid-19;il-6;sars-cov-2;tnf-alpha;thrombosis;...,Journal Article;Review,"Bautista-Vargas, Mario;Bonilla-Abadia, Fabio;C...",10.1007/s11239-020-02172-x,Treatment;Mechanism,0,0,0,1,0,0,1
1,32691006,J Tradit Complement Med,Dietary therapy and herbal medicine for COVID-...,"A novel coronavirus disease (COVID-19), transm...",covid-19;coronavirus;dietary therapy;herbal me...,Journal Article;Review,"Panyod, Suraphan;Ho, Chi-Tang;Sheen, Lee-Yan",10.1016/j.jtcme.2020.05.004,Treatment;Prevention,0,0,0,0,1,0,1
2,32858315,J Affect Disord,First report of manic-like symptoms in a COVID...,"BACKGROUND: In December 2019, the novel corona...",cerebrospinal fluid;igg;manic-like symptoms;sa...,Case Reports;Journal Article,"Lu, Shaojia;Wei, Ning;Jiang, Jiajun;Wu, Lingli...",10.1016/j.jad.2020.08.031,Case Report,1,0,0,0,0,0,0
3,32985329,J Dent Res,Epidemiological Investigation of OHCWs with CO...,During the coronavirus disease 2019 (COVID-19)...,dental education;dental public health;infectio...,"Journal Article;Research Support, Non-U.S. Gov't","Meng, L;Ma, B;Cheng, Y;Bian, Z",10.1177/0022034520962087,Prevention,0,0,0,0,1,0,0
4,32812051,J Antimicrob Chemother,The impact of sofosbuvir/daclatasvir or ribavi...,OBJECTIVES: Sofosbuvir and daclatasvir are dir...,,Journal Article;Randomized Controlled Trial;Re...,"Eslami, Gholamali;Mousaviasl, Sajedeh;Radmanes...",10.1093/jac/dkaa331,Treatment,0,0,0,0,0,0,1


In [None]:
# Discard bibliographic metadata and the combined 'label' string; the abstract
# text and the per-topic indicator columns are kept.
metadata_columns = ['pmid', 'journal', 'title', 'keywords', 'pub_type', 'authors', 'doi', 'label']
litcovid_dataset = litcovid_dataset.drop(columns=metadata_columns)
litcovid_dataset.head()

Unnamed: 0,abstract,Case Report,Diagnosis,Epidemic Forecasting,Mechanism,Prevention,Transmission,Treatment
0,"In December 2019, a new and highly contagious ...",0,0,0,1,0,0,1
1,"A novel coronavirus disease (COVID-19), transm...",0,0,0,0,1,0,1
2,"BACKGROUND: In December 2019, the novel corona...",1,0,0,0,0,0,0
3,During the coronavirus disease 2019 (COVID-19)...,0,0,0,0,1,0,0
4,OBJECTIVES: Sofosbuvir and daclatasvir are dir...,0,0,0,0,0,0,1


In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
import sys
import warnings

# `data` is a second name for the same DataFrame object, not a copy: column
# assignments made through `data` are also visible through `litcovid_dataset`.
data = litcovid_dataset

# Suppress all warnings, unless warning options were supplied when the
# interpreter was started (sys.warnoptions is non-empty in that case).
if not sys.warnoptions:
  warnings.simplefilter("ignore")

def cleanHtml(sentence):
  """Replace every HTML-like tag (``<...>``, shortest match) with one space.

  The input is converted with ``str()`` first, so non-string values are
  accepted and returned as their cleaned string representation.
  """
  tag_pattern = '<.*?>'
  return re.sub(tag_pattern, ' ', str(sentence))

def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
  """Strip or blank out punctuation in ``sentence``.

  The characters ``? ! ' " # |`` are deleted. The characters
  ``. , ) ( \\ /`` are each replaced by a space. The result is then
  stripped of leading/trailing whitespace and any remaining newline is
  turned into a space.

  Returns the cleaned string.
  """
  # Inside [...] a '|' is a literal pipe, not an alternation operator, so the
  # characters are listed directly. The pipe is kept in the first class because
  # the previous pattern removed it as well.
  cleaned = re.sub(r'[?!\'"#|]', r'', sentence)
  # '\\' matches a literal backslash; the earlier '\|' only matched a pipe, so
  # backslashes were never replaced.
  cleaned = re.sub(r'[.,)(\\/]', r' ', cleaned)
  cleaned = cleaned.strip()
  cleaned = cleaned.replace("\n", " ")
  return cleaned

def keepAlpha(sentence):
  """Keep only ASCII letters in each whitespace-separated word.

  Within every word, each run of characters other than a-z, A-Z or space is
  replaced by a single space. The processed words are joined with single
  spaces and the result is stripped at both ends.
  """
  letters_only = (re.sub('[^a-z A-Z]+', ' ', token) for token in sentence.split())
  return " ".join(letters_only).strip()

# Normalise every abstract: lower-case, strip HTML-like tags, remove
# punctuation, then drop anything that is not a letter.
data['abstract'] = (
    data['abstract']
    .str.lower()
    .apply(cleanHtml)
    .apply(cleanPunc)
    .apply(keepAlpha)
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# English stop words from NLTK plus a few extra number words and connectives.
stop_words = set(stopwords.words('english'))
stop_words.update(['zero','one','two','three','four','five','six','seven','eight','nine','ten','may','also','across','among','beside','however','yet','within'])

# A stop word is matched as a whole word followed by either one non-word
# character or the end of the text. Without the end-of-text alternative, a stop
# word that is the last token of an abstract was never removed.
# Words are escaped so none can be read as regex syntax, and sorted so the
# compiled pattern is the same on every run (set order is not stable).
re_stop_words = re.compile(
    r"\b(?:" + "|".join(re.escape(word) for word in sorted(stop_words)) + r")(?:\W|$)",
    re.I,
)

def removeStopWords(sentence):
    """Return ``sentence`` with each stop word (and the character after it) replaced by a space."""
    return re_stop_words.sub(" ", sentence)

data['abstract'] = data['abstract'].apply(removeStopWords)

In [None]:
stemmer = SnowballStemmer("english")

def stemming(sentence):
    """Stem every whitespace-separated word and re-join the stems with single spaces."""
    stems = (stemmer.stem(token) for token in sentence.split())
    return " ".join(stems).strip()

data['abstract'] = data['abstract'].apply(stemming)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
train, test = train_test_split(data, random_state=42, test_size=0.30, shuffle=True)
train_text = train['abstract']
test_text = test['abstract']

# Word-level TF-IDF over uni-, bi- and tri-grams with L2-normalised rows.
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')

# Fit on the training text only. Each call to fit() replaces the previously
# learned vocabulary and IDF weights, so fitting on the test text afterwards
# would discard the training fit and leak test-set statistics into the features.
vectorizer.fit(train_text)

# Features come from the shared vectorizer; targets are every column except the text.
x_train = vectorizer.transform(train_text)
y_train = train.drop(labels = ['abstract'], axis=1)
x_test = vectorizer.transform(test_text)
y_test = test.drop(labels = ['abstract'], axis=1)

In [None]:
# One-vs-rest logistic regression: an independent model is fitted and scored
# for each label column in turn.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier

# Single-step pipeline wrapping the one-vs-rest logistic regression.
ovr_logreg = OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)
LogReg_pipeline = Pipeline([('clf', ovr_logreg)])

# Every column after the first is used as a target.
# NOTE(review): assumes 'abstract' is the first column of `data` -- confirm.
categories = data.columns[1:]
for category in categories:
  print('**Processing {} comments...**'.format(category))

  # Refit the pipeline from scratch on this label's training targets.
  LogReg_pipeline.fit(x_train, train[category])

  # Score the held-out rows for the same label.
  prediction = LogReg_pipeline.predict(x_test)
  test_accuracy = accuracy_score(test[category], prediction)
  print('Test accuracy is {}'.format(test_accuracy))
  print("\n")

In [None]:
!pip install scikit-multilearn



In [None]:
# using classifier chains
from skmultilearn.problem_transform import ClassifierChain
from sklearn.naive_bayes import GaussianNB

# initialize classifier chains multi-label classifier
# with a gaussian naive bayes base classifier
# NOTE(review): GaussianNB works on dense arrays; with 1-3-gram TF-IDF features
# the dense conversion may need a lot of memory -- confirm on the full dataset.
classifier = ClassifierChain(GaussianNB())

# train
classifier.fit(x_train, y_train)

# predict on the held-out features (the variable is `x_test`; the name
# `X_test` is not defined in this notebook and raised a NameError)
predictions = classifier.predict(x_test)

# fraction of test rows whose full label set is predicted exactly
accuracy_score(y_test,predictions)