In [2]:
import numpy as np
import pandas as pd
import gensim
import nltk
import re
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [3]:

from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [4]:
train = pd.read_csv("/content/drive/MyDrive/daaaataaa_citae/tsv/train.tsv", sep= '\t',names=["id", "explicit", "text", "label"])
dev = pd.read_csv("/content/drive/MyDrive/daaaataaa_citae/tsv/dev.tsv", sep= '\t')
test = pd.read_csv("/content/drive/MyDrive/daaaataaa_citae/tsv/test.tsv", sep= '\t', names=["id", "explicit", "text", "label"])

In [5]:
train.drop(train.columns[[0, 1]], axis = 1, inplace = True)
test.drop(test.columns[[0, 1]], axis = 1, inplace = True)

In [6]:
train.head()

Unnamed: 0,text,label
0,"However, how frataxin interacts with the Fe-S ...",background
1,"In the study by Hickey et al. (2012), spikes w...",background
2,"The drug also reduces catecholamine secretion,...",background
3,By clustering with lowly aggressive close kin ...,background
4,Ophthalmic symptoms are rare manifestations of...,background


In [7]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
#function to remove punctuation
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree
#storing the puntuation free text
train['text1']= train['text'].apply(lambda x:remove_punctuation(x))
test['text'] = test['text'].apply(lambda x:remove_punctuation(x))
train['labels'] = train['label'].replace(['background','method','result'],[0,1,2])
test['label'] = test['label'].replace(['background','method','result'],[0,1,2])
test.head()

Unnamed: 0,text,label
0,Chapel as well as X10 2 UPC 3 CoArray Fortran...,0
1,In addition the result of the present study su...,2
2,Several instruments that more specifically add...,0
3,Organotypic hippocampal slice culturesnInterfa...,1
4,Activated PBMC are the basis of the standard P...,0


In [10]:
#storing the lower cased text
train['text1']= train['text1'].apply(lambda x: x.lower())
test['text'] = test['text'].apply(lambda x: x.lower())

In [11]:
train.head()

Unnamed: 0,text,label,text1,labels
0,"However, how frataxin interacts with the Fe-S ...",background,however how frataxin interacts with the fes cl...,0
1,"In the study by Hickey et al. (2012), spikes w...",background,in the study by hickey et al 2012 spikes were ...,0
2,"The drug also reduces catecholamine secretion,...",background,the drug also reduces catecholamine secretion ...,0
3,By clustering with lowly aggressive close kin ...,background,by clustering with lowly aggressive close kin ...,0
4,Ophthalmic symptoms are rare manifestations of...,background,ophthalmic symptoms are rare manifestations of...,0


In [12]:
test.head()

Unnamed: 0,text,label
0,chapel as well as x10 2 upc 3 coarray fortran...,0
1,in addition the result of the present study su...,2
2,several instruments that more specifically add...,0
3,organotypic hippocampal slice culturesninterfa...,1
4,activated pbmc are the basis of the standard p...,0


In [13]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
#function for tokenization
def tokenize(word):
   word = nltk.word_tokenize(word)
   return word

In [15]:
#storing the tokenized text
train['tokenized']= train['text1'].apply(lambda x: tokenize(x))

In [16]:
#storing the tokenized text
test['tokenized']= test['text'].apply(lambda x: tokenize(x))

In [17]:
#function for remove stopwords
def remove_stopwords(texts):
    output= [i for i in texts if i not in stopwords]
    return output

In [18]:
#removed stop words from tokenized text
train['tokenized']= train['tokenized'].apply(lambda x:remove_stopwords(x))

In [19]:
#removed stop words from tokenized text
test['tokenized']= test['tokenized'].apply(lambda x:remove_stopwords(x))

In [20]:
from nltk.stem import WordNetLemmatizer
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()

In [21]:
#download Wordnet through nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
#function for Lemmatization
def lemmatizer(text):
  lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
  return lemm_text
train['stemmed']=train['tokenized'].apply(lambda x:lemmatizer(x))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [22]:
test['stemmed']=test['tokenized'].apply(lambda x:lemmatizer(x))

In [23]:
train.head()

Unnamed: 0,text,label,text1,labels,tokenized,stemmed
0,"However, how frataxin interacts with the Fe-S ...",background,however how frataxin interacts with the fes cl...,0,"[however, frataxin, interacts, fes, cluster, b...","[however, frataxin, interacts, fe, cluster, bi..."
1,"In the study by Hickey et al. (2012), spikes w...",background,in the study by hickey et al 2012 spikes were ...,0,"[study, hickey, et, al, 2012, spikes, sampled,...","[study, hickey, et, al, 2012, spike, sampled, ..."
2,"The drug also reduces catecholamine secretion,...",background,the drug also reduces catecholamine secretion ...,0,"[drug, also, reduces, catecholamine, secretion...","[drug, also, reduces, catecholamine, secretion..."
3,By clustering with lowly aggressive close kin ...,background,by clustering with lowly aggressive close kin ...,0,"[clustering, lowly, aggressive, close, kin, ki...","[clustering, lowly, aggressive, close, kin, ki..."
4,Ophthalmic symptoms are rare manifestations of...,background,ophthalmic symptoms are rare manifestations of...,0,"[ophthalmic, symptoms, rare, manifestations, i...","[ophthalmic, symptom, rare, manifestation, int..."


In [24]:
x_test = test.text.tolist()

In [25]:
test.head()

Unnamed: 0,text,label,tokenized,stemmed
0,chapel as well as x10 2 upc 3 coarray fortran...,0,"[chapel, well, x10, 2, upc, 3, coarray, fortra...","[chapel, well, x10, 2, upc, 3, coarray, fortra..."
1,in addition the result of the present study su...,2,"[addition, result, present, study, supports, p...","[addition, result, present, study, support, pr..."
2,several instruments that more specifically add...,0,"[several, instruments, specifically, address, ...","[several, instrument, specifically, address, p..."
3,organotypic hippocampal slice culturesninterfa...,1,"[organotypic, hippocampal, slice, culturesnint...","[organotypic, hippocampal, slice, culturesnint..."
4,activated pbmc are the basis of the standard p...,0,"[activated, pbmc, basis, standard, pbmc, blast...","[activated, pbmc, basis, standard, pbmc, blast..."


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
td = TfidfVectorizer(max_features = 4500)
X = td.fit_transform(train['text1']).toarray()

In [28]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [29]:
td1 = TfidfVectorizer(max_features = 4500)
X_test = td1.fit_transform(test['text']).toarray()

In [30]:
y = train['labels'].tolist()

In [31]:
Y_test = test['label'].tolist()

In [34]:
#SGDClassifier
from sklearn.linear_model import SGDClassifier
lr = SGDClassifier(loss='log', penalty='l1')
lr.fit(X, y)
preds = lr.predict(X_test)
print(classification_report(Y_test, preds))


print('accuracy score_SGD:', accuracy_score(Y_test, preds))
print('recall score_SGD:',recall_score(Y_test, preds,average = 'macro'))
print('precision score_SGD:',precision_score(Y_test, preds,average = 'macro'))
print('f1 score_SGD:',f1_score(Y_test, preds,average = 'macro'))

              precision    recall  f1-score   support

           0       0.54      0.98      0.69       997
           1       0.38      0.02      0.03       605
           2       0.33      0.01      0.02       259

    accuracy                           0.53      1861
   macro avg       0.42      0.34      0.25      1861
weighted avg       0.46      0.53      0.38      1861

accuracy score_SGD: 0.5330467490596453
recall score_SGD: 0.3359498878054545
precision score_SGD: 0.4162627521580575
f1 score_SGD: 0.24779434541161818


In [36]:
#KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
predictions = knn.predict(X_test)


print(classification_report(Y_test, predictions))
print('accuracy score_KNN:', accuracy_score(Y_test, predictions))
print('recall score_KNN:',recall_score(Y_test, predictions,average = 'macro'))
print('precision score_KNN:',precision_score(Y_test, predictions,average = 'macro'))
print('f1 score_KNN:',f1_score(Y_test, predictions,average = 'macro'))

              precision    recall  f1-score   support

           0       0.54      0.75      0.63       997
           1       0.36      0.24      0.28       605
           2       0.14      0.03      0.06       259

    accuracy                           0.48      1861
   macro avg       0.34      0.34      0.32      1861
weighted avg       0.42      0.48      0.44      1861

accuracy score_KNN: 0.4836109618484686
recall score_KNN: 0.3404544744564804
precision score_KNN: 0.3447983849343812
f1 score_KNN: 0.32168974825473


In [37]:
#Perceptron
from sklearn.datasets import load_digits
from sklearn.linear_model import Perceptron
clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(X,y)
y_pred = clf.predict(X_test)



print(classification_report(Y_test, y_pred))
print('accuracy score_percep:', accuracy_score(Y_test, y_pred))
print('recall score_percep:',recall_score(Y_test, y_pred,average = 'macro'))
print('precision score_percep:',precision_score(Y_test, y_pred,average = 'macro'))
print('f1 score_percep:',f1_score(Y_test, y_pred,average = 'macro'))

              precision    recall  f1-score   support

           0       0.54      0.84      0.66       997
           1       0.35      0.18      0.24       605
           2       0.00      0.00      0.00       259

    accuracy                           0.51      1861
   macro avg       0.30      0.34      0.30      1861
weighted avg       0.40      0.51      0.43      1861

accuracy score_percep: 0.507791509940892
recall score_percep: 0.33934365078707196
precision score_percep: 0.2968831168831169
f1 score_percep: 0.2979421702228122


In [39]:
#SVM classifier
mod = SVC(kernel = 'linear', C=1)
mod.fit(X,y)
y_preds = clf.predict(X_test)

print(classification_report(Y_test, y_preds))
print('accuracy score_svm:', accuracy_score(Y_test, y_preds))
print('recall score_svm:',recall_score(Y_test, y_preds,average = 'macro'))
print('precision score_svm:',precision_score(Y_test, y_preds,average = 'macro'))
print('f1 score_svm:',f1_score(Y_test, y_preds,average = 'macro'))

              precision    recall  f1-score   support

           0       0.54      0.84      0.66       997
           1       0.35      0.18      0.24       605
           2       0.00      0.00      0.00       259

    accuracy                           0.51      1861
   macro avg       0.30      0.34      0.30      1861
weighted avg       0.40      0.51      0.43      1861

accuracy score_svm: 0.507791509940892
recall score_svm: 0.33934365078707196
precision score_svm: 0.2968831168831169
f1 score_svm: 0.2979421702228122
