#Reference

https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

#Phrase

In [1]:
!pip install simpletransformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [13]:
np.random.seed(500)

In [44]:
def load_dataset(file_name):
    import pandas as pd
    import json
    
    df = []
    with open(file_name,'r', encoding='utf-8') as f:
        for i in f:
            i = eval(i)
            tweet = i['postText']
            label = i['tags']
            
            assert len(tweet) == 1
            tweet = tweet[0]
            
            assert len(label) == 1
            label = label[0]
            
            if label not in ['phrase', 'phrases', 'passage', 'multi']:
                print(label)
                
            assert label in ['phrase', 'phrases', 'passage', 'multi']
            
            df += [{'text': tweet, 'labels': (label == 'phrase' or label == 'phrases')}]

    return pd.DataFrame(df)
train_dataset = load_dataset('train.jsonl')
validation_dataset = load_dataset('validation.jsonl')

In [15]:
train_dataset.head()

Unnamed: 0,text,labels
0,"Wes Welker Wanted Dinner With Tom Brady, But P...",False
1,NASA sets date for full recovery of ozone hole,True
2,This is what makes employees happy -- and it's...,True
3,Passion is overrated — 7 work habits you need ...,False
4,The perfect way to cook rice so that it's perf...,True


In [16]:
validation_dataset.head()

Unnamed: 0,text,labels
0,Five Nights at Freddy’s Sequel Delayed for Wei...,False
1,Why Arizona Sheriff Joe Arpaio’s fate could ha...,False
2,Here’s how much you should be tipping your hai...,True
3,"""Harry Potter"" alums reunite for new movie",False
4,A man swallowed a microSD card and you won't b...,False


In [17]:
train_dataset['text'].dropna(inplace=True)
train_dataset['text'] = [entry.lower() for entry in train_dataset['text']]
train_dataset['text']= [word_tokenize(entry) for entry in train_dataset['text']]
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(train_dataset['text']):
    Final_words = []
    word_Lemmatized = WordNetLemmatizer()
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    train_dataset.loc[index,'text_final'] = str(Final_words)

In [18]:
validation_dataset['text'].dropna(inplace=True)
validation_dataset['text'] = [entry.lower() for entry in validation_dataset['text']]
validation_dataset['text']= [word_tokenize(entry) for entry in validation_dataset['text']]
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(validation_dataset['text']):
    Final_words = []
    word_Lemmatized = WordNetLemmatizer()
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    validation_dataset.loc[index,'text_final'] = str(Final_words)

In [19]:
Train_X, Test_X, Train_Y, Test_Y = train_dataset['text_final'],validation_dataset['text_final'],train_dataset['labels'],validation_dataset['labels']

In [20]:
Train_X.head()

0    ['wes', 'welker', 'want', 'dinner', 'tom', 'br...
1    ['nasa', 'set', 'date', 'full', 'recovery', 'o...
2            ['make', 'employee', 'happy', 'paycheck']
3    ['passion', 'overrate', 'work', 'habit', 'need...
4    ['perfect', 'way', 'cook', 'rice', 'perfectly'...
Name: text_final, dtype: object

In [21]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [22]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(pd.concat([train_dataset['text_final'],validation_dataset['text_final']]))
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [23]:
print(Tfidf_vect.vocabulary_)



In [24]:
print(Train_X_Tfidf)

  (0, 4866)	0.36140951627409035
  (0, 4862)	0.26681309018944405
  (0, 4861)	0.36140951627409035
  (0, 4810)	0.20801348707138728
  (0, 4470)	0.28612205584927547
  (0, 3341)	0.34437238667907366
  (0, 3004)	0.36140951627409035
  (0, 2183)	0.2690849262542588
  (0, 1446)	0.3322843508591913
  (0, 637)	0.3322843508591913
  (1, 3826)	0.3952314576815434
  (1, 3442)	0.4756948917078287
  (1, 2852)	0.42651594573361046
  (1, 2139)	0.3952314576815434
  (1, 1965)	0.4187655034738572
  (1, 1308)	0.3225184659150115
  (2, 3009)	0.6273580300407333
  (2, 2602)	0.2958519304484588
  (2, 2080)	0.48213854935885586
  (2, 1676)	0.5351971194013183
  (3, 4936)	0.43967956144531756
  (3, 2869)	0.43339259418065684
  (3, 2230)	0.5562610639603557
  (3, 2064)	0.5562610639603557
  (4, 4840)	0.24899370077565605
  :	:
  (3196, 2944)	0.29406853291565777
  (3196, 2850)	0.40178523228390006
  (3196, 2075)	0.34276169051724903
  (3196, 1616)	0.2842308562402412
  (3196, 1131)	0.32288417612709586
  (3196, 915)	0.3876819113800853
 

In [25]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
predictions_NB = Naive.predict(Test_X_Tfidf)
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  65.375


In [26]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf)
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  65.75


In [27]:
logreg = LogisticRegression()
logreg.fit(Train_X_Tfidf,Train_Y)
predictions_log = logreg.predict(Test_X_Tfidf)
print("Log Accuracy Score -> ",accuracy_score(predictions_log, Test_Y)*100)

Log Accuracy Score ->  65.5


Bert model

In [35]:
!rm -rf outputs/

In [36]:
import sklearn
from simpletransformers.classification import ClassificationModel, ClassificationArgs
model_args = ClassificationArgs(num_train_epochs=5)
model = ClassificationModel("bert", "bert-base-uncased", args = model_args,num_labels=2,use_cuda=True)

model.train_model(train_dataset, acc=sklearn.metrics.accuracy_score)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  0%|          | 0/3200 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

(2000, 0.42434610468149186)

In [37]:
result, model_outputs, wrong_predictions = model.eval_model(validation_dataset)

  0%|          | 0/800 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/100 [00:00<?, ?it/s]

In [38]:
predictions = np.argmax(model_outputs,axis=1)

In [39]:
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(list(validation_dataset['labels']), predictions)*100)


Accuracy: 72.25


roBERTa

In [45]:
!rm -rf outputs/

In [46]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
model_args = ClassificationArgs(num_train_epochs=5)
import sklearn

model = ClassificationModel("roberta", "roberta-base", args = model_args,num_labels=2,use_cuda=True)

model.train_model(train_dataset, acc=sklearn.metrics.accuracy_score)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

  0%|          | 0/3200 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

(2000, 0.4427461495399475)

In [47]:
result, model_outputs, wrong_predictions = model.eval_model(validation_dataset)

  0%|          | 0/800 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/100 [00:00<?, ?it/s]

In [48]:
import numpy as np
predictions = np.argmax(model_outputs,axis=1)
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(list(validation_dataset['labels']), predictions)*100)

Accuracy: 77.625


DeBERTa

In [49]:
!rm -rf outputs/

In [50]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
model_args = ClassificationArgs(num_train_epochs=5)


model = ClassificationModel("deberta", "microsoft/deberta-base", args = model_args,num_labels=2,use_cuda=True)

model.train_model(train_dataset, acc=sklearn.metrics.accuracy_score)

Downloading (…)lve/main/config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'classifi

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

  0%|          | 0/3200 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

(2000, 0.3784266908561694)

In [51]:
result, model_outputs, wrong_predictions = model.eval_model(validation_dataset)

  0%|          | 0/800 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/100 [00:00<?, ?it/s]

In [52]:
import numpy as np
predictions = np.argmax(model_outputs,axis=1)
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(list(validation_dataset['labels']), predictions)*100)

Accuracy: 77.125


#Passage

In [53]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [54]:
np.random.seed(500)

In [78]:
def load_dataset(file_name):
    import pandas as pd
    import json
    
    df = []
    with open(file_name,'r', encoding='utf-8') as f:
        for i in f:
            i = eval(i)
            tweet = i['postText']
            label = i['tags']
            
            assert len(tweet) == 1
            tweet = tweet[0]
            
            assert len(label) == 1
            label = label[0]
            
            if label not in ['phrase', 'phrases', 'passage', 'multi']:
                print(label)
                
            assert label in ['phrase', 'phrases', 'passage', 'multi']
            
            df += [{'text': tweet, 'labels': (label == 'passage')}]

    return pd.DataFrame(df)
train_dataset = load_dataset('train.jsonl')
validation_dataset = load_dataset('validation.jsonl')

In [79]:
train_dataset.head()

Unnamed: 0,text,labels
0,"Wes Welker Wanted Dinner With Tom Brady, But P...",True
1,NASA sets date for full recovery of ozone hole,False
2,This is what makes employees happy -- and it's...,False
3,Passion is overrated — 7 work habits you need ...,False
4,The perfect way to cook rice so that it's perf...,False


In [80]:
train_dataset['text'].dropna(inplace=True)
train_dataset['text'] = [entry.lower() for entry in train_dataset['text']]
train_dataset['text']= [word_tokenize(entry) for entry in train_dataset['text']]
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(train_dataset['text']):
    Final_words = []
    word_Lemmatized = WordNetLemmatizer()
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    train_dataset.loc[index,'text_final'] = str(Final_words)

In [81]:
validation_dataset['text'].dropna(inplace=True)
validation_dataset['text'] = [entry.lower() for entry in validation_dataset['text']]
validation_dataset['text']= [word_tokenize(entry) for entry in validation_dataset['text']]
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(validation_dataset['text']):
    Final_words = []
    word_Lemmatized = WordNetLemmatizer()
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    validation_dataset.loc[index,'text_final'] = str(Final_words)

In [82]:
Train_X, Test_X, Train_Y, Test_Y = train_dataset['text_final'],validation_dataset['text_final'],train_dataset['labels'],validation_dataset['labels']

In [83]:
Train_X.head()

0    ['wes', 'welker', 'want', 'dinner', 'tom', 'br...
1    ['nasa', 'set', 'date', 'full', 'recovery', 'o...
2            ['make', 'employee', 'happy', 'paycheck']
3    ['passion', 'overrate', 'work', 'habit', 'need...
4    ['perfect', 'way', 'cook', 'rice', 'perfectly'...
Name: text_final, dtype: object

In [61]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [62]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(pd.concat([train_dataset['text_final'],validation_dataset['text_final']]))
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [63]:
print(Tfidf_vect.vocabulary_)



In [64]:
print(Train_X_Tfidf)

  (0, 4866)	0.36140951627409035
  (0, 4862)	0.26681309018944405
  (0, 4861)	0.36140951627409035
  (0, 4810)	0.20801348707138728
  (0, 4470)	0.28612205584927547
  (0, 3341)	0.34437238667907366
  (0, 3004)	0.36140951627409035
  (0, 2183)	0.2690849262542588
  (0, 1446)	0.3322843508591913
  (0, 637)	0.3322843508591913
  (1, 3826)	0.3952314576815434
  (1, 3442)	0.4756948917078287
  (1, 2852)	0.42651594573361046
  (1, 2139)	0.3952314576815434
  (1, 1965)	0.4187655034738572
  (1, 1308)	0.3225184659150115
  (2, 3009)	0.6273580300407333
  (2, 2602)	0.2958519304484588
  (2, 2080)	0.48213854935885586
  (2, 1676)	0.5351971194013183
  (3, 4936)	0.43967956144531756
  (3, 2869)	0.43339259418065684
  (3, 2230)	0.5562610639603557
  (3, 2064)	0.5562610639603557
  (4, 4840)	0.24899370077565605
  :	:
  (3196, 2944)	0.29406853291565777
  (3196, 2850)	0.40178523228390006
  (3196, 2075)	0.34276169051724903
  (3196, 1616)	0.2842308562402412
  (3196, 1131)	0.32288417612709586
  (3196, 915)	0.3876819113800853
 

In [65]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
predictions_NB = Naive.predict(Test_X_Tfidf)
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  62.5


In [66]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf)
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  65.0


In [67]:
logreg = LogisticRegression()
logreg.fit(Train_X_Tfidf,Train_Y)
predictions_log = logreg.predict(Test_X_Tfidf)
print("Log Accuracy Score -> ",accuracy_score(predictions_log, Test_Y)*100)

Log Accuracy Score ->  64.625


BERTa

In [84]:
!rm -rf outputs/

In [85]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
model_args = ClassificationArgs(num_train_epochs=5)


model = ClassificationModel("bert", "bert-base-uncased", args = model_args,num_labels=2,use_cuda=True)

model.train_model(train_dataset, acc=sklearn.metrics.accuracy_score)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

  0%|          | 0/3200 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

(2000, 0.4717913067340851)

In [86]:
result, model_outputs, wrong_predictions = model.eval_model(validation_dataset)

  0%|          | 0/800 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/100 [00:00<?, ?it/s]

In [87]:
import numpy as np
predictions = np.argmax(model_outputs,axis=1)
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(list(validation_dataset['labels']), predictions)*100) 

Accuracy: 70.75


roBERTa

In [88]:
!rm -rf outputs/

In [89]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
model_args = ClassificationArgs(num_train_epochs=5)


model = ClassificationModel("roberta", "roberta-base", args = model_args,num_labels=2,use_cuda=True)

model.train_model(train_dataset, acc=sklearn.metrics.accuracy_score)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

  0%|          | 0/3200 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

(2000, 0.6771468334197998)

In [90]:
result, model_outputs, wrong_predictions = model.eval_model(validation_dataset)

  0%|          | 0/800 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/100 [00:00<?, ?it/s]

In [91]:
import numpy as np
predictions = np.argmax(model_outputs,axis=1)
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(list(validation_dataset['labels']), predictions)*100)

Accuracy: 59.75


DeBERTa

In [92]:
!rm -rf outputs/

In [93]:
import sklearn
from simpletransformers.classification import ClassificationModel, ClassificationArgs
model_args = ClassificationArgs(num_train_epochs=5)


model = ClassificationModel("deberta", "microsoft/deberta-base", args = model_args,num_labels=2,use_cuda=True)

model.train_model(train_dataset, acc=sklearn.metrics.accuracy_score)

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'classifi

  0%|          | 0/3200 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

(2000, 0.6791068793237209)

In [94]:
result, model_outputs, wrong_predictions = model.eval_model(validation_dataset)

  0%|          | 0/800 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/100 [00:00<?, ?it/s]

In [95]:
import numpy as np
predictions = np.argmax(model_outputs,axis=1)
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(list(validation_dataset['labels']), predictions)*100)

Accuracy: 59.75


#Multi

In [96]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [97]:
np.random.seed(500)

In [98]:
def load_dataset(file_name):
    import pandas as pd
    import json
    
    df = []
    with open(file_name,'r', encoding='utf-8') as f:
        for i in f:
            i = eval(i)
            tweet = i['postText']
            label = i['tags']
            
            assert len(tweet) == 1
            tweet = tweet[0]
            
            assert len(label) == 1
            label = label[0]
            
            if label not in ['phrase', 'phrases', 'passage', 'multi']:
                print(label)
                
            assert label in ['phrase', 'phrases', 'passage', 'multi']
            
            df += [{'text': tweet, 'labels': (label == 'multi')}]

    return pd.DataFrame(df) 
train_dataset = load_dataset('train.jsonl')
validation_dataset = load_dataset('validation.jsonl')

In [99]:
train_dataset.head()

Unnamed: 0,text,labels
0,"Wes Welker Wanted Dinner With Tom Brady, But P...",False
1,NASA sets date for full recovery of ozone hole,False
2,This is what makes employees happy -- and it's...,False
3,Passion is overrated — 7 work habits you need ...,True
4,The perfect way to cook rice so that it's perf...,False


In [100]:
train_dataset['text'].dropna(inplace=True)
train_dataset['text'] = [entry.lower() for entry in train_dataset['text']]
train_dataset['text']= [word_tokenize(entry) for entry in train_dataset['text']]
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(train_dataset['text']):
    Final_words = []
    word_Lemmatized = WordNetLemmatizer()
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    train_dataset.loc[index,'text_final'] = str(Final_words)

In [101]:
validation_dataset['text'].dropna(inplace=True)
validation_dataset['text'] = [entry.lower() for entry in validation_dataset['text']]
validation_dataset['text']= [word_tokenize(entry) for entry in validation_dataset['text']]
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(validation_dataset['text']):
    Final_words = []
    word_Lemmatized = WordNetLemmatizer()
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    validation_dataset.loc[index,'text_final'] = str(Final_words)

In [102]:
Train_X, Test_X, Train_Y, Test_Y = train_dataset['text_final'],validation_dataset['text_final'],train_dataset['labels'],validation_dataset['labels']

In [103]:
Train_X.head()

0    ['wes', 'welker', 'want', 'dinner', 'tom', 'br...
1    ['nasa', 'set', 'date', 'full', 'recovery', 'o...
2            ['make', 'employee', 'happy', 'paycheck']
3    ['passion', 'overrate', 'work', 'habit', 'need...
4    ['perfect', 'way', 'cook', 'rice', 'perfectly'...
Name: text_final, dtype: object

In [104]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [105]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(pd.concat([train_dataset['text_final'],validation_dataset['text_final']]))
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [106]:
print(Tfidf_vect.vocabulary_)



In [107]:
print(Train_X_Tfidf)

  (0, 4866)	0.36140951627409035
  (0, 4862)	0.26681309018944405
  (0, 4861)	0.36140951627409035
  (0, 4810)	0.20801348707138728
  (0, 4470)	0.28612205584927547
  (0, 3341)	0.34437238667907366
  (0, 3004)	0.36140951627409035
  (0, 2183)	0.2690849262542588
  (0, 1446)	0.3322843508591913
  (0, 637)	0.3322843508591913
  (1, 3826)	0.3952314576815434
  (1, 3442)	0.4756948917078287
  (1, 2852)	0.42651594573361046
  (1, 2139)	0.3952314576815434
  (1, 1965)	0.4187655034738572
  (1, 1308)	0.3225184659150115
  (2, 3009)	0.6273580300407333
  (2, 2602)	0.2958519304484588
  (2, 2080)	0.48213854935885586
  (2, 1676)	0.5351971194013183
  (3, 4936)	0.43967956144531756
  (3, 2869)	0.43339259418065684
  (3, 2230)	0.5562610639603557
  (3, 2064)	0.5562610639603557
  (4, 4840)	0.24899370077565605
  :	:
  (3196, 2944)	0.29406853291565777
  (3196, 2850)	0.40178523228390006
  (3196, 2075)	0.34276169051724903
  (3196, 1616)	0.2842308562402412
  (3196, 1131)	0.32288417612709586
  (3196, 915)	0.3876819113800853
 

In [108]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
predictions_NB = Naive.predict(Test_X_Tfidf)
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  82.125


In [109]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf)
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  82.125


In [110]:
logreg = LogisticRegression()
logreg.fit(Train_X_Tfidf,Train_Y)
predictions_log = logreg.predict(Test_X_Tfidf)
print("Log Accuracy Score -> ",accuracy_score(predictions_log, Test_Y)*100)

Log Accuracy Score ->  82.5


BERTa

In [111]:
!rm -rf outputs/

In [113]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
model_args = ClassificationArgs(num_train_epochs=5)


model = ClassificationModel("bert", "bert-base-uncased", args = model_args,num_labels=2,use_cuda=True)

model.train_model(train_dataset, acc=sklearn.metrics.accuracy_score)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

  0%|          | 0/3200 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

(2000, 0.31672625148296357)

In [114]:
result, model_outputs, wrong_predictions = model.eval_model(validation_dataset)
import numpy as np
predictions = np.argmax(model_outputs,axis=1)
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(list(validation_dataset['labels']), predictions)*100)

  0%|          | 0/800 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/100 [00:00<?, ?it/s]

Accuracy: 85.5


roBERTa

In [115]:
!rm -rf outputs/

In [116]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
model_args = ClassificationArgs(num_train_epochs=5)


model = ClassificationModel("roberta", "roberta-base", args = model_args,num_labels=2,use_cuda=True)

model.train_model(train_dataset, acc=sklearn.metrics.accuracy_score)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

  0%|          | 0/3200 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

(2000, 0.40514125394821165)

In [117]:
result, model_outputs, wrong_predictions = model.eval_model(validation_dataset)

  0%|          | 0/800 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/100 [00:00<?, ?it/s]

In [118]:
import numpy as np
predictions = np.argmax(model_outputs,axis=1)
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(list(validation_dataset['labels']), predictions)*100)

Accuracy: 87.125


DeBERTa

In [119]:
!rm -rf outputs/

In [120]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
model_args = ClassificationArgs(num_train_epochs=5)


model = ClassificationModel("deberta", "microsoft/deberta-base", args = model_args,num_labels=2,use_cuda=True)

model.train_model(train_dataset, acc=sklearn.metrics.accuracy_score)

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'classifi

  0%|          | 0/3200 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

(2000, 0.361561812158674)

In [121]:
result, model_outputs, wrong_predictions = model.eval_model(validation_dataset)

  0%|          | 0/800 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/100 [00:00<?, ?it/s]

In [122]:
import numpy as np
predictions = np.argmax(model_outputs,axis=1)
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(list(validation_dataset['labels']), predictions)*100)

Accuracy: 85.625


#MultiClass

In [2]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
np.random.seed(500)

In [4]:
def load_dataset(file_name):
    import pandas as pd
    import json
    
    df = []
    with open(file_name,'r', encoding='utf-8') as f:
        for i in f:
            i = eval(i)
            tweet = i['postText']
            label = i['tags']
            
            assert len(tweet) == 1
            tweet = tweet[0]
            
            assert len(label) == 1
            label = label[0]
            
            if label not in ['phrase', 'phrases', 'passage', 'multi']:
                print(label)
                
            assert label in ['phrase', 'phrases', 'passage', 'multi']
            
            df += [{'text': tweet, 'labels': label}]

    return pd.DataFrame(df)
train_dataset = load_dataset('train.jsonl')
validation_dataset = load_dataset('validation.jsonl')

In [127]:
train_dataset.head()

Unnamed: 0,text,labels
0,"Wes Welker Wanted Dinner With Tom Brady, But P...",passage
1,NASA sets date for full recovery of ozone hole,phrase
2,This is what makes employees happy -- and it's...,phrase
3,Passion is overrated — 7 work habits you need ...,multi
4,The perfect way to cook rice so that it's perf...,phrase


In [128]:
train_dataset['text'].dropna(inplace=True)
train_dataset['text'] = [entry.lower() for entry in train_dataset['text']]
train_dataset['text']= [word_tokenize(entry) for entry in train_dataset['text']]
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(train_dataset['text']):
    Final_words = []
    word_Lemmatized = WordNetLemmatizer()
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    train_dataset.loc[index,'text_final'] = str(Final_words)

In [129]:
validation_dataset['text'].dropna(inplace=True)
validation_dataset['text'] = [entry.lower() for entry in validation_dataset['text']]
validation_dataset['text']= [word_tokenize(entry) for entry in validation_dataset['text']]
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(validation_dataset['text']):
    Final_words = []
    word_Lemmatized = WordNetLemmatizer()
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    validation_dataset.loc[index,'text_final'] = str(Final_words)

In [130]:
Train_X, Test_X, Train_Y, Test_Y = train_dataset['text_final'],validation_dataset['text_final'],train_dataset['labels'],validation_dataset['labels']

In [131]:
Train_X.head()

0    ['wes', 'welker', 'want', 'dinner', 'tom', 'br...
1    ['nasa', 'set', 'date', 'full', 'recovery', 'o...
2            ['make', 'employee', 'happy', 'paycheck']
3    ['passion', 'overrate', 'work', 'habit', 'need...
4    ['perfect', 'way', 'cook', 'rice', 'perfectly'...
Name: text_final, dtype: object

In [132]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [133]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(pd.concat([train_dataset['text_final'],validation_dataset['text_final']]))
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [134]:
print(Tfidf_vect.vocabulary_)



In [135]:
print(Train_X_Tfidf)

  (0, 4866)	0.36140951627409035
  (0, 4862)	0.26681309018944405
  (0, 4861)	0.36140951627409035
  (0, 4810)	0.20801348707138728
  (0, 4470)	0.28612205584927547
  (0, 3341)	0.34437238667907366
  (0, 3004)	0.36140951627409035
  (0, 2183)	0.2690849262542588
  (0, 1446)	0.3322843508591913
  (0, 637)	0.3322843508591913
  (1, 3826)	0.3952314576815434
  (1, 3442)	0.4756948917078287
  (1, 2852)	0.42651594573361046
  (1, 2139)	0.3952314576815434
  (1, 1965)	0.4187655034738572
  (1, 1308)	0.3225184659150115
  (2, 3009)	0.6273580300407333
  (2, 2602)	0.2958519304484588
  (2, 2080)	0.48213854935885586
  (2, 1676)	0.5351971194013183
  (3, 4936)	0.43967956144531756
  (3, 2869)	0.43339259418065684
  (3, 2230)	0.5562610639603557
  (3, 2064)	0.5562610639603557
  (4, 4840)	0.24899370077565605
  :	:
  (3196, 2944)	0.29406853291565777
  (3196, 2850)	0.40178523228390006
  (3196, 2075)	0.34276169051724903
  (3196, 1616)	0.2842308562402412
  (3196, 1131)	0.32288417612709586
  (3196, 915)	0.3876819113800853
 

In [136]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
predictions_NB = Naive.predict(Test_X_Tfidf)
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  53.75


In [137]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf)
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  56.375


In [138]:
logreg = LogisticRegression()
logreg.fit(Train_X_Tfidf,Train_Y)
predictions_log = logreg.predict(Test_X_Tfidf)
print("Log Accuracy Score -> ",accuracy_score(predictions_log, Test_Y)*100)

Log Accuracy Score ->  55.875


In [5]:
def multifun(a):
  if a =='passage':
    return 1
  elif a == 'phrase' or a=='phrases':
    return 0
  else:
    return 2  

In [None]:
train_dataset.drop('text_final', inplace=True, axis=1)

In [158]:
validation_dataset.drop('text_final', inplace=True, axis=1)

In [6]:
train_dataset['labels'] = train_dataset['labels'].apply(multifun)

In [7]:
validation_dataset['labels'] = validation_dataset['labels'].apply(multifun)

BERTa

In [12]:
!rm -rf outputs/

In [13]:
import sklearn
from simpletransformers.classification import ClassificationModel, ClassificationArgs
model_args = ClassificationArgs(num_train_epochs=5)
model = ClassificationModel("bert", "bert-base-cased", args = model_args,num_labels=3,use_cuda=True)
model.train_model(train_dataset, acc=sklearn.metrics.accuracy_score)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

  0%|          | 0/3200 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

(2000, 0.4700396272316575)

In [14]:
result, model_outputs, wrong_predictions = model.eval_model(validation_dataset)

  0%|          | 0/800 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/100 [00:00<?, ?it/s]

In [15]:
import numpy as np
predictions = np.argmax(model_outputs,axis=1)
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(list(validation_dataset['labels']), predictions)*100)

Accuracy: 68.75


roBERTa

In [8]:
!rm -rf outputs/
!rm -rf runs/
!rm -rf chache_dir/

In [10]:
import sklearn
from simpletransformers.classification import ClassificationModel, ClassificationArgs
model_args = ClassificationArgs(num_train_epochs=5)
model = ClassificationModel("roberta", "roberta-base", args = model_args,num_labels=3,use_cuda=True)
model.train_model(train_dataset, acc=sklearn.metrics.accuracy_score)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

  0%|          | 0/3200 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

(2000, 1.0433710861206054)

In [11]:
result, model_outputs, wrong_predictions = model.eval_model(validation_dataset)

  0%|          | 0/800 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/100 [00:00<?, ?it/s]

In [12]:
import numpy as np
predictions = np.argmax(model_outputs,axis=1)
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(list(validation_dataset['labels']), predictions)*100)

Accuracy: 41.875


DeBERTa

In [8]:
!rm -rf outputs/

In [11]:
import sklearn
from simpletransformers.classification import ClassificationModel, ClassificationArgs
model_args = ClassificationArgs(num_train_epochs=5)
model = ClassificationModel("deberta", "microsoft/deberta-base", args = model_args,num_labels=3,use_cuda=True)
model.train_model(train_dataset, acc=sklearn.metrics.accuracy_score)

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'classifi

  0%|          | 0/3200 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/400 [00:00<?, ?it/s]

(2000, 0.4989971442274982)

In [12]:
result, model_outputs, wrong_predictions = model.eval_model(validation_dataset)

  0%|          | 0/800 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/100 [00:00<?, ?it/s]

In [13]:
import numpy as np
predictions = np.argmax(model_outputs,axis=1)
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(list(validation_dataset['labels']), predictions)*100)

Accuracy: 70.375
