### Imports

In [7]:
# Import Data
import pandas as pd
import numpy as np
import os
import csv
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import nlpaug.augmenter.word as naw

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
pd.options.display.max_colwidth = 1000

### importing lazypredict library
import lazypredict
### importing LazyClassifier for classification problem
from lazypredict.Supervised import LazyClassifier
### importing LazyClassifier for classification problem because here we are solving Classification use case.
from lazypredict.Supervised import LazyClassifier
### importing breast Cancer Dataset from sklearn
from sklearn.datasets import load_breast_cancer
### spliting dataset into training and testing part
from sklearn.model_selection import train_test_split

import xgboost as xgb
from xgboost import XGBClassifier

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Utility Functions

In [12]:
def preProcess(data):
    print("[preProcess] start")
    print(data['sentence'][2800])
    data['sentence'] = [entry.lower() for entry in data['sentence']]
    data['sentence'] = [word_tokenize(entry) for entry in data['sentence']]
    print(data['sentence'][2800])
    
    # WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
    tag_map = defaultdict(lambda : wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV
    for index,entry in enumerate(data['sentence']):
        # Declaring Empty List to store the words that follow the rules for this step
        finalWords = []
        # Initializing WordNetLemmatizer()
        wordLemmatized = WordNetLemmatizer()
        # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
        for word, tag in pos_tag(entry):
            # Below condition is to check for Stop words and consider only alphabets
            if word not in stopwords.words('english') and word.isalpha():
                word_Final = wordLemmatized.lemmatize(word,tag_map[tag[0]])
                finalWords.append(word_Final)
        # The final processed set of words for each iteration will be stored in 'text_final'
        data.loc[index,'sentence'] = str(finalWords)
    print("[preProcess] end")
    return data

def vectorizeAndSplit(allData):
    
    print("[vectorizeAndSplit] start")
    allData = preProcess(allData)
    display(allData)
    allData.reset_index(drop=True,inplace=True)
    trainingData = allData[allData['set'] == 'train']
    validationData = allData[allData['set'] == 'val']
    testingData = allData[allData['set'] == 'test']

    trainingData.reset_index(drop=True,inplace=True)
    validationData.reset_index(drop=True,inplace=True)
    testingData.reset_index(drop=True,inplace=True)
    assert( len(trainingData) + len(testingData) + len(validationData) == len(allData))
    
    Encoder = LabelEncoder()
    trainY = Encoder.fit_transform(trainingData['annotation'])
    testY = Encoder.fit_transform(testingData['annotation'])
    
    tfidVect = TfidfVectorizer(max_features=5000)
    tfidVect.fit(trainingData['sentence'])
    
    trainXVectorized = tfidVect.transform(trainingData['sentence'])
    testXVectorized = tfidVect.transform(testingData['sentence'])
    
    print("[vectorizeAndSplit] end")
    return trainXVectorized,testXVectorized,trainY,testY




### Load all data into pandas

In [3]:

allData = pd.DataFrame()
dataDirectory = 'data'
for filename in os.listdir(dataDirectory):
    f = os.path.join(dataDirectory, filename)
    # checking if it is a file
    if os.path.isfile(f):
        df = pd.read_csv(f,sep = '\t',quoting=csv.QUOTE_NONE)
        allData = pd.concat([allData,df])
allData.reset_index(inplace=True)

In [None]:
trainXVectorized,testXVectorized,trainY,testY = vectorizeAndSplit(allData)

In [5]:
#Naive Bayes
Naive = naive_bayes.MultinomialNB()
Naive.fit(trainXVectorized,trainY)
# predict the labels on validation dataset
predictionsNB = Naive.predict(testXVectorized)
# Use accuracy_score function to get the accuracy
print("Naive Bayes F1 Score -> ",f1_score(predictionsNB, testY,pos_label='positive',average='micro'))

Naive Bayes F1 Score ->  0.6265413975337639


In [6]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(trainXVectorized,trainY)
# predict the labels on validation dataset
predictionsSVM = SVM.predict(testXVectorized)
# Use accuracy_score function to get the accuracy
print("SVM F1 Score -> ",f1_score(predictionsSVM, testY,average='micro')*100)

SVM F1 Score ->  62.86944607555295


In [12]:

model = XGBClassifier() 
model.fit(trainXVectorized, trainY)
predXGBoost = model.predict(testXVectorized)
print("XGBoost F1 Score -> ",f1_score(predXGBoost, testY,average='micro')*100)

XGBoost F1 Score ->  63.28048541789


In [10]:
trainXVectorized.shape, trainY.shape

array([2, 0, 1, ..., 2, 0, 2], dtype=int64)

# With Augmented Data

In [8]:

aug = naw.BackTranslationAug()
text = 'The quick brown fox jumps over the lazy dog .'

print("ORIGINAL TEXT: ", text)
print("AUGMENTED TEXT: ",aug.augment(text))

Downloading (…)okenizer_config.json: 100%|██████████| 67.0/67.0 [00:00<00:00, 67.0kB/s]
Downloading (…)/main/vocab-src.json: 100%|██████████| 849k/849k [00:06<00:00, 132kB/s]
Downloading (…)/main/vocab-tgt.json: 100%|██████████| 849k/849k [00:03<00:00, 243kB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 315k/315k [00:01<00:00, 181kB/s]


ORIGINAL TEXT:  The quick brown fox jumps over the lazy dog .
AUGMENTED TEXT:  ['The speedy brown fox jumps over the lazy dog.']


In [20]:
text = 'The quick brown fox jumps over the lazy dog .'
syn_aug = naw.SynonymAug(aug_src='wordnet',aug_max=2)
syn_aug_text = syn_aug.augment(text,n=1)
print("ORIGINAL TEXT: ", text)
print("AUGMENTED TEXT: ",syn_aug_text)

ORIGINAL TEXT:  The quick brown fox jumps over the lazy dog .
AUGMENTED TEXT:  ['The quick john brown fox jump over the lazy dog.']


In [22]:
def backTranslationAugmentation(data):
    print('[backTranslationAugmentation] start')
    duplicatedDf = pd.concat([data]*2, ignore_index=True)
    
    for i in range( int ( len(duplicatedDf)/2 ) , len(duplicatedDf)):
        if(i==0):
            print('Duplicate : ',duplicatedDf.loc[i,'sentence'])
            print('Original : ',duplicatedDf.loc[i - int ( len(duplicatedDf)/2 ) ,'sentence'])
        duplicatedDf.loc[i,'sentence'] = aug.augment( duplicatedDf.loc[i - int ( len(duplicatedDf)/2 ) ,'sentence'] )[0]
        print('.',sep="")
    duplicatedDf.reset_index(drop=True,inplace=True)
    print('[backTranslationAugmentation] end')
    return duplicatedDf

def synonymAugmentation(data):
    print('[synonymAugmentation] start')
    duplicatedDf = pd.concat([data]*2, ignore_index=True)
    syn_aug = naw.SynonymAug(aug_src='wordnet',aug_max=2)
    once = True
    for i in range( int ( len(duplicatedDf)/2 ) , len(duplicatedDf)):
        if(once):
            print('Duplicate : ',duplicatedDf.loc[i,'sentence'])
            print('Original : ',duplicatedDf.loc[i - int ( len(duplicatedDf)/2 ) ,'sentence'])
            once = False
        duplicatedDf.loc[i,'sentence'] = syn_aug.augment(duplicatedDf.loc[i - int ( len(duplicatedDf)/2 ) ,'sentence'],n=1)[0]
        print('.',end="")
    duplicatedDf.reset_index(drop=True,inplace=True)
    print('[synonymAugmentation] end')
    return duplicatedDf

def vectorizeAugmentAndSplit(allData):
    
    print("[vectorizeAndSplit] start")
    allData = preProcess(allData)
    display(allData)
    allData.reset_index(drop=True,inplace=True)
    trainingData = allData[allData['set'] == 'train']
    validationData = allData[allData['set'] == 'val']
    testingData = allData[allData['set'] == 'test']

    trainingData.reset_index(drop=True,inplace=True)
    trainingData = synonymAugmentation(trainingData)
    validationData.reset_index(drop=True,inplace=True)
    testingData.reset_index(drop=True,inplace=True)
    assert( len(trainingData)/2 + len(testingData) + len(validationData) == len(allData))
    
    Encoder = LabelEncoder()
    trainY = Encoder.fit_transform(trainingData['annotation'])
    testY = Encoder.fit_transform(testingData['annotation'])
    
    tfidVect = TfidfVectorizer(max_features=5000)
    tfidVect.fit(trainingData['sentence'])
    
    trainXVectorized = tfidVect.transform(trainingData['sentence'])
    testXVectorized = tfidVect.transform(testingData['sentence'])
    
    print("[vectorizeAndSplit] end")
    return trainXVectorized,testXVectorized,trainY,testY

In [23]:

allData = pd.DataFrame()
dataDirectory = 'data'
for filename in os.listdir(dataDirectory):
    f = os.path.join(dataDirectory, filename)
    # checking if it is a file
    if os.path.isfile(f):
        df = pd.read_csv(f,sep = '\t',quoting=csv.QUOTE_NONE)
        allData = pd.concat([allData,df])
allData.reset_index(inplace=True)

In [None]:
trainXVectorized,testXVectorized,trainY,testY = vectorizeAugmentAndSplit(allData)

In [25]:
#Naive Bayes
Naive = naive_bayes.MultinomialNB()
Naive.fit(trainXVectorized,trainY)
# predict the labels on validation dataset
predictionsNB = Naive.predict(testXVectorized)
# Use accuracy_score function to get the accuracy
print("Naive Bayes F1 Score -> ",f1_score(predictionsNB, testY,pos_label='positive',average='micro'))
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(trainXVectorized,trainY)
# predict the labels on validation dataset
predictionsSVM = SVM.predict(testXVectorized)
# Use accuracy_score function to get the accuracy
print("SVM F1 Score -> ",f1_score(predictionsSVM, testY,average='micro')*100)
model = XGBClassifier() 
model.fit(trainXVectorized, trainY)
predXGBoost = model.predict(testXVectorized)
print("XGBoost F1 Score -> ",f1_score(predXGBoost, testY,average='micro')*100)


Naive Bayes F1 Score ->  0.6308475239772949
SVM F1 Score ->  61.79291446467019
XGBoost F1 Score ->  63.848111176355445
