In [1]:
import numpy as np                               # linear algebra
import pandas as pd 

In [2]:
from nltk.tokenize import word_tokenize          
from nltk.stem import WordNetLemmatizer          
from nltk.corpus import stopwords                
import re                                        
from string import punctuation 
import random                                    
import matplotlib.pyplot as plt  

In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adita\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adita\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adita\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\adita\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
class GloVe_Embedder:
    def __init__(self, path):
        self.embedding_dict = {}
        self.embedding_array = []
        self.unk_emb = 0
        # Adapted from https://stackoverflow.com/questions/37793118/load-pretrained-GloVe-vectors-in-python
        with open(path,'r') as f:
            for line in f:
                split_line = line.split()
                word = split_line[0]
                embedding = np.array(split_line[1:], dtype=np.float64)
                self.embedding_dict[word] = embedding
                self.embedding_array.append(embedding.tolist())
        self.embedding_array = np.array(self.embedding_array)
        self.embedding_dim = len(self.embedding_array[0])
        self.vocab_size = len(self.embedding_array)
        self.unk_emb = np.zeros(self.embedding_dim)

    # Check if the provided embedding is the unknown embedding.
    def is_unk_embed(self, embed):
        return np.sum((embed - self.unk_emb) ** 2) < 1e-7
    
    # Check if the provided string is in the vocabulary.
    def token_in_vocab(self, x):
        if x in self.embedding_dict and not self.is_unk_embed(self.embedding_dict[x]):
            return True
        return False

    # Returns the embedding for a single string and prints a warning if
    # the string is unknown to the vocabulary.
    # 
    # If indicate_unk is set to True, the return type will be a tuple of 
    # (numpy array, bool) with the bool indicating whether the returned 
    # embedding is the unknown embedding.
    #
    # If warn_unk is set to False, the method will no longer print warnings
    # when used on unknown strings.
    def embed_str(self, x, indicate_unk = False, warn_unk = True):
        if self.token_in_vocab(x):
            if indicate_unk:
                return (self.embedding_dict[x], False)
            else:
                return self.embedding_dict[x]
        else:
            if warn_unk:
                    print("Warning: provided word is not part of the vocabulary!")
            if indicate_unk:
                return (self.unk_emb, True)
            else:
                return self.unk_emb

    # Returns an array containing the embeddings of each vocabulary token in the provided list.
    #
    # If include_unk is set to False, the returned list will not include any unknown embeddings.
    def embed_list(self, x, include_unk = True):
        if include_unk:
            embeds = [self.embed_str(word, warn_unk = False).tolist() for word in x]
        else:
            embeds_with_unk = [self.embed_str(word, indicate_unk=True, warn_unk = False) for word in x]
            embeds = [e[0].tolist() for e in embeds_with_unk if not e[1]]
            if len(embeds) == 0:
                print("No known words in input:" + str(x))
                embeds = [self.unk_emb.tolist()]
        return np.array(embeds)
    
    # Finds the vocab words associated with the k nearest embeddings of the provided word. 
    # Can also accept an embedding vector in place of a string word.
    # Return type is a nested list where each entry is a word in the vocab followed by its 
    # distance from whatever word was provided as an argument.
    def find_k_nearest(self, word, k, warn_about_unks = True):
        if type(word) == str:
            word_embedding, is_unk = self.embed_str(word, indicate_unk = True)
        else:
            word_embedding = word
            is_unk = False
        if is_unk and warn_about_unks:
            print("Warning: provided word is not part of the vocabulary!")

        all_distances = np.sum((self.embedding_array - word_embedding) ** 2, axis = 1) ** 0.5
        distance_vocab_index = [[w, round(d, 5)] for w,d,i in zip(self.embedding_dict.keys(), all_distances, range(len(all_distances)))]
        distance_vocab_index = sorted(distance_vocab_index, key = lambda x: x[1], reverse = False)
        return distance_vocab_index[:k]

    def save_to_file(self, path):
        with open(path, 'w') as f:
            for k in self.embedding_dict.keys():
                embedding_str = " ".join([str(round(s, 5)) for s in self.embedding_dict[k].tolist()])
                string = k + " " + embedding_str
                f.write(string + "\n")

In [5]:
ge = GloVe_Embedder("GloVe_Embedder_data.txt")

In [6]:
train_data = pd.read_csv("IA3-train.csv")
test_data = pd.read_csv("IA3-dev.csv")

In [7]:
def clean_text(text):
    '''Make text lowercase, remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('[%s]' % re.escape(punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

# Applying the cleaning function to both test and train datasets
train_data['text'] = train_data['text'].apply(lambda x: clean_text(x))
test_data['text'] = test_data['text'].apply(lambda x: clean_text(x))

In [8]:
# tokenizing the text

train_data['text'] = train_data['text'].apply(lambda x:word_tokenize(x))
test_data['text'] = test_data['text'].apply(lambda x:word_tokenize(x))

In [9]:
# removing stopwords (defined in nltk.corpus.stopwords)

def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words 

train_data['text'] = train_data['text'].apply(lambda x : remove_stopwords(x))
test_data['text'] = test_data['text'].apply(lambda x : remove_stopwords(x))

In [10]:
# lemmatizing the text entries

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in text]  ##Notice the use of text.

train_data['text'] = train_data['text'].apply(lambda x : lemmatize_text(x))
test_data['text'] = test_data['text'].apply(lambda x : lemmatize_text(x))

In [11]:
def concatenate_text(text):
    return ' '.join(text)

train_data['text'] = train_data['text'].apply(lambda x : concatenate_text(x))
test_data['text'] = test_data['text'].apply(lambda x : concatenate_text(x))

In [13]:
def train_val_split(df, validation_split):
    """
    This function generates the training and validation splits from an input dataframe
    
    Parameters:
        dataframe: pandas dataframe with columns "text" and "target" (binary)
        validation_split: should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the validation split
    
    Returns:
        train_samples: list of strings in the training dataset
        val_samples: list of strings in the validation dataset
        train_labels: list of labels (0 or 1) in the training dataset
        val_labels: list of labels (0 or 1) in the validation dataset      
    """
       
    text = df['text'].values.tolist()                         # input text as list
    targets = df['sentiment'].values.tolist()                    # targets
    
#   Preparing the training/validation datasets
    
    seed = random.randint(1,50)   # random integer in a range (1, 50)
    rng = np.random.RandomState(seed)
    rng.shuffle(text)
    rng = np.random.RandomState(seed)
    rng.shuffle(targets)

    num_validation_samples = int(validation_split * len(text))

    train_samples = text[:-num_validation_samples]
    val_samples = text[-num_validation_samples:]
    train_labels = targets[:-num_validation_samples]
    val_labels = targets[-num_validation_samples:]
    
    print(f"Total size of the dataset: {df.shape[0]}.")
    print(f"Training dataset: {len(train_samples)}.")
    print(f"Validation dataset: {len(val_samples)}.")
    
    return train_samples, val_samples, train_labels, val_labels


In [14]:
train_samples, val_samples, train_labels, val_labels = train_val_split(train_data, 0.1)

Total size of the dataset: 9000.
Training dataset: 8100.
Validation dataset: 900.


In [47]:
train_samples, val_samples, train_labels, val_labels = train_val_split(test_data, 0.1)

Total size of the dataset: 2500.
Training dataset: 2250.
Validation dataset: 250.


In [17]:
train_embedded_matrix = []
for i in range(0,len(train_data['text'])):
    word = train_data['text'][i].split()
    train_embedded_matrix.append(ge.embed_list(word))

In [32]:
print(len(train_embedded_matrix))

9000


In [49]:
test_embedded_matrix = []
for i in range(0,len(test_data['text'])):
    word = test_data['text'][i].split()
    test_embedded_matrix.append(ge.embed_list(word))

In [50]:
print(len(test_embedded_matrix))

2500


In [21]:
from statistics import mean

In [52]:
word_vec_average = []
for i in range(len(train_embedded_matrix)):
    vec_array = np.array(train_embedded_matrix[i])
    average = vec_array.mean(axis=0)
    word_vec_average.append(average)

In [51]:
test_word_vec_average = []
for i in range(len(test_embedded_matrix)):
    vec_array = np.array(train_embedded_matrix[i])
    average = vec_array.mean(axis=0)
    test_word_vec_average.append(average)

In [40]:
from sklearn import svm
from sklearn.metrics import classification_report

In [42]:
x_train = train_data['text']
y_train = train_data['sentiment']

In [46]:
linear_svm = svm.SVC(kernel ='linear', C = 10**1).fit(word_vec_average, y_train)
prediction1 = linear_svm.predict(word_vec_average)
report = classification_report(train_data['sentiment'], prediction1, output_dict=True)
print('positive: ', report['1'])
print('negative: ', report['0'])

positive:  {'precision': 0.8796007485963818, 'recall': 0.7692307692307693, 'f1-score': 0.820721769499418, 'support': 1833}
negative:  {'precision': 0.9428146545896985, 'recall': 0.9730710199525604, 'f1-score': 0.9577039274924471, 'support': 7167}


In [54]:
linear_svm = svm.SVC(kernel ='linear', C = 10**1)
training_model = linear_svm.fit(word_vec_average, y_train)
prediction_validation = linear_svm.predict(test_word_vec_average)
validation_report = classification_report(test_data['sentiment'], prediction_validation, output_dict=True)
print('positive: ', validation_report['1'])
print('negative: ', validation_report['0'])

positive:  {'precision': 0.18468468468468469, 'recall': 0.15648854961832062, 'f1-score': 0.1694214876033058, 'support': 524}
negative:  {'precision': 0.7850194552529183, 'recall': 0.8168016194331984, 'f1-score': 0.8005952380952381, 'support': 1976}


In [56]:
quadratic_svm = svm.SVC(kernel ='poly', degree=2, C = 10**1, coef0 = 40)
training_model = quadratic_svm.fit(word_vec_average, y_train)
prediction_validation = quadratic_svm.predict(test_word_vec_average)
validation_report = classification_report(test_data['sentiment'], prediction_validation, output_dict=True)
print('positive: ', validation_report['1'])
print('negative: ', validation_report['0'])

positive:  {'precision': 0.18888888888888888, 'recall': 0.16221374045801526, 'f1-score': 0.17453798767967144, 'support': 524}
negative:  {'precision': 0.7858536585365854, 'recall': 0.8152834008097166, 'f1-score': 0.8002980625931446, 'support': 1976}


In [57]:
rbf_svm = svm.SVC(kernel ='rbf', C = 11, gamma=0.2)
training_model = rbf_svm.fit(word_vec_average, y_train)
prediction_validation = rbf_svm.predict(test_word_vec_average)
validation_report = classification_report(test_data['sentiment'], prediction_validation, output_dict=True)
print('Accuracy: ', validation_report['accuracy'])
print('positive: ', validation_report['1'])
print('negative: ', validation_report['0'])

Accuracy:  0.6724
positive:  {'precision': 0.19206680584551147, 'recall': 0.17557251908396945, 'f1-score': 0.18344965104685942, 'support': 524}
negative:  {'precision': 0.7862444334487877, 'recall': 0.8041497975708503, 'f1-score': 0.7950963222416813, 'support': 1976}
