In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import matplotlib.pyplot as plt
from json import JSONDecoder
from functools import partial
import json
from pprint import pprint
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer
import mxnet as mx

## DATA & PRE-PROCESSING

In [2]:
# Preprocessing steps
# Shared Lancaster stemmer instance; removeStopWords() uses it to reduce each kept word to its stem.
stemmer = LancasterStemmer()

def decodeHTMLencoding(tweets):
    """Return a new DataFrame in which every cell is reduced to plain text.

    Each cell of ``tweets`` is parsed with BeautifulSoup's ``lxml`` parser and
    replaced by the result of ``get_text()``.
    """
    def _plain_text(raw_tweet):
        return BeautifulSoup(raw_tweet, 'lxml').get_text()

    return tweets.applymap(_plain_text)

def removeStopWords(text):
    """Drop short words and English stop words from ``text`` and stem the rest.

    ``text`` is split on whitespace. A word is kept only if it is longer than
    three characters and is not in NLTK's English stop-word list; every kept
    word is then stemmed with the module-level ``stemmer``.

    Returns the stems joined by single spaces.
    """
    # A set gives constant-time membership tests instead of scanning the
    # stop-word list once per word.
    stop_words = set(stopwords.words('english'))
    stems = [
        stemmer.stem(word)
        for word in text.split()
        if len(word) > 3 and word not in stop_words
    ]
    return " ".join(stems).strip()

def _scrub_tweet_text(tweet):
    """Apply the per-string cleaning steps of cleanTweets to one decoded tweet."""
    # remove URLs that start with http(s); the digit range is written with an
    # ASCII hyphen ('0-9') so that all ten digits belong to the character class
    tweet = re.sub(
        r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)',
        '', tweet, flags=re.MULTILINE)
    # remove URLs that do not start with http
    tweet = re.sub(
        r'[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)',
        '', tweet, flags=re.MULTILINE)
    # remove @mentions
    tweet = re.sub(r'@[A-Za-z0-9_]+', '', tweet, flags=re.MULTILINE)
    # remove #hashtags
    tweet = re.sub(r'#[A-Za-z0-9_]+', '', tweet, flags=re.MULTILINE)
    # remove the retweet marker; \b leaves words that merely end in "RT" intact
    tweet = re.sub(r'\bRT ', '', tweet, flags=re.MULTILINE)
    # remove symbols and numbers (i.e. keep letters only)
    tweet = re.sub("[^a-zA-Z]", " ", tweet, flags=re.MULTILINE)
    # only ASCII letters and spaces are left at this point, so lower-casing is
    # the last step that still changes the text
    return tweet.lower()


def cleanTweets(tweets):
    """Clean every cell of a DataFrame of raw texts.

    Steps, applied to each cell: decode HTML via ``decodeHTMLencoding``, strip
    URLs, @mentions, #hashtags and the ``RT `` marker, replace every non-letter
    with a space, and lower-case the result.

    Afterwards duplicate rows are dropped, and so are rows containing a cell
    that is exactly the empty string. The surviving rows keep their original
    index labels, which callers use to realign labels with the cleaned texts.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Frame whose cells are all strings.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    cleaned_tweets = decodeHTMLencoding(tweets).applymap(_scrub_tweet_text)
    cleaned_tweets = cleaned_tweets.drop_duplicates()
    cleaned_tweets = cleaned_tweets.replace('', np.nan).dropna()

    return cleaned_tweets

In [3]:
def get_text_vectors(tweets, model, dimension=300):
    """Embed each text as the average of its word vectors.

    Parameters
    ----------
    tweets : iterable of str
        Texts to embed; each one is split on whitespace.
    model : mapping
        Word-to-vector lookup that supports ``model[word]`` and raises
        ``KeyError`` for unknown words.
    dimension : int, default 300
        Expected vector length. Texts whose summed vector has another size
        are skipped.

    Returns
    -------
    data_array : numpy.ndarray, shape (n_kept, dimension)
        One averaged vector per kept text.
    indexes : numpy.ndarray of int, shape (n_kept,)
        Positions within ``tweets`` of the kept texts.

    Notes
    -----
    The sum of the known word vectors is divided by the total number of words
    in the text, including words missing from ``model``. Texts with no words,
    or with no word known to ``model``, are skipped.
    """
    features = []
    indexes = []

    for i, tweet in enumerate(tweets):
        words = tweet.split()
        if not words:
            continue

        total = None
        for word in words:
            try:
                vector = np.asarray(model[word], dtype=float)
            except KeyError:
                # out-of-vocabulary word: contributes nothing to the sum
                continue
            # always build a new array so the model's vectors are never mutated
            total = vector if total is None else total + vector

        if total is None or total.size != dimension:
            continue
        features.append(total / len(words))
        indexes.append(i)

    # Stack once at the end instead of re-allocating the matrix per text.
    if features:
        data_array = np.asarray(features, dtype=float).reshape(-1, dimension)
    else:
        data_array = np.empty(shape=[0, dimension])
    # An integer dtype keeps the result usable as an index even when empty.
    indexes = np.asarray(indexes, dtype=int)
    assert indexes.size == data_array.shape[0]
    return data_array, indexes

In [4]:
def remove_indices(weak_signals):
    """Drop the rows of ``weak_signals`` that no signal covers.

    A row is treated as uncovered when its entries sum to minus the number of
    columns, i.e. when every signal holds -1 for that row.

    Returns
    -------
    tuple
        ``(weak_signals_without_those_rows, positions_of_removed_rows)``.
    """
    n_signals = weak_signals.shape[1]
    row_totals = np.sum(weak_signals, axis=1)
    uncovered_rows = np.flatnonzero(row_totals == -n_signals)
    covered_signals = np.delete(weak_signals, uncovered_rows, axis=0)

    return covered_signals, uncovered_rows

In [5]:
# Load the GloVe vectors: each row holds a word followed by its vector components.
# quoting=3 (csv.QUOTE_NONE) keeps quote characters as ordinary text, and
# keep_default_na=False stops pandas from parsing vocabulary words such as
# "null" or "nan" as missing values, which would corrupt their dictionary keys.
df = pd.read_csv('../datasets/glove.42B.300d.txt', sep=" ", quoting=3, header=None,
                 index_col=0, keep_default_na=False)
# Map each word to its row of the value matrix; this avoids transposing the whole frame.
glove_model = dict(zip(df.index, df.values))

In [6]:
# sanity-check the word vectors: cosine similarity of two near-synonyms
from scipy import spatial
vec_horrible = glove_model['horrible']
vec_terrible = glove_model['terrible']
result = 1 - spatial.distance.cosine(vec_horrible, vec_terrible)
result

0.9358371614102348

In [29]:
def keyword_labeling(data, keywords, sentiment='pos'):
    """Build one weak-signal column per keyword group.

    For every group in ``keywords`` and every text in ``data`` the signal is
    1 (``sentiment == 'pos'``) or 0 (any other ``sentiment``) when at least one
    keyword of the group occurs as a substring of the lower-cased text, and -1
    otherwise.

    Returns an array with one row per text and one column per keyword group.
    """
    hit_label = 1 if sentiment == 'pos' else 0
    lowered_texts = [text.lower() for text in data]
    signal_columns = [
        [hit_label if any(word in text for word in terms) else -1
         for text in lowered_texts]
        for terms in keywords
    ]
    return np.asarray(signal_columns).T

# Keyword groups for keyword_labeling: each inner list yields one weak signal.
POSITIVE_LABELS = [
    ['good', 'great', 'nice', 'delight', 'wonderful'],
    ['love', 'best', 'genuine', 'well', 'thriller'],
    ['clever', 'enjoy', 'fine', 'deliver', 'fascinating'],
    ['super', 'excellent', 'charming', 'pleasure', 'strong'],
    ['fresh', 'comedy', 'interesting', 'fun', 'entertain', 'charm', 'clever'],
    ['amazing', 'romantic', 'intelligent', 'classic', 'stunning'],
    ['rich', 'compelling', 'delicious', 'intriguing', 'smart'],
]

# The SST-2 section redefines this list with a fifth group that swaps 'worse'
# for the negations 'no' and 'not'.
NEGATIVE_LABELS = [
    ['bad', 'better', 'leave', 'never', 'disaster'],
    ['nothing', 'action', 'fail', 'suck', 'difficult'],
    ['mess', 'dull', 'dumb', 'bland', 'outrageous'],
    ['slow', 'terrible', 'boring', 'insult', 'weird', 'damn'],
    ['drag', 'awful', 'waste', 'flat', 'worse'],
    ['horrible', 'ridiculous', 'stupid', 'annoying', 'painful'],
    ['poor', 'pathetic', 'pointless', 'offensive', 'silly'],
]

# YELP

In [72]:
datapath = '../datasets/yelp/'
size = 10000
# chunksize makes read_json hand back an iterator of frames with `size` rows
# each; the next cell consumes it chunk by chunk.
review = pd.read_json(
    datapath+'yelp_review.json',
    lines=True,
    chunksize=size,
    dtype={
        **dict.fromkeys(['review_id', 'user_id', 'business_id', 'date', 'text'], str),
        **dict.fromkeys(['stars', 'useful', 'funny', 'cool'], int),
    },
)

In [73]:
# The reader yields several chunks; only the first 6 are collected.
unused_columns = ['review_id','user_id','useful','funny','cool','business_id','date']
count = 0
chunk_list = []
for count, chunk_review in enumerate(review, start=1):
    # keep only the columns that are still needed
    chunk_review = chunk_review.drop(columns=unused_columns)
    chunk_list.append(chunk_review)
    if count == 6:
        break
# stitch the trimmed chunks back into a single frame with a fresh index
df = pd.concat(chunk_list, axis=0, join='outer', ignore_index=True)

In [74]:
# persist the trimmed reviews as CSV, then preview the first rows
csv_name = datapath+"yelp_reviews.csv"
df.to_csv(path_or_buf=csv_name, index=False)
df.head(5)

Unnamed: 0,stars,text
0,2,"As someone who has worked with many museums, I..."
1,1,I am actually horrified this place is still in...
2,5,I love Deagan's. I do. I really do. The atmosp...
3,1,"Dismal, lukewarm, defrosted-tasting ""TexMex"" g..."
4,4,"Oh happy day, finally have a Canes near my cas..."


In [75]:
yelp_texts = df['text'].values
positive_labels = keyword_labeling(yelp_texts, POSITIVE_LABELS, sentiment='pos')
negative_labels = keyword_labeling(yelp_texts, NEGATIVE_LABELS, sentiment='neg')
# positive signal columns first, then the negative ones; rows that no signal
# covers are removed and their positions are kept in `indices`
weak_signals, indices = remove_indices(
    np.concatenate((positive_labels, negative_labels), axis=1)
)
weak_signals.shape

(55392, 14)

In [76]:
# drop the reviews removed from weak_signals so both stay row-aligned
df = df.drop(df.index[indices]).reset_index(drop=True)
train_data = df['text'].values
# binary target: 1.0 for reviews with more than 3 stars, 0.0 otherwise
train_labels = (df['stars'].values > 3).astype(float)

In [77]:
# clean the review text only (the star rating column is left out)
train_data = cleanTweets(df.drop('stars', axis=1))
# cleanTweets drops rows, so realign labels and signals with the survivors
surviving_rows = train_data.index
train_labels = train_labels[surviving_rows]
weak_signals = weak_signals[surviving_rows]
train_data.shape, train_labels.shape

((55370, 1), (55370,))

In [None]:
# flatten the single-column frame into a 1-D array of cleaned texts
cleaned_texts = train_data.values.ravel()
train_features, train_index = get_text_vectors(cleaned_texts, glove_model)
train_features.shape, train_index.shape

In [None]:
# get test data
# train_features only has rows for the texts listed in train_index, so the
# labels and weak signals are restricted to those same rows first; otherwise
# the positions drawn below would address different reviews in each array.
aligned_labels = train_labels[train_index]
aligned_signals = weak_signals[train_index]

np.random.seed(5000)
test_indexes = np.random.choice(train_index.size, 10000, replace=False)
test_labels = aligned_labels[test_indexes]
test_data = train_features[test_indexes]

# everything that was not drawn for the test split stays in the training split
train_data = np.delete(train_features, test_indexes, axis=0)
weak_signals = np.delete(aligned_signals, test_indexes, axis=0)
train_labels = np.delete(aligned_labels, test_indexes)

train_data.shape,train_labels.shape,weak_signals.shape,test_labels.shape

In [None]:
# save the Yelp weak signals, feature matrices and labels under datapath
yelp_arrays = (
    ('weak_signals.npy', weak_signals),
    ('data_features.npy', train_data),
    ('test_features.npy', test_data),
    ('data_labels.npy', train_labels),
    ('test_labels.npy', test_labels),
)
for file_name, array in yelp_arrays:
    np.save(datapath+file_name, array)

In [81]:
# final shapes of the Yelp training arrays and the test labels
tuple(array.shape for array in (train_data, train_labels, weak_signals, test_labels))

((45370, 300), (45370,), (45370, 14), (10000,))

# SST-2

In [35]:
datapath = '../datasets/sst-2/'
# read the train and test splits from the same folder
train_data, test_data = (
    pd.read_csv(datapath+file_name) for file_name in ('sst2-train.csv', 'sst2-test.csv')
)
train_data.head()

Unnamed: 0,label,sentence
0,1,"A stirring, funny and finally transporting re-..."
1,0,Apparently reassembled from the cutting-room f...
2,0,They presume their audience won't sit still fo...
3,1,This is a visually stunning rumination on love...
4,1,Jonathan Parker's Bartleby should have been th...


In [36]:
# SST-2 overrides the negative keyword groups: the fifth group swaps 'worse'
# for the negations 'no' and 'not'. keyword_labeling matches keywords as
# substrings, so these two short words also fire inside longer words.
NEGATIVE_LABELS = [['bad','better','leave','never','disaster'], 
                   ['nothing','action','fail','suck','difficult'], 
                   ['mess','dull','dumb', 'bland','outrageous'], 
                   ['slow', 'terrible', 'boring', 'insult','weird','damn'],
                   ['drag','no','not','awful','waste', 'flat'], 
                   ['horrible','ridiculous','stupid', 'annoying','painful'], 
                   ['poor','pathetic','pointless','offensive','silly']]

# Both label matrices are built from the same `sentence` column (the original
# negative call misspelled the column name and raised AttributeError).
sentences = train_data.sentence.values
positive_labels = keyword_labeling(sentences, POSITIVE_LABELS)
negative_labels = keyword_labeling(sentences, NEGATIVE_LABELS, sentiment='neg')
weak_signals = np.hstack([positive_labels, negative_labels])
weak_signals.shape

(6920, 14)

In [37]:
# remove_indices takes only the weak-signal matrix; the uncovered rows are then
# dropped from train_data as well so that sentences, labels and weak signals
# stay row-aligned (the Yelp section does the same with df.drop).
weak_signals, indices = remove_indices(weak_signals)
train_data = train_data.drop(train_data.index[indices])
weak_signals.shape

(3998, 14)

In [38]:
from model_utilities import get_error_bounds

train_labels = train_data.label.values
test_labels = test_data.label.values

# turn the (n, m) signal matrix into m stacked (n, 1) columns, one per signal
n,m = weak_signals.shape
weak_signal_probabilities = weak_signals.T.reshape(m,n,1)

# keyword_labeling writes -1 where a signal does not fire, so the mask flags
# only the entries that carry a 0/1 label
weak_signals_mask = np.greater_equal(weak_signal_probabilities, 0)

true_error_rates = get_error_bounds(train_labels, weak_signal_probabilities, weak_signals_mask)
print("error: ", np.asarray(true_error_rates))

error:  [[0.30916844]
 [0.29194631]
 [0.26710098]
 [0.29081633]
 [0.36492375]
 [0.31952663]
 [0.19417476]
 [0.34623218]
 [0.32853026]
 [0.2513369 ]
 [0.33333333]
 [0.44829801]
 [0.15116279]
 [0.18348624]]


In [39]:
# Clean data and reset index
# The reset gives train_data a fresh 0..n-1 index, so the index labels that
# survive cleanTweets can be used as positions into train_labels / weak_signals.
train_data.reset_index(drop=True, inplace=True)

# apply on train data
# NOTE(review): post_process_tweets is not defined anywhere in the visible
# notebook, so this cell fails on a fresh kernel unless it is provided elsewhere.
# Its result is indexed with [0] (printed below via .shape) and [1] (used as
# row indexes in the next cell); presumably a (cleaned frame, kept indexes)
# pair -- confirm against its definition.
train_data = cleanTweets(train_data.drop(columns=['label']))
train_data = post_process_tweets(train_data)

# apply on test data
test_data = cleanTweets(test_data.drop(columns=['label']))
test_data = post_process_tweets(test_data)

# compare the cleaned text shapes with the label array shapes
print(train_data[0].shape, train_labels.shape)
print(test_data[0].shape, test_labels.shape)

(3998, 1) (3998,)
(1821, 1) (1821,)


In [18]:
train_features, train_index = get_text_vectors(train_data[0].values.ravel(), glove_model)
test_features, test_index = get_text_vectors(test_data[0].values.ravel(), glove_model)

# save sst-2 data
np.save(datapath+'data_features.npy', train_features)
np.save(datapath+'test_features.npy', test_features)

# get_text_vectors skips texts without a usable vector, so both splits are
# narrowed to the rows that actually received features before indexing labels.
# NOTE(review): assumes test_data[1] supports the same positional indexing as
# train_data[1] -- confirm against post_process_tweets.
indexes = train_data[1]
indexes = indexes[train_index]
test_label_indexes = test_data[1][test_index]
# save sst-2 labels
np.save(datapath+'data_labels.npy', train_labels[indexes])
np.save(datapath+'test_labels.npy', test_labels[test_label_indexes])

# save the weak signals
np.save(datapath+'weak_signals.npy', weak_signals[indexes])

0       a stirring  funny and finally transporting re ...
1       they presume their audience won t sit still fo...
2       this is a visually stunning rumination on love...
3       campanella gets the tone just right    funny i...
4       a fan film that for the uninitiated plays bett...
                              ...                        
3600    painful  horrifying and oppressively tragic  t...
3601    take care is nicely performed by a quintet of ...
3602    the script covers huge  heavy topics in a blan...
3603    a seriously bad film with seriously warped log...
3604    a deliciously nonsensical comedy about a city ...
Name: sentence, Length: 3605, dtype: object

# IMDB Dataset

In [102]:
datapath = '../datasets/imdb/'
df = pd.read_csv(datapath+'IMDB Dataset.csv')

# clean every column except the sentiment label; only the index labels of the
# rows that survive cleaning are used from here on
cleaned_data = cleanTweets(df.drop('sentiment', axis=1))
indexes = cleaned_data.index.values
df.shape, indexes.size

((50000, 2), 49580)

In [103]:
n = indexes.size
# get test data: a reproducible 20% sample of the rows that survived cleaning
np.random.seed(50)
test_indexes = np.random.choice(indexes, int(n*0.2), replace=False)
test_labels = (df.sentiment.values[test_indexes] == 'positive').astype(float)
# NOTE(review): the texts are taken from the raw df.review column, not from
# cleaned_data -- confirm this is intended for the feature extraction below.
test_data = df.review.values[test_indexes]

# Everything not drawn for the test split stays in the training split. One
# vectorised membership test replaces a separate np.where scan per test index;
# the order of `indexes` is preserved.
train_indexes = indexes[~np.isin(indexes, test_indexes)]
train_labels = (df.sentiment.values[train_indexes] == 'positive').astype(float)
train_data = df.review.values[train_indexes]

print(train_data.shape, train_labels.shape)
print(test_data.shape, test_labels.shape)

(39664,) (39664,)
(9916,) (9916,)


In [104]:
# one single-keyword group per weak signal
positive_keywords = [[word] for word in ('good', 'wonderful', 'great', 'amazing', 'excellent')]
negative_keywords = [[word] for word in ('bad', 'horrible', 'sucks', 'awful', 'terrible')]
positive_labels = keyword_labeling(train_data, positive_keywords, sentiment='pos')
negative_labels = keyword_labeling(train_data, negative_keywords, sentiment='neg')
# drop the reviews that none of the ten signals covers
weak_signals, indices = remove_indices(
    np.concatenate((positive_labels, negative_labels), axis=1)
)
weak_signals.shape

(29187, 10)

In [106]:
# move the training reviews that no weak signal covers into the test split
uncovered_reviews = train_data[indices]
uncovered_labels = train_labels[indices]
test_data = np.concatenate((test_data, uncovered_reviews))
test_labels = np.concatenate((test_labels, uncovered_labels))

# ...and take them out of the training split
train_data = np.delete(train_data, indices, axis=0)
train_labels = np.delete(train_labels, indices)

# embed both splits; the returned indexes list the texts that got a vector
train_features, train_index = get_text_vectors(train_data, glove_model)
test_features, test_index = get_text_vectors(test_data, glove_model)

print(train_index.size, train_data.shape[0])
test_index.size, test_labels.size

29182 29187


(20392, 20393)

In [107]:
# save the IMDB features, labels and weak signals; labels and signals are
# restricted to the rows for which get_text_vectors produced a feature vector
imdb_arrays = (
    ('data_features.npy', train_features),
    ('test_features.npy', test_features),
    ('data_labels.npy', train_labels[train_index]),
    ('test_labels.npy', test_labels[test_index]),
    ('weak_signals.npy', weak_signals[train_index]),
)
for file_name, array in imdb_arrays:
    np.save(datapath+file_name, array)