In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt 
import re
import demoji
demoji.download_codes()
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud,STOPWORDS
import seaborn as sns

[33mDownloading emoji data ...[0m
[92m... OK[0m (Got response in 0.30 seconds)
[33mWriting emoji data to /Users/Liuzhaoyu/.demoji/codes.json ...[0m
[92m... OK[0m


In [2]:
# some additional imports
# from autocorrect import speller
stopwordsList = stopwords.words('english')
# ended up lemmatizing instead of stemming
from nltk.stem import PorterStemmer
st = PorterStemmer()
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [3]:
def parse_dataset(fp):
    '''
    Loads the dataset file with label-tweet on each line and parses the dataset.
    :param fp: filepath of dataset
    :return:
        corpus: list of tweet strings of each tweet.
        y: list of labels
    '''
    y = []
    corpus = []
    with open(fp, 'rt') as data_in:
        for line in data_in:
            if not line.lower().startswith("tweet index"): # discard first line if it contains metadata
                line = line.rstrip() # remove trailing whitespace
                label = int(line.split("\t")[1])
                tweet = line.split("\t")[2]
                y.append(label)
                corpus.append(tweet)

    return corpus, y

In [4]:
# training data
train_data, train_label = parse_dataset('datasets/train/SemEval2018-T3-train-taskA_emoji.txt')
df_train = pd.DataFrame(np.array(train_data).reshape(3834,1), columns = ['tweet'])
df_train['label'] = np.array(train_label).reshape(3834,1)
df_train.head()

Unnamed: 0,tweet,label
0,Sweet United Nations video. Just in time for C...,1
1,@mrdahl87 We are rumored to have talked to Erv...,1
2,Hey there! Nice to see you Minnesota/ND Winter...,1
3,3 episodes left I'm dying over here,0
4,"""I can't breathe!"" was chosen as the most nota...",1


In [5]:
# testing data
test_data, test_label = parse_dataset('datasets/goldtest_TaskA/SemEval2018-T3_gold_test_taskA_emoji.txt')
df_test = pd.DataFrame(np.array(test_data).reshape(784,1), columns = ['tweet'])
df_test['label'] = np.array(test_label).reshape(784,1)
df_test.head()

Unnamed: 0,tweet,label
0,@Callisto1947 Can U Help?||More conservatives ...,0
1,"Just walked in to #Starbucks and asked for a ""...",1
2,#NOT GONNA WIN http://t.co/Mc9ebqjAqj,0
3,@mickymantell He is exactly that sort of perso...,0
4,So much #sarcasm at work mate 10/10 #boring 10...,1


# Project Preperation

## Setup project

In [6]:
import numpy as np 
import pandas as pd 
import re
import nltk 
import matplotlib.pyplot as plt
%matplotlib inline

## Clean and augment data

In [7]:
# Get users' texts as well as corresponding sentiments
texts_train = df_train.iloc[:, 0].values
texts_test = df_test.iloc[:, 0].values
sentiments_train = df_train.iloc[:, 1].values
sentiments_test = df_test.iloc[:, 1].values

In [8]:
# Clean texts
clean_texts_train = []
clean_texts_test = []
for text in texts_train:
    # remove all special characters
    clean_text = re.sub(r'\W', ' ', str(text))
    # remove all single character
    clean_text = re.sub(r'\s+[a-zA-Z]\s+', ' ', clean_text)
    # replace multiple spaces to single space
    clean_text = re.sub(r'\s+', ' ', clean_text, flags=re.I)
    # to lower case
    clean_text = clean_text.lower()
    clean_texts_train.append(clean_text)
    
for text in texts_test:
    # remove all special characters
    clean_text = re.sub(r'\W', ' ', str(text))
    # remove all single character
    clean_text = re.sub(r'\s+[a-zA-Z]\s+', ' ', clean_text)
    # replace multiple spaces to single space
    clean_text = re.sub(r'\s+', ' ', clean_text, flags=re.I)
    # to lower case
    clean_text = clean_text.lower()
    clean_texts_test.append(clean_text)

In [9]:
# Combine positive and neutral
clean_sentiments_train = []
clean_sentiments_test = []
for sentiment in sentiments_train:
    if sentiment == "positive" or sentiment == "neutral":
        clean_sentiment = "non-negative"
    else:
        clean_sentiment = sentiment
    clean_sentiments_train.append(clean_sentiment)
    
for sentiment in sentiments_test:
    if sentiment == "positive" or sentiment == "neutral":
        clean_sentiment = "non-negative"
    else:
        clean_sentiment = sentiment
    clean_sentiments_test.append(clean_sentiment)

# Modelling

## Build the base model

In [10]:
# Split into training and testing data
X_train = clean_texts_train
X_test = clean_texts_test
y_train = clean_sentiments_train
y_test = clean_sentiments_test

## Evaluation metrics

## Build the model and train on the data set

In [11]:
words_train = [w for w in X_train if not w in stopwords.words("english")]
words_test = [w for w in X_test if not w in stopwords.words("english")]

In [12]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [13]:
# Set values for various parameters
num_features = 1000   # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-4   # Downsample setting for frequent words

In [14]:
# Apply word2vec to train the model
from gensim.models import word2vec

model = word2vec.Word2Vec(words_train, workers=num_workers, size=num_features, 
                          min_count = min_word_count, window = context, 
                          sample = downsampling)
model_name = "sentiment_tweet"
model.save(model_name)

2020-03-03 12:48:12,886 : INFO : collecting all words and their counts
2020-03-03 12:48:12,888 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-03 12:48:12,922 : INFO : collected 66 word types from a corpus of 308795 raw words and 3834 sentences
2020-03-03 12:48:12,923 : INFO : Loading a fresh vocabulary
2020-03-03 12:48:12,923 : INFO : effective_min_count=40 retains 38 unique words (57% of original 66, drops 28)
2020-03-03 12:48:12,924 : INFO : effective_min_count=40 leaves 308761 word corpus (99% of original 308795, drops 34)
2020-03-03 12:48:12,925 : INFO : deleting the raw counts dictionary of 66 items
2020-03-03 12:48:12,926 : INFO : sample=0.0001 downsamples 38 most-common words
2020-03-03 12:48:12,927 : INFO : downsampling leaves estimated 16781 word corpus (5.4% of prior 308761)
2020-03-03 12:48:12,927 : INFO : estimated required memory for 38 words and 1000 dimensions: 323000 bytes
2020-03-03 12:48:12,928 : INFO : resetting layer weights
2020-

In [15]:
# Function to average all word vectors in a paragraph
def featureVecMethod(words, model, num_features):
    # Pre-initialising empty numpy array for speed
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0
    
    #Converting Index2Word which is a list to a set for better speed in the execution.
    index2word_set = set(model.wv.index2word)
    
    for word in  words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec,model[word])
    
    # Dividing the result by number of words to get average
    featureVec = np.divide(featureVec, nwords)
    return featureVec

In [16]:
# Function for calculating the average feature vector
def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        # Printing a status message every 1000th review
        if counter%1000 == 0:
            print("Review %d of %d"%(counter,len(reviews)))
            
        reviewFeatureVecs[counter] = featureVecMethod(review, model, num_features)
        counter = counter+1
        
    return reviewFeatureVecs

In [17]:
trainDataVecs=getAvgFeatureVecs(words_train, model, num_features)
testDataVecs=getAvgFeatureVecs(words_test, model, num_features)

  del sys.path[0]


Review 0 of 3834
Review 1000 of 3834
Review 2000 of 3834
Review 3000 of 3834
Review 0 of 784


In [18]:
forest = RandomForestClassifier(n_estimators=100)
forest.fit(trainDataVecs, y_train)

NameError: name 'RandomForestClassifier' is not defined

## Evaluation metrics

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

pred = forest.predict(testDataVecs)
print(classification_report(y_test,pred))
print("Accuracy:", accuracy_score(y_test, pred))