In this notebook I'm going to try Doc2Vec, a document embedding technique for feature representation in NLP, and use it for text classification.

In [2]:
# first lets do some imports
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import nltk 
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [3]:
# Read the labelled tweets from disk into a DataFrame
data_path = "./Data/Sentiment and Emotion in Text/train_data.csv"
df = pd.read_csv(data_path)

In [4]:
# (rows, columns) of the loaded frame
df.shape

(30000, 2)

In [5]:
# Peek at the first ten rows to see what the labels and the raw tweets look like
df.head(10)

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...
5,worry,Re-pinging @ghostridah14: why didn't you go to...
6,sadness,"I should be sleep, but im not! thinking about ..."
7,worry,Hmmm. http://www.djhero.com/ is down
8,sadness,@charviray Charlene my love. I miss you
9,sadness,@kelcouch I'm sorry at least it's Friday?


In [6]:
# Count the tweets per sentiment label: shows how many categories there are
# and how unevenly the samples are spread across them
df['sentiment'].value_counts()

worry         7433
neutral       6340
sadness       4828
happiness     2986
love          2068
surprise      1613
hate          1187
fun           1088
relief        1021
empty          659
enthusiasm     522
boredom        157
anger           98
Name: sentiment, dtype: int64

This is clearly a multi-class classification problem (13 sentiment labels with very uneven counts).

For this experiment let's use just three categories: the two most frequent ones (worry and neutral) plus happiness.

In [8]:
# Sentiment labels kept for this experiment
shortlist = ["neutral", "happiness", "worry"]

In [11]:
# Keep only the rows whose label is one of the shortlisted sentiments
in_shortlist = df['sentiment'].isin(shortlist)
df_subset = df.loc[in_shortlist]
df_subset.shape

(16759, 2)

### Text PreProcessing

 - Remove @mentions and URLs
 - Use the NLTK `TweetTokenizer`
 - Apply the usual normalization steps (lowercasing, stop-word and digit removal)

In [12]:
# https://www.nltk.org/api/nltk.tokenize.casual.html
# check the docs for details

# preserve_case. By default, it is set to True. If it is set to False, then the tokenizer
# will downcase everything except for emoticons.

# strip_handles. By default, it is set to False. If it is set to True, Twitter handles
# (@mentions) are removed from the text passed to the tokenize method.


# Lowercase everything (except emoticons) and drop @handles while tokenizing
tweeter = TweetTokenizer(strip_handles=True, preserve_case=False)

# The stop-word list is a separate NLTK data package; fetch it once if it is
# missing so the notebook also runs on a fresh environment.
try:
    mystopwords = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords")
    mystopwords = set(stopwords.words("english"))

# content : one feature vector AKA one sample


def preprocess_corpus(texts, tokenizer=None, stop_words=None):
    """Tokenize each text and drop stop words, pure-digit tokens and URLs.

    Parameters
    ----------
    texts : iterable of str
        Raw documents (tweets), one string per document.
    tokenizer : object with a ``tokenize(str)`` method, optional
        Splits one text into a list of string tokens. Defaults to the
        notebook-level ``tweeter``.
    stop_words : collection of str, optional
        Tokens to discard. Defaults to the notebook-level ``mystopwords``.

    Returns
    -------
    list of list of str
        One filtered token list per input text, in input order.
    """
    if tokenizer is None:
        tokenizer = tweeter
    if stop_words is None:
        stop_words = mystopwords

    # URLs are dropped by prefix; this relies on the tokenizer emitting a URL
    # as a single token (NLTK's TweetTokenizer is documented to do so).
    url_prefixes = ("http://", "https://", "www.")

    def keep(token):
        return (
            token not in stop_words
            and not token.isdigit()
            and not token.lower().startswith(url_prefixes)
        )

    return [
        [token for token in tokenizer.tokenize(content) if keep(token)]
        for content in texts
    ]


# Labels and token lists both come from df_subset, so they stay in the same row order
mycats = df_subset['sentiment']
mydata = preprocess_corpus(df_subset['content'])

In [14]:
# Hold out part of the corpus for evaluation; the fixed seed keeps the split repeatable
split = train_test_split(mydata, mycats, random_state=42)
train_data, test_data, train_cats, test_cats = split

In [15]:
# Doc2Vec trains on TaggedDocument objects: wrap every training token list and
# tag it with its position in the training set (as a string).
train_doc2vec = [
    TaggedDocument(words=tokens, tags=[str(position)])
    for position, tokens in enumerate(train_data)
]


In [16]:
# Sanity check: the tagged corpus is a plain Python list
type(train_doc2vec)

list

In [17]:
# Look at the first five tagged documents (token list + string tag)
train_doc2vec[:5]

[TaggedDocument(words=['going', 'start', 'using', 'fast', 'access', 'm.twitter.com', 'school'], tags=['0']),
 TaggedDocument(words=['happy', 'star', 'wars', 'day', '!'], tags=['1']),
 TaggedDocument(words=['getting', 'ready', 'school', 'hopfully', 'today', 'good', 'day'], tags=['2']),
 TaggedDocument(words=['pavement', 'boiling', 'hot', ',', 'dogs', 'limping', '.', 'guess', "summer's", 'officially', '.'], tags=['3']),
 TaggedDocument(words=['anything', 'accepted', 'except', 'christianity', '.', 'google', 'discussion', 'thread', '"', 'sexuality', 'religion', '"', 'sled', 'second', 'life', '.'], tags=['4'])]

In [19]:
# Lets train my doc2vec model
d2v_settings = dict(
    vector_size=50,  # dimensionality of each document vector
    alpha=0.025,     # initial learning rate
    min_count=5,     # ignore words with a total frequency below 5
    dm=1,            # distributed-memory (PV-DM) training algorithm
    epochs=100,      # passes over the corpus
)
model = Doc2Vec(**d2v_settings)

# Vocabulary first, then the actual training passes over the tagged corpus
model.build_vocab(train_doc2vec)
model.train(train_doc2vec, total_examples=model.corpus_count, epochs=model.epochs)

# Persist the trained model so later cells can reload it
model.save("d2v.model")
print("Model Saved")

Model Saved


The saved model file (`d2v.model`) is around 4 MB.

#### Modelling

In [20]:
# Reload the Doc2Vec model from the "d2v.model" file on disk
model = Doc2Vec.load("d2v.model")

In [24]:
def _infer_all(token_lists):
    """Infer one Doc2Vec vector per token list, in order, using the loaded model."""
    return [model.infer_vector(tokens) for tokens in token_lists]


# Same inference routine for both splits (train first, then test)
train_vectors = _infer_all(train_data)
test_vectors = _infer_all(test_data)

In [25]:
# Sanity check: the inferred vectors are collected in a plain Python list
type(train_vectors)

list

In [29]:
# First inferred document vector (the model was built with vector_size=50)
train_vectors[0]

array([-0.32290205, -0.1476218 ,  0.19532248, -0.07156629,  0.12674208,
       -0.48220414, -0.47260845,  0.05374419,  0.04003971,  0.47250116,
       -0.04746957, -2.502907  ,  0.04083798, -0.43603304,  0.88077235,
       -0.43018845, -1.1216704 ,  0.10489108, -0.4669931 ,  0.06127795,
        0.5249197 , -0.3624527 ,  0.9820764 , -0.16685843,  0.65624183,
        0.06044985, -0.8895854 ,  1.1260021 ,  0.28771064,  0.02224413,
       -1.1069902 , -0.7974283 , -0.45196617,  0.02421224,  0.00730934,
        0.37003765, -0.40049896,  0.26372626, -0.27938882, -0.18127759,
        0.6289527 ,  0.48223737, -0.21647307,  1.0628011 ,  1.0584211 ,
       -0.10752965, -0.02682967, -0.11258072, -0.4142734 , -0.0156419 ],
      dtype=float32)

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix

# Fit a class-weighted logistic regression on the inferred document vectors;
# fit() returns the estimator itself, so construction and fitting can be chained.
myclass = LogisticRegression(class_weight="balanced").fit(train_vectors, train_cats)

# Score the held-out documents and print per-class precision / recall / F1
preds = myclass.predict(test_vectors)
print(classification_report(test_cats, preds))

              precision    recall  f1-score   support

   happiness       0.35      0.52      0.42       724
     neutral       0.45      0.56      0.50      1586
       worry       0.60      0.37      0.46      1880

    accuracy                           0.47      4190
   macro avg       0.47      0.48      0.46      4190
weighted avg       0.50      0.47      0.47      4190

