# <center>Machine Learning Project n°2:
# <center>Text Sentiment Classification

  Tweets offer a gigantic database that provides useful information for Natural Language Processing. They have the advantage of being constrained in size, which limits the number of words per tweet and means that each tweet usually refers to a single emotion of the user. This emotion can take various forms, but for our analysis we will stick to a positive or negative sentiment. This Jupyter notebook shows the different approaches we followed to determine the sentiment of various tweets.

In [1]:
import pandas as pd
import numpy as np
import pickle
import re

from helpers import *

from nltk.tokenize import TweetTokenizer

from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models.word2vec import Word2Vec
# If error -> pip install --upgrade gensim

from tqdm import tqdm, tnrange
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas(desc="Process")
# If error -> pip install tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
# If error -> pip install -U nltk
nltk.download('wordnet') #Uncomment this line if first time using 'wordnet' corpus
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import SnowballStemmer
from nltk.corpus import stopwords
lmtzr = WordNetLemmatizer()
ps = PorterStemmer()
sb = SnowballStemmer("english")



[nltk_data] Downloading package wordnet to /Users/neuro/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/neuro/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/neuro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 1. Data pre-processing

### 1.1 Dataframe creation

In [2]:
# Import positive and negative tweets.
# Each file is read into a single 'text' column (tab is used as the delimiter
# and the files have no header row); the label column is added by hand.
tweet_pos_df = pd.read_csv('twitter-datasets/train_pos_full.txt',
                           names=['text'], delimiter="\t", header=None)
tweet_pos_df['sentiment'] = 1

tweet_neg_df = pd.read_csv('twitter-datasets/train_neg_full.txt',
                           names=['text'], delimiter="\t", header=None)
tweet_neg_df['sentiment'] = -1

# Create the general dataframe.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; ignore_index=True yields the same fresh 0..n-1 index
# that the former reset_index(drop=True) produced.
tweets_df = pd.concat([tweet_pos_df, tweet_neg_df], ignore_index=True)
del tweet_pos_df, tweet_neg_df

# Shuffle the rows so positive and negative tweets are interleaved.
# The shuffle is seeded so that a Restart & Run All reproduces the same row
# order (and therefore the same train/test split further down).
tweets_df = tweets_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [3]:
# Preview the first 10 rows of the shuffled dataframe
tweets_df.head(10)

Unnamed: 0,text,sentiment
0,guess i wasnt any help,-1
1,eben say hi to me please you would make my who...,1
2,steve madden's are really nice but why so maha...,-1
3,"dingo bone , large , 3-1 / 2 - ounce ( grocery...",-1
4,"<user> haha , not true , but thanks ... pretty...",1
5,<user> thanks jason,1
6,<user> follow <user> hes the guy you were list...,1
7,<user> lilo and stitch ! wow brings back memories,1
8,ugh ! ! why does my dog feel the need to const...,-1
9,my mums so much more tanned then me #paleshit ...,-1


### 1.2 Tweets processing
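- Remove all punctuation & all numbers (maybe not useful)
- Remove all user & url mentions (not applied in the current code)
- Remove stop words (not applied in the current code)
- Tokenize (split) each tweet
- Tag words (noun, verb, adj, adv...) (not applied in the current code)
- Lemmatize words (not applied in the current code)
- Stem words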

In [4]:
# Lookup table for convert_tag_to_pos: tag key -> part-of-speech code handed
# to the lemmatizer.
pos_dict = dict(N='n', V='v', J='a', S='s', R='r')

# English stop-word list from the NLTK corpus, stored as a set.
stop_words = {*stopwords.words('english')}

def convert_tag_to_pos(tag): #(for lemmatizer)
    """Convert the tag given by pos_tag() into something lemmatize can understand.

    pos_tag() produces multi-letter tags such as 'NN', 'VBD' or 'JJ', whereas
    ``pos_dict`` is keyed by a single upper-case letter. The tag is therefore
    looked up as-is first (so single-letter keys keep working) and then by
    its first letter.

    Args:
        tag: tag string for one word (e.g. 'NN', 'VBD', 'J').

    Returns:
        The matching value from ``pos_dict``, or 'n' (noun) when the tag is
        empty/None or has no entry.
    """
    if not tag:
        return 'n' #Default value

    if tag in pos_dict:
        return pos_dict[tag]

    # Fall back to the tag's first letter; 'n' stays the default value.
    return pos_dict.get(tag[0], 'n')

# Built once and reused for every tweet: neither object depends on the tweet
# being processed, so re-creating them per call is wasted work on a large corpus.
_TWEET_TOKENIZER = TweetTokenizer()
_NON_LETTER_RE = re.compile('[^A-Za-z ]+')

def get_tokenized_tweet(tweet):
    """ Clean a tweet and turn it into a list of stemmed tokens.

    Steps:
      1. Drop every character that is not an ASCII letter or a space
         (punctuation, digits, '#', '<', '>', ...).
      2. Split the cleaned text with NLTK's TweetTokenizer.
      3. Stem each token with the module-level Snowball stemmer ``sb``.

    The '<user>' / '<url>' placeholders are not removed: once the angle
    brackets are stripped they remain as the plain tokens 'user' / 'url'.
    No stop-word filtering or lemmatization is applied.

    Args:
        tweet: raw tweet text (str).

    Returns:
        List of stemmed tokens (possibly empty).
    """
    # Remove all punctuation & all numbers
    cleaned = _NON_LETTER_RE.sub('', tweet)

    tokens = _TWEET_TOKENIZER.tokenize(cleaned)

    # Stemming
    return [sb.stem(word) for word in tokens]

In [5]:
# Tokenize every tweet. progress_map is the tqdm-enabled variant of map
# (registered by tqdm_notebook.pandas(...) in the import cell), so a progress
# bar is shown while the tweets are processed.
tweets_df['tokenized text'] = tweets_df['text'].progress_map(get_tokenized_tweet)

# Preview the raw text next to its token list
tweets_df.head(10)

A Jupyter Widget




Unnamed: 0,text,sentiment,tokenized text
0,guess i wasnt any help,-1,"[guess, i, wasnt, ani, help]"
1,eben say hi to me please you would make my who...,1,"[eben, say, hi, to, me, pleas, you, would, mak..."
2,steve madden's are really nice but why so maha...,-1,"[steve, madden, are, realli, nice, but, whi, s..."
3,"dingo bone , large , 3-1 / 2 - ounce ( grocery...",-1,"[dingo, bone, larg, ounc, groceri, dingo, bone..."
4,"<user> haha , not true , but thanks ... pretty...",1,"[user, haha, not, true, but, thank, pretti, su..."
5,<user> thanks jason,1,"[user, thank, jason]"
6,<user> follow <user> hes the guy you were list...,1,"[user, follow, user, hes, the, guy, you, were,..."
7,<user> lilo and stitch ! wow brings back memories,1,"[user, lilo, and, stitch, wow, bring, back, me..."
8,ugh ! ! why does my dog feel the need to const...,-1,"[ugh, whi, doe, my, dog, feel, the, need, to, ..."
9,my mums so much more tanned then me #paleshit ...,-1,"[my, mum, so, much, more, tan, then, me, pales..."


## 2. Training the word2vec model


In [6]:
# Split data in test and train set: 25% of the tweets are held out for
# evaluation; random_state=42 fixes the split for a given row order.
X_train, X_test, y_train, y_test = train_test_split(tweets_df['tokenized text'], 
                                                    tweets_df['sentiment'],
                                                    test_size=0.25,
                                                    random_state=42)

# The full dataframe is not referenced after this point; drop it to free
# memory. Re-running this cell therefore requires re-running the cells above.
del tweets_df

### 2.1 word2vec model

- Process tweets for the word2vec model
- Create word2vec model

We want to compute the word2vec model in order to transform any tweet into a vector. Once we have a vector for each word, we can build the vector of the associated sentence.

In [7]:
def get_taggedDocument(X):
    """ Prepare taggedDocument class from tweets for model

    Args:
        X: sequence of tokenized tweets (each one a list of tokens).

    Returns:
        List of TaggedDocument, one per tweet and in the same order, where
        the i-th document is tagged with the single tag "TRAIN_<i>".
    """
    taggedDocument = []

    # Iterate over the items directly rather than indexing X[index], so the
    # position counter never gets mistaken for a label-based lookup.
    for index, words in enumerate(tqdm_notebook(X)):
        # `tags` must be a list of tags: a bare string would be treated as a
        # sequence of single-character tags.
        tags = ["TRAIN_" + str(index)]

        taggedDocument.append(TaggedDocument(words, tags))

    return taggedDocument

In [8]:
# Wrap the training tweets, then keep only their token lists: these are what
# the Word2Vec model and the tf-idf vectorizer below are fitted on.
taggedDocument = get_taggedDocument(list(X_train))
all_words = [x.words for x in taggedDocument]
n_dim = 250  # dimensionality of the word vectors

# Creation of the model
# NOTE(review): `size=` here and `.iter` below look like the pre-4.0 gensim
# names (4.0 uses `vector_size=` / `.epochs`) — confirm against the installed version.
w2v_model = Word2Vec(size=n_dim, min_count=2,workers = 4) # Test with different min_count !

# Build vocab
print("Build vocab...")
w2v_model.build_vocab(all_words)

# Train the model
print("Train w2v model...")
w2v_model.train(all_words,
                total_examples=w2v_model.corpus_count,
                epochs=w2v_model.iter)


A Jupyter Widget


Build vocab...
Train w2v model...


94798253

In [9]:
# Sanity check of the embedding: nearest neighbours of 'good'.
# Queries go through the model's KeyedVectors (`.wv`); calling most_similar
# directly on the Word2Vec object is deprecated and no longer exists in gensim 4.
w2v_model.wv.most_similar('good')

[('goood', 0.7205197215080261),
 ('great', 0.7098403573036194),
 ('bad', 0.6105893850326538),
 ('nice', 0.5894044637680054),
 ('terribl', 0.5668543577194214),
 ('fab', 0.5608239769935608),
 ('horribl', 0.555421769618988),
 ('rubbish', 0.5260864496231079),
 ('decent', 0.5183205008506775),
 ('gud', 0.5112592577934265)]


### 2.2 tf-idf matrix

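Tf–idf (or TFIDF) is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. Here, the idf weight of each word is used to scale its word2vec vector when building the vector of a tweet.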

In [10]:
# Create tf-idf matrix
# The tweets are already lists of tokens, so the analyzer simply returns its
# input instead of tokenizing a string; min_df=10 is passed to drop rare tokens.
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform(all_words)

# Map every retained token to its idf weight.
# vocabulary_ (token -> column index) is used instead of get_feature_names(),
# which was deprecated in scikit-learn 1.0 and removed in 1.2; idf_ is indexed
# by that same column index, so the resulting mapping is identical.
tf_idf = {word: vectorizer.idf_[column]
          for word, column in vectorizer.vocabulary_.items()}

## 3. Classifier

In [11]:
def convert_tweet_to_vector(tweet, size):
    """ Convert a tweet in a vector based on the w2v model

    The result is the sum of the word2vec vectors of the tweet's words, each
    one multiplied by the word's idf weight from ``tf_idf``. It is a sum, not
    an average: it is not divided by the number of words.

    Args:
        tweet: iterable of tokens.
        size: dimensionality of the word vectors.

    Returns:
        numpy array of shape (1, size); all zeros when no word of the tweet
        is known to both the w2v model and ``tf_idf``.
    """
    vector = np.zeros((1, size))

    for word in tweet:
        try:
            # Word vectors are read through `.wv`; indexing the Word2Vec model
            # directly is deprecated and no longer works in gensim 4.
            weighted = w2v_model.wv[word].reshape((1, size)) * tf_idf[word]
        except KeyError:
            # Word unknown to the w2v vocabulary or to the tf-idf table
            # (e.g. too rare, or never seen in training): skip it on purpose.
            continue

        vector += weighted

    return vector

def get_vectors(tweets, n_dim=None):
    """ Create the matrix associated to a set of tweets

    Args:
        tweets: iterable of tokenized tweets (each one an iterable of tokens).
        n_dim: dimensionality of the word vectors. Defaults to the vector size
            of the trained ``w2v_model``, so it cannot drift from the value
            used to build the model.

    Returns:
        numpy array of shape (number of tweets, n_dim), one row per tweet in
        input order; shape (0, n_dim) when ``tweets`` is empty.
    """
    if n_dim is None:
        n_dim = w2v_model.vector_size

    all_vectors = [convert_tweet_to_vector(tweet, n_dim) for tweet in tqdm_notebook(tweets)]

    # np.concatenate raises on an empty list; return an empty matrix instead.
    if not all_vectors:
        return np.zeros((0, n_dim))

    return np.concatenate(all_vectors)

In [12]:
# Compute the vectors matrix from tweets
# Each token-list Series is deleted as soon as it has been vectorized, to free
# memory. Re-running this cell therefore requires re-running the split cell.
X_train_w2v = get_vectors(X_train)
del X_train
X_test_w2v  = get_vectors(X_test)
del X_test

A Jupyter Widget




A Jupyter Widget




In [13]:
# Multi-layer perceptron with three hidden layers of 50 units each, fitted
# with the 'adam' solver on the training tweet vectors; random_state=1 seeds
# the classifier.
size = (50, 50, 50)
classifierPercZ = MLPClassifier(solver='adam', alpha=1e-7, hidden_layer_sizes = size, random_state=1)
classifierPercZ.fit(X_train_w2v, y_train)

MLPClassifier(activation='relu', alpha=1e-07, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(50, 50, 50), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

## 4. Results

In [14]:
# Evaluate the fitted classifier on the held-out 25% split
print("The score on the test set is:", classifierPercZ.score(X_test_w2v, y_test))

The score on the test set is: 0.829487043892


## 5. Test on the unlabeled Twitter set

In [15]:
# Import data
tweet_unlabelized_df = pd.read_csv('twitter-datasets/test_data.txt', 
                                   names=['text'], delimiter="\t", header=None)
# Ids for the submission file start at 1 (the dataframe index starts at 0)
tweet_unlabelized_index = tweet_unlabelized_df.index.values + 1

# Process data
# Same tokenization and vectorization as for the training tweets, so the
# features match what the classifier was fitted on.
X_unlabelized = tweet_unlabelized_df['text'].map(get_tokenized_tweet)
X_unlabelized_w2v = get_vectors(X_unlabelized)

# Make predictions
y_unlabelized = classifierPercZ.predict(X_unlabelized_w2v)

# Create submission file
# NOTE(review): create_csv_submission is not defined in this notebook —
# presumably it comes from `from helpers import *`; confirm.
create_csv_submission(tweet_unlabelized_index, y_unlabelized, "text_class_submission_top.csv")
print("Submission file created !")


A Jupyter Widget


Submission file created !
