This file creates a feature vector for each tweet,
for both the train and the test data, based on the embeddings of the vocabulary words
that were produced with the Word2Vec method.

In [1]:
import pandas as pd
import numpy as np
import csv
#Word Embeddings
import gensim 
from gensim.models import Word2Vec
import time

### .....................................................................................................................................................................................

## LOAD DATA (Word Embedded Tweets)

In [2]:
#Load a tab-separated dataset
def load_data(file, col_names, n=0):
    """Read a headerless, tab-separated file into a DataFrame.

    Parameters
    ----------
    file : path or file-like object handed to ``pd.read_csv``.
    col_names : list of column names to assign (the file has no header row).
    n : number of rows to read; ``0`` (the default) reads the whole file.

    Returns
    -------
    pandas.DataFrame whose columns are named after ``col_names``.

    Quote characters are treated as ordinary text (``csv.QUOTE_NONE``) and
    malformed lines do not abort the read.
    """
    import inspect

    options = dict(sep="\t", header=None, names=col_names, quoting=csv.QUOTE_NONE)

    # ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0 in
    # favour of ``on_bad_lines``; use whichever keyword this pandas accepts.
    # "warn" drops the malformed line but still reports it.
    if "on_bad_lines" in inspect.signature(pd.read_csv).parameters:
        options["on_bad_lines"] = "warn"
    else:
        options["error_bad_lines"] = False

    # Only limit the row count when a specific number of rows was requested.
    if n != 0:
        options["nrows"] = n

    return pd.read_csv(file, **options)

In [3]:
# Retrieve cleaned up tweets for further usage
# Both frames are read back from pickles in the working directory; later
# cells read their 'Tweet' column (and 'TweetID' / 'Sentiment' where present).
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# were produced by a trusted source.
train_tweets_cleaned = pd.read_pickle("./train_tweets_cleaned.pkl")
test_tweets_cleaned = pd.read_pickle("./test_tweets_cleaned.pkl")

In [4]:
# CHANGE THIS ACCORDINGLY
# Dimensionality of the Word2Vec embeddings to work with. It selects which
# saved model / extra-feature pickles are loaded (their file names end in
# "_vs<vector_size>.pkl") and sizes the zero accumulators used by
# DefaultVector and Tweet2Vector.
# vector_size = 50
# vector_size = 100
# vector_size = 200
vector_size = 300

In [5]:
# Load the Word2Vec model saved for each split, together with the list of
# words that model holds an embedding for.
# gensim 4.0 removed ``KeyedVectors.vocab`` in favour of ``key_to_index``;
# use whichever mapping the installed gensim provides so the cell runs on
# both 3.x and 4.x. Iterating either mapping yields the vocabulary words.
model_train = Word2Vec.load("./train_tweets_word2vec_vs"+str(vector_size)+".pkl")
vocabulary_train = list(
    model_train.wv.key_to_index
    if hasattr(model_train.wv, "key_to_index")
    else model_train.wv.vocab
)
model_test = Word2Vec.load("./test_tweets_word2vec_vs"+str(vector_size)+".pkl")
vocabulary_test = list(
    model_test.wv.key_to_index
    if hasattr(model_test.wv, "key_to_index")
    else model_test.wv.vocab
)

In [6]:
# Get Extra Features
# One pickled frame per split, chosen by the current vector_size. Further
# down, each tweet's row is looked up by index label and its values are
# appended to that tweet's vector.
train_extra_features = pd.read_pickle("./train_extra_vs"+str(vector_size)+".pkl")
test_extra_features = pd.read_pickle("./test_extra_vs"+str(vector_size)+".pkl")

### .....................................................................................................................................................................................

## TWEETS EMBEDDINGS

In [7]:
# Fallback embedding for words that have no vector of their own:
# the mean of all known word vectors, Σ<vectored_words>/<num_vectored_words>
def DefaultVector(model, vocabulary):
    """Return the element-wise mean of the embeddings of all vocabulary words.

    model      -- object whose ``wv[word]`` lookup yields the word's vector
    vocabulary -- sized collection of the words to average over

    The accumulator starts as ``vector_size`` zeros (module-level setting).
    """
    running_sum = np.zeros(vector_size)
    for known_word in vocabulary:
        # Accumulate one word at a time, in vocabulary order.
        running_sum = running_sum + model.wv[known_word]
    word_count = len(vocabulary)
    return running_sum / word_count

In [8]:
# Turn one tokenised tweet into a single vector
def Tweet2Vector(tweet, model, vocabulary):
    """Return the sum of the embeddings of the tweet's words.

    tweet      -- iterable of the tweet's words
    model      -- object whose ``wv[word]`` lookup yields the word's vector
    vocabulary -- collection of the words that have a vector in ``model``

    Words missing from ``vocabulary`` each contribute the module-level
    ``default_vector`` (Σ<vectored_words>/<num_vectored_words>), so that
    global must be set for the matching model before this is called.
    The result has ``vector_size`` entries (module-level setting).
    """
    missing_count = 0
    summed = np.zeros(vector_size)

    for token in tweet:
        # Words without an embedding are only counted here ...
        if token not in vocabulary:
            missing_count += 1
            continue
        summed = summed + model.wv[token]

    # ... and accounted for in one step: one default vector per missing word.
    return summed + missing_count * default_vector

# Example (requires default_vector to be set first):
# Tweet2Vector(train_tweets_cleaned.at[0,'Tweet'], model_train, vocabulary_train)

In [9]:
#Create new dataframe with the vectorized tweets and more of their information
def Modeled_Tweets(col_names, cleaned_tweets, model, vocabulary):
    """Return a copy of the selected columns plus a 'Vector' column.

    col_names      -- columns of ``cleaned_tweets`` to carry over
    cleaned_tweets -- DataFrame with a 'Tweet' column holding each tweet's words
    model          -- Word2Vec model passed through to Tweet2Vector
    vocabulary     -- words that have a vector in ``model``

    The input frame is not modified; 'Vector' holds the result of
    Tweet2Vector for every tweet.
    """
    # Copy only the requested columns so the caller's frame stays untouched.
    tweets_modeled = cleaned_tweets[col_names].copy()

    # Tweet2Vector only tests ``word in vocabulary``. On a list that is a
    # linear scan for every word of every tweet, so build a set once here
    # and hand that over instead.
    known_words = set(vocabulary)

    # Create vectors for each tweet
    tweets_modeled['Vector'] = cleaned_tweets['Tweet'].apply(
        lambda tweet: Tweet2Vector(tweet, model, known_words)
    )
    return tweets_modeled

In [10]:
# TRAIN TWEETS
# Tweet2Vector reads the module-level default_vector, so it has to be bound
# to the train model's mean vector before the train tweets are embedded.
default_vector = DefaultVector(model_train, vocabulary_train)
# Keep id, words and label next to each tweet's new 'Vector' column.
train_tweets_modeled = Modeled_Tweets(['TweetID', 'Tweet', 'Sentiment'],
                                     train_tweets_cleaned,
                                     model_train,
                                     vocabulary_train)

# Preview the first rows (notebook cell output).
train_tweets_modeled.head()

Unnamed: 0,TweetID,Tweet,Sentiment,Vector
0,264183816548130816,"[gas, hous, hit, go, chapel, hill, sat, :)]",positive,"[0.5317999795079231, -1.3948887214064598, -0.9..."
1,263405084770172928,"[theo, walcott, still, shit, watch, rafa, john...",negative,"[0.4684139392338693, -1.170335978269577, -0.70..."
2,262163168678248449,"[gsp, fan, hate, nick, diaz, cant, wait, febru...",negative,"[0.4220394731709353, -1.1228334289729296, -0.8..."
3,264249301910310912,"[iranian, general, say, israel, iron, dome, ca...",negative,"[0.4096992301964752, -2.1316297520637213, -1.1..."
4,262682041215234048,"[tehran, mon, amour, obama, tri, establish, ti...",neutral,"[-0.03934148471325797, -2.473725941169202, -0...."


In [11]:
# TEST TWEETS
# Rebind the module-level default_vector to the test model's mean vector;
# Tweet2Vector picks it up from there for out-of-vocabulary words.
# NOTE(review): the test tweets are embedded with model_test, a model loaded
# separately from model_train - confirm that vectors from the two models are
# meant to be comparable in whatever consumes these features.
default_vector = DefaultVector(model_test, vocabulary_test)
# The test frame carries no 'Sentiment' column over, only id and words.
test_tweets_modeled = Modeled_Tweets(['TweetID', 'Tweet'],
                                     test_tweets_cleaned,
                                     model_test,
                                     vocabulary_test)

# Preview the first rows (notebook cell output).
test_tweets_modeled.head()

Unnamed: 0,TweetID,Tweet,Vector
0,801989080477154944,"[ari, ariana, grand, full]","[0.10410167671424628, -0.549238750863187, -0.0..."
1,801989272341453952,"[ariana, grand, kii, fm, truli, cd, listen, pa...","[0.24175302191622494, -1.2834049896243584, -0...."
2,801990978424962944,"[ariana, grand, white, hous, easter, egg, roll...","[0.2684575153855729, -1.078699084150903, -0.08..."
3,801996232553963008,"[ariana, grand, sweet, like, candi, oz, ml, se...","[0.28051567165781977, -1.56956492425248, -0.07..."
4,801998343442407040,"[side, side]","[0.05338717997074127, -0.30022650957107544, -0..."


### .....................................................................................................................................................................................

## ADD MORE FEATURES TO TWEET VECTOR

In [12]:
# Append each tweet's extra features to the end of its embedding vector.
def _append_extra_features(tweets_modeled, extra_features):
    """Extend every row's 'Vector' in place with that row's extra features.

    tweets_modeled -- DataFrame with a 'Vector' column; modified in place
    extra_features -- DataFrame looked up with the same index labels; the
                      values of the matching row are appended to the vector

    Returns None. Running it twice on the same frame appends the features
    twice.
    """
    for index, row in tweets_modeled.iterrows():
        extras = extra_features.loc[index, :].values.tolist()
        # np.append returns a new flat array: embedding first, extras after.
        tweets_modeled.at[index, "Vector"] = np.append(row['Vector'], extras)


# TRAIN DATA
_append_extra_features(train_tweets_modeled, train_extra_features)

# TEST DATA
_append_extra_features(test_tweets_modeled, test_extra_features)

### .....................................................................................................................................................................................

In [13]:
# Save vectorized tweets for later further use
# One pickle per split, written to the working directory; the file name
# records the embedding size (vector_size) the vectors were built with.
train_tweets_modeled.to_pickle("./train_tweets_vectorized_Word2Vec_vs"+str(vector_size)+".pkl")
test_tweets_modeled.to_pickle("./test_tweets_vectorized_Word2Vec_vs"+str(vector_size)+".pkl")