# Project outline

1. Baseline model: word embeddings and SGD, as provided by the course

2. Word embeddings from a downloaded Twitter embedding library

3. Using an LSTM on top of the word embeddings

In [1]:
from project2_data.project_text_classification.cooc import main
import numpy as np
from our_functions import *
import matplotlib.pyplot as plt
import pickle

%load_ext autoreload
%autoreload 2
%matplotlib inline

# Part 1: Baseline model: word embeddings and sgd with glove

### Load the training tweets and the built GloVe word embeddings. 

Follow the instructions in the README.md of the project on GitHub.
(To do so, put train_pos.txt and train_neg.txt in the main directory, or adapt cooc.py and build_vocab.sh.)
Note that the downloaded files are named like 'train_neg.txt', whereas the course-provided functions expect 'neg_train.txt'. I changed this, but you might also need to adapt the file names, e.g. when using the full training set.

build_vocab.sh  --> outputs vocab.txt <br>
cut_vocab.sh --> outputs vocab_cut.txt <br>

python pickle_vocab.py --> outputs **vocab.pkl**  <br>
Dictionary mapping each word to its index: {key=word: value=index}   <br>
Its length is nb_words

python cooc.py --> outputs cooc.pkl <br>
Co-occurrence matrix

glove_solution.py  -->  outputs **embeddings.npy** <br>
Embedding of each word  <br>
shape is (nb_words, embedding_dimension + 1)

### Construct a feature representation of each training tweet (by averaging the word vectors over all words of the tweet)

In [2]:
def read_embeddings_vecs(embeddings, vocabulary):
    """
    Load trained word embeddings and build the word/index lookup tables.

    Arguments:
    embeddings -- path to the .npy file holding the embedding matrix; row `idx` is the
                  vector of the word whose vocabulary value is `idx`
    vocabulary -- path to the pickled vocabulary, a dict {word: row index}

    Returns:
    words_to_index -- dict mapping each word to its rank in the alphabetically sorted
                      vocabulary; ranks start at 1
    index_to_words -- inverse mapping of words_to_index
    word_to_vec_map -- dict mapping each word to its embedding vector (a row of the matrix)
    """
    # NOTE(review): pickle.load can execute arbitrary code; only load vocabulary files
    # that were produced locally (e.g. by pickle_vocab.py).
    with open(vocabulary, 'rb') as voc:
        vocab = pickle.load(voc)

    # Load the matrix from the path given by the caller. The path used to be hard-coded
    # to 'embeddings.npy', which silently ignored the `embeddings` argument.
    words_embeddings = np.load(embeddings)

    # Each vocabulary entry points at its own row of the embedding matrix.
    word_to_vec_map = {word: words_embeddings[idx, :] for word, idx in vocab.items()}

    # Dictionary keys are unique, so sorting them yields every word exactly once.
    # Indices deliberately start at 1, as in the original implementation.
    words_to_index = {}
    index_to_words = {}
    for i, w in enumerate(sorted(vocab), start=1):
        words_to_index[w] = i
        index_to_words[i] = w

    return words_to_index, index_to_words, word_to_vec_map

In [3]:
# Alternative kept for reference: read vectors from 'data/glove.6B.50d.txt' with read_glove_vecs
# (that helper is not defined in this part of the notebook).
#word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')
# Build the lookup tables from the embeddings file and the pickled vocabulary named below.
word_to_index, index_to_word, word_to_vec_map = read_embeddings_vecs('embeddings.npy', 'vocab.pkl')

In [4]:
# len(word_to_vec_map) = 21261
# len(vocab) = 21161

### Train a Linear Classifier: (e.g. logistic regression or SVM) on constructed features (scikit or our code)

#### The model is:
$$ z^{(i)} = W \cdot avg^{(i)} + b$$
$$ a^{(i)} = \mathrm{softmax}(z^{(i)})$$
$$ \mathcal{L}^{(i)} = - \sum_{k = 0}^{n_y - 1} Y_{oh,k}^{(i)} \, \log(a^{(i)}_k)$$

In [5]:
def read_data(data):
    """
    Read a text file that contains one tweet per line.

    Arguments:
    data -- path to the text file

    Returns:
    tweets -- list of strings, one per line of the file, without the line terminator
    """
    with open(data, "r") as file:
        # Strip only the line terminator so the tweet text itself is left untouched.
        # Iterating line by line also keeps the last tweet when the file does not end
        # with a newline (the previous split-and-drop-last approach lost it) and
        # avoids building one huge string by repeated concatenation.
        tweets = [line.rstrip('\n') for line in file]
    return tweets

In [6]:
# Load the positive and the negative training tweets as lists of strings (one tweet per line).
X_pos = read_data("./datasets/train_pos.txt")
X_neg = read_data("./datasets/train_neg.txt")

In [7]:
# Full training set: all positive tweets followed by all negative tweets.
X_train = X_pos + X_neg

In [8]:
# Integer class labels: 1 for every positive tweet, 0 for every negative tweet
# (0 rather than -1, so the labels can be used directly as class indices).
Y_pos = np.ones(len(X_pos), dtype=int)
Y_neg = np.zeros(len(X_neg), dtype=int)
# Positives first, then negatives.
Y_train = np.concatenate([Y_pos, Y_neg])

In [9]:
from sklearn.utils import shuffle
# Shuffle tweets and labels together; the fixed random_state makes the order reproducible.
X_train_shuffled, Y_train_shuffled = shuffle(X_train, Y_train, random_state=52)

In [10]:
def one_hot(Y):
    """
    One-hot encode labels given as +1 / -1.

    Arguments:
    Y -- array-like of labels; it is flattened, so shapes (m,) and (m, 1) both work

    Returns:
    Y_hot -- numpy array of shape (m, 2): label 1 becomes [1, 0] and label -1 becomes
             [0, 1]. Any other label value (e.g. 0) produces an all-zero row.
    """
    # Accept lists as well as arrays; comparing a plain list with `==` would yield a
    # single boolean instead of an element-wise mask.
    Y = np.asarray(Y).reshape(-1)
    # Start from zeros rather than np.empty so rows that match neither label are
    # well defined instead of containing uninitialized memory.
    Y_hot = np.zeros([len(Y), 2])
    Y_hot[Y == 1] = [1, 0]
    Y_hot[Y == -1] = [0, 1]
    return Y_hot

In [11]:
def convert_to_one_hot(Y, C):
    """
    One-hot encode integer class indices.

    Arguments:
    Y -- numpy array of integer class indices (any shape; it is flattened)
    C -- number of classes

    Returns:
    numpy array of shape (Y.size, C) whose i-th row is row Y[i] of the C x C identity matrix
    """
    identity = np.eye(C)
    flat_labels = Y.reshape(-1)
    # Picking rows of the identity matrix by label yields the one-hot vectors.
    return identity[flat_labels]

In [12]:
def model(X, Y, word_to_vec_map, learning_rate = 0.01, num_iterations = 400):
    """
    Train a softmax classifier on averaged word vectors with stochastic gradient descent.

    Arguments:
    X -- input data, indexable sequence of m sentences (strings)
    Y -- labels, numpy array of m integer class indices in {0, 1}; a label of -1 would
         be wrapped to the last class by numpy's negative indexing in convert_to_one_hot
    word_to_vec_map -- dictionary mapping every word in the vocabulary to its embedding vector
    learning_rate -- learning rate for the stochastic gradient descent algorithm
    num_iterations -- number of passes over the whole training set

    Returns:
    W -- weight matrix of the softmax layer, of shape (n_y, n_h)
    b -- bias of the softmax layer, of shape (n_y,)
    """

    np.random.seed(32)                      # fixed seed so the initial weights are reproducible

    m = len(Y)                              # number of training examples
    n_y = 2                                 # number of classes

    # Dimension of the embedding vectors: read it from the map instead of hard-coding it,
    # so embeddings of any size work. 20 (the previous fixed value) is the fallback
    # when the map is empty.
    n_h = 20
    if word_to_vec_map:
        n_h = np.asarray(next(iter(word_to_vec_map.values()))).size

    # Initialize the weights with random normal values scaled by 1/sqrt(n_h), biases with zeros
    W = np.random.randn(n_y, n_h) / np.sqrt(n_h)
    b = np.zeros((n_y,))

    # Convert Y to its one-hot representation with n_y classes
    Y_oh = convert_to_one_hot(Y, n_y)

    # Optimization loop
    for t in range(num_iterations):                       # Loop over the number of iterations
        for i in range(m):                                # Loop over the training examples

            # Average the word vectors of the words from the i'th training example
            avg = sentence_to_avg(X[i], word_to_vec_map)

            # Forward propagate the avg through the softmax layer
            z = W @ avg + b
            a = softmax(z)

            # Cross-entropy cost of the i'th example against its one-hot label
            cost = - np.sum(Y_oh[i]*np.log(a))

            # Gradients of the cost with respect to z, W and b
            dz = a - Y_oh[i]
            dW = np.dot(dz.reshape(n_y,1), avg.reshape(1, n_h))
            db = dz

            # Update parameters with Stochastic Gradient Descent
            W = W - learning_rate * dW
            b = b - learning_rate * db

        if t % 10 == 0:
            # NOTE: `cost` is the cost of the last training example of this pass only,
            # not an average over the pass.
            print("Epoch: " + str(t) + " --- cost = " + str(cost))
            # predict comes from our_functions (not shown here); the recorded output shows
            # an "Accuracy: ..." line after each call. Its return value is not needed.
            predict(X, Y, W, b, word_to_vec_map)
    return W, b

In [16]:
# Train the baseline softmax classifier on the shuffled training set (30 passes over the data).
W, b = model(X_train_shuffled, Y_train_shuffled, word_to_vec_map, learning_rate = 0.005, num_iterations = 30)

Epoch: 0 --- cost = 0.7186076235364628
Accuracy: 0.611075
Epoch: 10 --- cost = 0.7192028050243952
Accuracy: 0.611085
Epoch: 20 --- cost = 0.719202805024395
Accuracy: 0.611085


### Prediction: Predict labels for all tweets in the test set.

In [30]:
def get_test_data(data):
    """
    Parse a test file whose lines have the form "<id>,<tweet text>".

    Arguments:
    data -- path to the test file

    Returns:
    ids -- list with the text before the first comma of every line
    X_test -- list with the text after the first comma of every line (the line
              terminator is kept); empty string when the line has no comma
    """
    ids, X_test = [], []
    with open(data, "r") as file:
        for line in file:
            # Split at the first comma only: the tweet itself may contain commas.
            tweet_id, _, text = line.partition(',')
            ids.append(tweet_id)
            X_test.append(text)
    return ids, X_test

In [31]:
# Read the tweet ids and tweet texts of the test set.
ids, X_test = get_test_data("test_data.txt")

In [32]:
# Sanity check: display one parsed test tweet (the id prefix has been removed).
X_test[5]

"<user> he needs to get rid of that thing ! it scares me lol but he don't need a car either . he needs drivers ed again .\n"

In [33]:
# Evaluate the trained parameters on the training set itself.
print("Training set:")
pred_train = predict(X_train_shuffled, Y_train_shuffled, W, b, word_to_vec_map)

Training set:
Accuracy: 0.611085


In [47]:
# Predict the class of every test tweet, then replace class 0 by -1
# (presumably the label the submission format expects for negative tweets -- TODO confirm).
y_pred = predict_test(X_test, W, b, word_to_vec_map)
y_pred[y_pred == 0] = -1

### Submission / Evaluation: Submit your predictions to kaggle
Your submission file for the 10’000 tweets must be of the form: tweet-id, prediction

In [48]:
import csv
def create_csv_submission(ids, y_pred, name):
    """
    Function taken from helpers of project 1
    Creates an output file in csv format for submission to kaggle
    Arguments: ids (event ids associated with each prediction; each must be convertible to int)
               y_pred (predicted class labels; each must be convertible to int)
               name (string name of .csv output file to be created)
    The file has the header "Id,Prediction" followed by one row per (id, prediction) pair.
    Missing parent directories of `name` are created.
    """
    import os  # local import: only needed to create the output directory

    # Create the target directory (e.g. 'submissions/') if it does not exist yet,
    # instead of failing with FileNotFoundError.
    out_dir = os.path.dirname(name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # newline='' is required by the csv module; without it every row is followed by
    # a blank line on platforms that translate '\n' (e.g. Windows).
    with open(name, 'w', newline='') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        # zip stops at the shorter of the two sequences.
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({'Id':int(r1),'Prediction':int(r2)})


In [49]:
# Write the test-set predictions to the submission file for this baseline model.
create_csv_submission(ids, y_pred, 'submissions/submission_model_1.csv')