# 1. Opening and preprocessing of the Stanford data

In [90]:
def read_examples(infile):
    """ Reads a .examples file and returns a list of Example instances """
    stream = open(infile) # open the file
    liste = []
    while 1:
        texte = stream.read()  # read the file
        if not texte:
            break
       # line = texte.startwith("-1") or texte.startwith("1") 
        texte = texte[0:-1]
        
        exemple = texte.split('\n')  # sseparate each line 
        for ex in exemple:           # iteration on each line
            if ex.startswith("-1"):  # condition to give the gold class (here -1)
                gold_class = "-1"
                review = ex[3:]
            elif ex.startswith("1"):   # condition to give the gold class (here 1)
                gold_class = "1"
                review = ex[2:]          
            dico = {gold_class: review}   # creation of a dictionnary for each review
            liste.append(dico)       # append each dictionnary to the liste

    return liste

In [91]:
stanford_dev = read_examples("../data/English/evaluations/sentiment analysis/stanford_raw_dev.txt")
print(stanford_dev)

[{'1': "It 's a lovely film with lovely performances by Buy and Accorsi ."}, {'1': "And if you 're not nearly moved to tears by a couple of scenes , you 've got ice water in your veins ."}, {'1': 'A warm , funny , engaging film .'}, {'1': 'Uses sharp humor and insight into human nature to examine class conflict , adolescent yearning , the roots of friendship and sexual identity .'}, {'1': 'Entertains by providing good , lively company .'}, {'1': "Dazzles with its fully-written characters , its determined stylishness ( which always relates to characters and story ) and Johnny Dankworth 's best soundtrack in years ."}, {'1': 'Visually imaginative , thematically instructive and thoroughly delightful , it takes us on a roller-coaster ride from innocence to experience without even a hint of that typical kiddie-flick sentimentality .'}, {'1': "Nothing 's at stake , just a twisty double-cross you can smell a mile away -- still , the derivative Nine Queens is lots of fun ."}, {'1': 'Unlike the

In [92]:
stanford_test = read_examples("../data/English/evaluations/sentiment analysis/stanford_raw_test.txt")
print(stanford_test)



In [93]:
stanford_train = read_examples("../data/English/evaluations/sentiment analysis/stanford_raw_train.txt")
print(stanford_train)



In [210]:
from sklearn.linear_model import Perceptron
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
import numpy as np

from sklearn.neural_network import MLPClassifier

In [95]:
reviews_train = [list(data.values())[0] for data in stanford_train] #list of movie reviews in the train set
gold_train = [int(list(data.keys())[0]) for data in stanford_train] #list of corresponding gold class in the train set
reviews_test = [list(data.values())[0] for data in stanford_test] #list of movie reviews in the test set
gold_test = [int(list(data.keys())[0]) for data in stanford_test] #list of corresponding gold class in the test set
reviews_dev = [list(data.values())[0] for data in stanford_dev] #list of movie reviews in the dev set
gold_dev = [int(list(data.keys())[0]) for data in stanford_dev] #list of corresponding gold class in the dev set

In [96]:
print(len(reviews_train))
print(len(gold_train))
print(len(reviews_test))
print(len(gold_test))
print(len(reviews_dev))
print(len(gold_dev))

6920
6920
1821
1821
872
872


# 2. Dowloading of word embeddings

In [211]:
def downloaded_vectors(file_vectors):
    #load pre-trained word embeddings
    word_embeddings = {}
    #open file where the embeddings are stored
    with open(file_vectors, 'r') as file:
        #for every line in the file
        for line in file:
            #split by whitespace
            values = line.split()
            #first item in the vector represents a word
            word = values[0]
            #the remaining of the values represent the numerical representation of the word
            #conversion of the vector values from a list to a NumPy array
            vector = np.array(values[1:]).astype(np.float64)
            #word is added as key and the corresponding numerical representation (vector) to the word_embeddings dictionary
            word_embeddings[word] = vector

    #print vector associated to 'talk'
    #print(word_embeddings['cégep'])
    return word_embeddings

In [188]:
print(np.array(values[1:]))

['-0.0246' '-0.0041' '0.0566' '-0.0248' '0.0293' '0.0559' '-0.1027'
 '0.0659' '0.0354' '0.0350' '-0.0103' '0.0134' '-0.0251' '0.0288'
 '-0.0713' '0.0521' '0.0106' '0.0616' '-0.0308' '0.0967' '-0.0931'
 '-0.0693' '-0.0551' '-0.0319' '0.0637' '-0.0569' '0.0412' '-0.0227'
 '-0.0063' '-0.0437' '-0.0157' '0.1220' '0.0348' '-0.0554' '0.0262'
 '-0.1458' '-0.0544' '0.0061' '-0.1053' '0.0085' '0.0319' '0.0054'
 '0.0197' '0.0102' '-0.0024' '0.0711' '0.0373' '-0.0105' '0.0103'
 '-0.0966' '0.0292' '-0.0262' '0.0028' '-0.0826' '-0.0026' '0.0154'
 '0.1174' '0.0367' '-0.0536' '-0.0374' '-0.0302' '-0.0213' '-0.0562'
 '0.0120' '0.0321' '0.0823' '-0.0640' '0.0157' '-0.0918' '-0.0434'
 '0.1430' '0.0333' '0.0434' '0.0283' '0.0066' '-0.0609' '-0.0086'
 '-0.0585' '0.0354' '0.0936' '0.0423' '0.0559' '-0.0422' '-0.1124'
 '-0.0525' '0.0543' '-0.0116' '-0.0578' '0.0696' '-0.1348' '0.0136'
 '-0.0673' '-0.0015' '-0.0052' '-0.0940' '0.0006' '-0.0412' '-0.0673'
 '-0.0455' '-0.0463' '0.0053' '-0.0715' '0.0550' '-0.0

# 2.1 Using pre-trained word embeddings

In [256]:
pre_trained_word_embeddings = downloaded_vectors("../data/English/wordEmbeddings/vectors_datatxt_250_sg_w10_i5_c500_gensim_clean")

In [214]:
pre_trained_word_embeddings 

{'125776': array([250.]),
 ',': array([ 2.45136e-01, -4.02530e-02,  1.42044e-01, -2.11430e-02,
        -4.95580e-02, -1.08356e-01, -8.67740e-02,  2.87230e-02,
         2.59862e-01, -2.39028e-01,  3.52390e-02, -1.84145e-01,
         3.40270e-02, -5.92840e-02, -1.35190e-01,  3.69910e-02,
         2.12820e-02,  3.61977e-01,  1.46804e-01,  2.22982e-01,
        -6.66460e-02, -3.66901e-01, -2.33584e-01,  1.38813e-01,
         2.30892e-01, -6.25850e-02, -2.22701e-01,  3.13545e-01,
        -2.56864e-01,  1.80580e-02, -1.86710e-01, -4.63920e-02,
         2.93384e-01, -1.24467e-01,  9.27140e-02,  7.02960e-02,
         2.53593e-01, -9.87760e-02, -9.51080e-02,  1.36413e-01,
         1.58367e-01, -2.70100e-03,  2.05611e-01, -1.28678e-01,
        -6.65170e-02,  4.02990e-02, -1.99219e-01, -1.74021e-01,
        -1.35269e-01, -1.68121e-01, -1.10021e-01, -1.08802e-01,
         2.18593e-01,  9.65700e-03, -9.67790e-02,  1.29937e-01,
        -1.69920e-02,  2.50951e-01,  2.06890e-02,  2.78278e-01,
         

# 2.2 Using retrofitted word embeddings

In [255]:
retrofitted_word_embeddings = downloaded_vectors("../data/English/output_vectors/output_vectors.txt")

# 3. Creation of vectors for the film reviews

## 3.1 Creating these review vectors with the average of the word vectors

In [222]:
#vectorises a single movie review using embeddings
def vectorize_review_average(review, word_embeddings):
    #split review into individual tokens
    tokens = review.split() 
    #retrieve vector for each token
    #if token isn't found, return a zero vector (same shape as the word embeddings)
    vectors = [word_embeddings.get(token, np.zeros_like(word_embeddings["word"])) for token in tokens]
    #print('ici')
    #print(vectors)
    #print('la')
    #mat = np.vstack(vectors)
    #average of word embeddings into a single vector corresponding to a review
    #print(mat)
    combined_vector = np.mean(vectors,axis=0)
    #print('lla')
    #print(combined_vector)
    #return vectorial representation for the given review
    return combined_vector

In [223]:
#vectorises movie reviews given an input file
def vec_reviews_average(file, word_embeddings):
    #list of vector reviews
    vectorized_reviews = []
    #for every review in file
    for review in file:
        #vectorise the review with the word_embeddings
        review_vector = vectorize_review_average(review, word_embeddings)
        #append the vector review to list
        vectorized_reviews.append(review_vector)
    #return list of vector reviews
    return vectorized_reviews

### 3.1.1 Pre- trained vectors

In [262]:
X_train_pre_trained_vectors_average = vec_reviews_average(reviews_train, pre_trained_word_embeddings)
X_test_pre_trained_vectors_average = vec_reviews_average(reviews_test,pre_trained_word_embeddings)
X_dev_pre_trained_vectors_average = vec_reviews_average(reviews_dev, pre_trained_word_embeddings)

### 3.1.2 Retrofitted vectors

In [261]:
X_train_retrofitted_vectors_average = vec_reviews_average(reviews_train, retrofitted_word_embeddings)
X_test_retrofitted_vectors_average = vec_reviews_average(reviews_test,retrofitted_word_embeddings)
X_dev_retrofitted_vectors_average = vec_reviews_average(reviews_dev, retrofitted_word_embeddings)

## 4. Classifier results
* Perceptron

### 4.1 Accuracy of sentiment analysis task with pre-trained vectors (before retrofitting)

In [263]:
perceptron = Perceptron()
perceptron.fit(X_train_pre_trained_vectors_average, Y_train)
prediction = perceptron.predict(X_test_pre_trained_vectors_average)
# accuracy
accuracy = accuracy_score(Y_test, prediction)
print("accuracy on test set (before retrofitting):", round(accuracy, 3), "(",round(accuracy*100,2),"%)")

accuracy on test set (before retrofitting): 0.768 ( 76.77 %)


* Multilayer Perceptron

In [264]:
mlp = MLPClassifier()
mlp.fit(X_train_pre_trained_vectors_average, Y_train)
y_pred = mlp.predict(X_test_pre_trained_vectors_average)
# accuracy
accuracy = accuracy_score(Y_test, y_pred)
print("accuracy on test set (before retrofitting):", round(accuracy, 3), "(",round(accuracy*100,2),"%)")

accuracy on test set (before retrofitting): 0.78 ( 77.98 %)




### 4.1 Accuracy of sentiment analysis task with retrofitted vectors (after retrofitting)
* Perceptron

In [267]:
perceptron = Perceptron()
perceptron.fit(X_train_retrofitted_vectors_average, Y_train)
prediction = perceptron.predict(X_test_retrofitted_vectors_average)
# accuracy
accuracy = accuracy_score(Y_test, prediction)
print("accuracy on test set (after retrofitting):", round(accuracy, 3), "(",round(accuracy*100,2),"%)")

accuracy on test set (after retrofitting): 0.719 ( 71.94 %)


* Multilayer Perceptron

In [268]:
mlp = MLPClassifier()
mlp.fit(X_train_retrofitted_vectors_average, Y_train)
y_pred = mlp.predict(X_test_retrofitted_vectors_average)
# accuracy
accuracy = accuracy_score(Y_test, y_pred)
print("accuracy on test set (after retrofitting):", round(accuracy, 3), "(",round(accuracy*100,2),"%)")

accuracy on test set (after retrofitting): 0.753 ( 75.34 %)


