For this file, it will use all data to train model.

Except MLPClassifier change max_iter to 1

In [42]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
import nltk
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

import gensim.downloader as api

from Dataset_Preparation import load_dataset, DATA_PATH

import warnings
import gc

warnings.filterwarnings("ignore")
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

You could adjust param and path of performance.

Besides, `DATA_SIZE` is determined the size of dataset you want to use. Set it to 0 means use all data

In [2]:
PREF_PATH = "performance.txt"
# set 0 to use all data
DATA_SIZE = 0
MNB_PARAM = {'alpha': [0.5, 0.0, 1.0, 10.0]}
DT_PARAM = {'criterion': ['gini', 'entropy'],
            'max_depth': [5, 10],
            'min_samples_split': [2, 4, 6]}
MLP_PARAM = {'hidden_layer_sizes': [(30, 50), (10, 10, 10)],
             'activation': ['logistic', 'tanh', 'relu', 'identity'],
             'solver': ['sgd', 'adam'],
             'max_iter': [1]}

Those are functions from `Words_as_features.py`, some of them differ from original ones for compatibility reasons

In [3]:
# Process the dataset using feature sklearn.extraction.text.CountVectorizer to extract tokens/words
# and their frequencies. Display the number of tokens (the size of the vocabulary) in the dataset
def extract_features(data):
    vectorizer = CountVectorizer()
    words = [item[0] for item in data]
    count_vector = vectorizer.fit_transform(words)
    return vectorizer, count_vector


def to_frequencies(X_train_counts):
    from sklearn.feature_extraction.text import TfidfTransformer
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    return X_train_tfidf


def split_data(data, tokenize=False, tf_idf=False, count_vector=None, size_of_data=DATA_SIZE):
    from nltk.tokenize import word_tokenize
    if tokenize:
        X = [word_tokenize(item[0]) for item in data]
    else:
        if count_vector is not None:
            if tf_idf:
                X = to_frequencies(count_vector)
            else:
                X = count_vector
        else:
            X = [item[0] for item in data]
    y1 = [item[1] for item in data]
    y2 = [item[2] for item in data]
    if size_of_data == 0:
        size_of_data = min([len(y1), len(y2)])
    return train_test_split(X[:size_of_data], y1[:size_of_data], test_size=0.2, random_state=42), \
           train_test_split(X[:size_of_data], y2[:size_of_data], test_size=0.2, random_state=42)


def train_models(model, X_train, y_train, param_grid=None):
    if param_grid is not None:
        model = GridSearchCV(estimator=model, param_grid=param_grid)
    model.fit(X_train, y_train)
    return model


def classification_task(model, X_test, y_test):
    from sklearn.metrics import confusion_matrix, classification_report
    y_pred = model.predict(X_test)
    matrix = confusion_matrix(y_test, y_pred, labels=np.unique(y_pred))
    report = classification_report(y_test, y_pred, labels=np.unique(y_pred))
    return matrix, report


def save_performance(model, X_test, y_test, data_type: str, path=PREF_PATH):
    matrix, report = classification_task(model, X_test, y_test)
    result = data_type + " " + str(model) + ":" + str(model.get_params()) + "\n"
    result += "Confusion Matrix: \n" + str(matrix) + "\n"
    result += "Classification Report: \n" + str(report) + "\n"
    result += "--------------------------------------------\n"
    print(result)
    with open(path, 'a+') as f:
        f.write(result)

Those are functions from `Embeddings_as_Features.py`, some of them differ from original ones for compatibility reasons

In [4]:
def get_mean_vector(word2vec_model,  words):
    # remove out-of-vocabulary words
    words = [word for word in words if word in word2vec_model.wv.vocab.keys()]
    if len(words) >= 1:
        return np.mean(word2vec_model[words], axis=0)
    else:
        return []


def get_post_hit_rate(model, set):
    hits = [1 for i in set if i in model.vocab.keys()]
    return sum(hits)/len(set)


def sent_vectorizer(sent, model):
    sent_vec = []
    numw = 0
    for w in sent:
        try:
            if w not in model.vocab.keys():
              continue
            if numw == 0:
                sent_vec = model.wv[w]
            else:
                sent_vec = np.add(sent_vec, model[w])
            numw += 1
        except:
          pass

    return np.asarray(sent_vec) / numw


def data_vectorizer(sents, model):
    data_vec = []
    for sent in sents:
        data_vec.append(sent_vectorizer(sent, model))
    return data_vec

Here is about Words as Features

In [5]:
def word(tf_idf=False, path=PREF_PATH):

    data = load_dataset(DATA_PATH)
    vectorizer, count_vector = extract_features(data)
    print("The size of the vocabulary is: ", len(vectorizer.vocabulary_))
    # Split the dataset into 80% for training and 20% for testing

    (X_train, X_test, y_train_e, y_test_e), (t1, t2, y_train_s, y_test_s) = split_data(data, tf_idf=tf_idf, count_vector=count_vector)
    # X_train_e, X_test_e, y_train_e, y_test_e = train_test_split(X_train[0:100], emotion[0:100], test_size=0.2,
    #                                                             random_state=0)
    # X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_train[0:100], sentiment[0:100], test_size=0.2,
    #                                                             random_state=0)

    # Train a Multinomial Naive Bayes classifier on the training set

    models = [DecisionTreeClassifier(), MultinomialNB(), MLPClassifier(max_iter=1)]
    # models = [MLPClassifier(max_iter=1)]
    with open(path, "w") as f:
        f.write("")
    for model in models:
        emotion_model = train_models(model, X_train, y_train_e)
        save_performance(emotion_model, X_test, y_test_e, "Emotion",path=path)
        sentiment_model = train_models(model, X_train, y_train_s)
        save_performance(sentiment_model, X_test, y_test_s, "Sentiment",path=path)
        print("Model trained: ", str(model))
        if "MultinomialNB" in str(model):
            emotion_top_model = train_models(model, X_train, y_train_e, MNB_PARAM)
            sentiment_top_model = train_models(model, X_train, y_train_s, MNB_PARAM)
        elif "DecisionTreeClassifier" in str(model):
            emotion_top_model = train_models(model, X_train, y_train_e, DT_PARAM)
            sentiment_top_model = train_models(model, X_train, y_train_s, DT_PARAM)
        elif "MLPClassifier" in str(model):
            emotion_top_model = train_models(model, X_train, y_train_e, MLP_PARAM)
            sentiment_top_model = train_models(model, X_train, y_train_s, MLP_PARAM)
        else:
            print("Model not found")
            break
        save_performance(emotion_top_model, X_test, y_test_e, "Emotion", path=path)
        save_performance(sentiment_top_model, X_test, y_test_s, "Sentiment", path=path)
        print("GridSearchCV trained: ", str(model))

    print("Done!")

In [None]:
word(path=PREF_PATH)
# Use tf-idf instead of word frequencies and redo all substeps of 2.3 above – you can use TfidfTransformer
# for this. Display the results of this experiment

word(tf_idf=True,path = "tf_performance.txt")

The size of the vocabulary is:  30449
Emotion MLPClassifier(max_iter=1):{'activation': 'relu', 'alpha': 0.0001, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_fun': 15000, 'max_iter': 1, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'solver': 'adam', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}
Confusion Matrix: 
[[1077   24    4    3   25    9    0    5   10    2    3    1    1    4
     0   40   30   74  751   19    0    0    3   36]
 [  42  688    7    7    2    0    2    4    4    0    2    1    0    1
     0    7   25   12  381    4    0    0    2    9]
 [  12   12  261   31    1    0    1    3    1    1   13   11    0    0
     3    4    5    2  633    3    0    2    6    4]
 [  28   39  107   61    9    8    4    6    2    5 

In [6]:
def embedding(path=PREF_PATH,size=100,vector=None):
    google_vectors = vector
    print(google_vectors.vector_size)
    datas = load_dataset(DATA_PATH)
    # clean data
    data = []
    for value in datas:
      words = word_tokenize(value[0])
      if len(get_mean_vector(google_vectors, words)) >= 1:
        data.append(value)
    print("total remove:" + str(len(datas) - len(data)))
    # 3.2
    (X_train, X_test, y_train_e, y_test_e), (_, _, y_train_s, y_test_s) = split_data(data, tokenize=True, size_of_data=size)

    print("The size of the tokens is: ", len(X_train))
    # Compute the embedding of a Reddit post as the average of the embeddings of its words. If
    # a word has no embedding in Word2Vec, skip it.
    sentence_embeddings = get_mean_vector(google_vectors, X_train[0])
    average = sum(sentence_embeddings)/len(sentence_embeddings)
    print(average)
    # 3.3
    embeddings_train = []
    for post in X_train:
        embedding = get_mean_vector(google_vectors, post)
        embeddings_train.append(sum(embedding)/len(embedding)if len(embedding)!=0 else 0)

    print(embeddings_train[0:100])


    # 3.4
    # Compute and display the overall hit rates of the training and test sets
    train_hit_rates = []
    for post in X_train[0:100]:
        train_hit_rates.append(get_post_hit_rate(google_vectors, post))
    print("The hit rate of the training set is: ", sum(train_hit_rates)/len(train_hit_rates))
    test_hit_rates = []
    for post in X_test[0:100]:
        test_hit_rates.append(get_post_hit_rate(google_vectors, post))
    print("The hit rate of the test set is: ", sum(test_hit_rates)/len(test_hit_rates))

    # 3.5

    X_train = data_vectorizer(X_train, google_vectors)
    X_test = data_vectorizer(X_test, google_vectors)
    # X_train = embeddings_train
    # X_test = embeddings_test
    # test = 0
    # for value in X_train:
    #   if test == 0:
    #     test = value.shape
    #   elif value.shape != test:
    #     print(value.shape)
    #     break
    with open(path, "w") as f:
        f.write("")
    model = MLPClassifier(max_iter=1)
    base_MLP_e = train_models(model, X_train, y_train_e)
    save_performance(base_MLP_e, X_test, y_test_e, "base_MLP_e", path=path)
    base_MLP_s = train_models(model, X_train, y_train_s)
    save_performance(base_MLP_s, X_test, y_test_s, "base_MLP_s", path=path)
    top_MLP_e = train_models(model, X_train, y_train_e, param_grid=MLP_PARAM)
    save_performance(top_MLP_e, X_test, y_test_e, "top_MLP_e", path=path)
    top_MLP_s = train_models(model, X_train, y_train_s, param_grid=MLP_PARAM)
    save_performance(top_MLP_s, X_test, y_test_s, "top_MLP_s", path=path)
    # print(type(base_MLP_e), type(top_MLP_e), type(base_MLP_s), type(top_MLP_s))
    

In [None]:
google = api.load("word2vec-google-news-300")

In [None]:
embedding(path="embedding.txt", size=0, vector=google)

300
['RemindMe', '!']
removed:RemindMe!
['and', '...', '?']
removed:and...?
['cuteee']
removed:cuteee
['...', 'and', '?']
removed:...and?
['judah', '!', '!']
removed:judah!!
['(', ':', ')']
removed:(:)
['A-fucking-men', '!', ':', ')']
removed:A-fucking-men! :)
['...', 'and', '?']
removed:...and?
['Intp', ':', '(']
removed:Intp:(
['Ahahahahahaha', '!', '!']
removed:Ahahahahahaha!!
['Infacy', 'gosples', '?']
removed:Infacy gosples?
['and', '...', '?']
removed:and...?
['Nope.cool', '!', '🅱️RO']
removed:Nope.cool !🅱️RO
['Ahahahahahaha', '!', '!']
removed:Ahahahahahaha!!
['^She', '^took', '^me', '^by', '^the', '^hand', '...', '^made', '^me', '^a', '^man', '...', '^THAT', '^ONE', '^NIGHT', '!']
removed:^She ^took ^me ^by ^the ^hand... ^made ^me ^a ^man... ^THAT ^ONE ^NIGHT!
['Infacy', 'gosples', '?']
removed:Infacy gosples?
['Chuckled', '.', 'Upvote', '!']
removed:Chuckled. Upvote!
['DUTCHY', '!', '!', '!', '!', '!', '!', '!', '!', '!', '!', '!', '!', '!']
removed:DUTCHY!!!!!!!!!!!!!
['whyyy

Now begin 3.8

base on our result, we select base_MLP_s and base_MLP_e as the best model

we will use conceptnet-numberbatch-17-06-300 and glove-twitter-200 for 3.8

In [43]:
gc.collect() # Try to release the previous model

143

In [None]:
conceptnet = api.load("conceptnet-numberbatch-17-06-300")

In [39]:
def get_mean_vector_conceptnet(word2vec_model,  words):
    # remove out-of-vocabulary words
    words = [('/c/en/'+word) for word in words if ('/c/en/'+word) in word2vec_model.wv.vocab.keys()]
    if len(words) >= 1:
        return np.mean(word2vec_model[words], axis=0)
    else:
        return []


def sent_vectorizer_conceptnet(sent, model):
    sent_vec = []
    numw = 0
    for w in sent:
        try:
            w = '/c/en/'+w
            if numw == 0:
                sent_vec = model.wv[w]
            else:
                sent_vec = np.add(sent_vec, model[w])
            numw += 1
        except:
          pass

    return np.asarray(sent_vec) / numw


def data_vectorizer_conceptnet(sents, model):
    data_vec = []
    for sent in sents:
        data_vec.append(sent_vectorizer_conceptnet(sent, model))
    return data_vec


def get_post_hit_rate__conceptnet(model, set):
    hits = [1 for i in set if ('/c/en/'+i) in model.vocab.keys()]
    return sum(hits)/len(set)

Here to start our test

In [8]:
vector = conceptnet

In [37]:
path = 'conceptnet.txt'
size = 0

print(vector.vector_size)
datas = load_dataset(DATA_PATH)
# clean data
data = []
for value in datas:
    words = word_tokenize(value[0])
    if len(get_mean_vector_conceptnet(vector, words)) >= 1:
        data.append(value)
print("total remove:" + str(len(datas) - len(data)))


300
total remove:3451


In [40]:
# 3.2
(X_train, X_test, y_train_e, y_test_e), (_, _, y_train_s, y_test_s) = split_data(data, tokenize=True, size_of_data=size)

print("The size of the tokens is: ", len(X_train))
# Compute the embedding of a Reddit post as the average of the embeddings of its words. If
# a word has no embedding in Word2Vec, skip it.
sentence_embeddings = get_mean_vector_conceptnet(vector, X_train[0])
average = sum(sentence_embeddings) / len(sentence_embeddings)
print(average)
# 3.3
embeddings_train = []
for post in X_train:
    embedding = get_mean_vector_conceptnet(vector, post)
    embeddings_train.append(sum(embedding) / len(embedding) if len(embedding) != 0 else 0)

print(embeddings_train[0:100])

# 3.4
# Compute and display the overall hit rates of the training and test sets
train_hit_rates = []
for post in X_train[0:100]:
    train_hit_rates.append(get_post_hit_rate__conceptnet(vector, post))
print("The hit rate of the training set is: ", sum(train_hit_rates) / len(train_hit_rates))
test_hit_rates = []
for post in X_test[0:100]:
    test_hit_rates.append(get_post_hit_rate__conceptnet(vector, post))
print("The hit rate of the test set is: ", sum(test_hit_rates) / len(test_hit_rates))

# 3.5

X_train = data_vectorizer_conceptnet(X_train, vector)
X_test = data_vectorizer_conceptnet(X_test, vector)
# X_train = embeddings_train
# X_test = embeddings_test
# test = 0
# for value in X_train:
#   if test == 0:
#     test = value.shape
#   elif value.shape != test:
#     print(value.shape)
#     break
with open(path, "w") as f:
    f.write("")
model = MLPClassifier(max_iter=1)
base_MLP_e = train_models(model, X_train, y_train_e)
save_performance(base_MLP_e, X_test, y_test_e, "base_MLP_e", path=path)
base_MLP_s = train_models(model, X_train, y_train_s)
save_performance(base_MLP_s, X_test, y_test_s, "base_MLP_s", path=path)
# print(type(base_MLP_e), type(top_MLP_e), type(base_MLP_s), type(top_MLP_s))

The size of the tokens is:  134695
0.0027759999786455105
[0.0027759999786455105, 0.004597627392649883, 0.0033557272900543466, 0.0015531779600132722, 0.0044381112747153865, 0.0020619722680082003, 0.00481611127769914, 0.0012065642263405607, 0.004190333309622171, 0.004024298268608012, 0.004791916652272145, 0.004086871575309487, 0.0026020416830821584, 0.0037949258562154377, 0.004440216741713811, 0.0037962664864001757, 0.002431458255353694, 0.006882000059680043, 0.004750749942128702, 0.004724740783591794, 0.001648812396354818, 0.005697533220087887, 0.0034296665106133637, 0.004309933328089149, 0.004193566677116299, 0.0035174919717246666, 0.0030113809492572073, 0.005124638830602634, 0.005638833368041863, 0.002532666773937914, 0.004236703699534701, 0.0005683334742692144, 0.00185791682296743, 0.0015594665213332821, 0.004577950109863498, 0.0034617110943023967, 0.00018124986168307564, 0.0029343332268823965, 0.004297666730417404, 0.004645846212194253, 0.0031110000334835302, 0.0023491904968326103, 

In [44]:
twitter = api.load("glove-twitter-200")



In [45]:
vector = twitter

In [47]:
path = 'twitter.txt'
size = 0

print(vector.vector_size)
datas = load_dataset(DATA_PATH)
# clean data
data = []
for value in datas:
    words = word_tokenize(value[0])
    if len(get_mean_vector(vector, words)) >= 1:
        data.append(value)
print("total remove:" + str(len(datas) - len(data)))

200
total remove:891


In [48]:
# 3.2
(X_train, X_test, y_train_e, y_test_e), (_, _, y_train_s, y_test_s) = split_data(data, tokenize=True, size_of_data=size)

print("The size of the tokens is: ", len(X_train))
# Compute the embedding of a Reddit post as the average of the embeddings of its words. If
# a word has no embedding in Word2Vec, skip it.
sentence_embeddings = get_mean_vector(vector, X_train[0])
average = sum(sentence_embeddings) / len(sentence_embeddings)
print(average)
# 3.3
embeddings_train = []
for post in X_train:
    embedding = get_mean_vector(vector, post)
    embeddings_train.append(sum(embedding) / len(embedding) if len(embedding) != 0 else 0)

print(embeddings_train[0:100])

# 3.4
# Compute and display the overall hit rates of the training and test sets
train_hit_rates = []
for post in X_train[0:100]:
    train_hit_rates.append(get_post_hit_rate(vector, post))
print("The hit rate of the training set is: ", sum(train_hit_rates) / len(train_hit_rates))
test_hit_rates = []
for post in X_test[0:100]:
    test_hit_rates.append(get_post_hit_rate(vector, post))
print("The hit rate of the test set is: ", sum(test_hit_rates) / len(test_hit_rates))

# 3.5

X_train = data_vectorizer(X_train, vector)
X_test = data_vectorizer(X_test, vector)
# X_train = embeddings_train
# X_test = embeddings_test
# test = 0
# for value in X_train:
#   if test == 0:
#     test = value.shape
#   elif value.shape != test:
#     print(value.shape)
#     break
with open(path, "w") as f:
    f.write("")
model = MLPClassifier(max_iter=1)
base_MLP_e = train_models(model, X_train, y_train_e)
save_performance(base_MLP_e, X_test, y_test_e, "base_MLP_e", path=path)
base_MLP_s = train_models(model, X_train, y_train_s)
save_performance(base_MLP_s, X_test, y_test_s, "base_MLP_s", path=path)
# print(type(base_MLP_e), type(top_MLP_e), type(base_MLP_s), type(top_MLP_s))

The size of the tokens is:  136743
-0.011200135269027668
[-0.011200135269027668, -0.0199887799289354, -0.016347183499019594, -0.009659170810336945, -0.027371308133297134, 8.109590504318476e-05, -0.017887009701225906, -0.014627880028565415, -0.025901396111876238, -0.030382947868201882, -0.019449520221096462, -0.013652972058043816, -0.01741443986971717, -0.006451923691201955, -0.014489166623588972, -0.02277974577387795, -0.014183362330659293, -0.03476949436575524, -0.020103628892757117, -0.01152558743255213, -0.009663611768046395, -0.042000598944650844, -0.03115049835469108, -0.019926364669809116, -0.014225139124318957, -0.018480668241390958, -0.014032614911266138, -0.028642209699610247, -0.03616293186852999, -0.019387232013395986, -0.007458210779295768, -0.00347560808993876, -0.014756072229938582, -0.027362000172579427, -0.018786824482958764, -0.02176213920596638, -0.02048862477298826, -0.0168986480939202, -0.021829316731309518, -0.026763648065389133, -0.018884597635769752, -0.040593735