In [27]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.dummy import DummyClassifier
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report
# Load the pre-processed training and validation splits from JSON.
data_train = pd.read_json(path_or_buf="processed_training.json")
data_val = pd.read_json(path_or_buf="processed_validation.json")

In [3]:
# Requires gensim (pip install gensim).
# Build one TaggedDocument per training row: the 'posts' column supplies the
# words and the 'type' column supplies the single tag.
# Iterating the two columns directly avoids DataFrame.iterrows(), which builds
# a Series per row that the original loop never used, and the per-row .at
# lookups.
taggedDocuments_train = [
    TaggedDocument(words=post_words, tags=[post_type])
    for post_words, post_type in zip(data_train['posts'], data_train['type'])
]

In [5]:
# Build one TaggedDocument per validation row: the 'posts' column supplies the
# words and the 'type' column supplies the single tag.
# Iterating the two columns directly avoids DataFrame.iterrows(), which builds
# a Series per row that the original loop never used, and the per-row .at
# lookups.
taggedDocuments_val = [
    TaggedDocument(words=post_words, tags=[post_type])
    for post_words, post_type in zip(data_val['posts'], data_val['type'])
]

In [7]:
# Create the Doc2Vec model (50-dimensional vectors, min_count of 2,
# 40 epochs), build its vocabulary from the training documents, then train.
model = Doc2Vec(epochs=40, min_count=2, vector_size=50)
model.build_vocab(taggedDocuments_train)

model.train(
    taggedDocuments_train,
    epochs=model.epochs,
    total_examples=model.corpus_count,
)

DONE


In [8]:
# Save the trained Doc2Vec model to disk as 'vector_model.doc2vec'.
model.save('vector_model.doc2vec')

In [10]:
def data_splitter(model, taggedDocuments, max_docs=45865):
    """
    Infer a vector for each tagged document and pair it with the document's label.

    Parameters
    ----------
    model : object exposing ``infer_vector(words)`` (e.g. a trained Doc2Vec model)
    taggedDocuments : sequence of documents with ``.words`` and ``.tags`` attributes
    max_docs : int or None, default 45865
        Only the first ``max_docs`` documents are processed; pass ``None`` to
        process every document. The default keeps the cap that was previously
        hard-coded (presumably the training-set size -- TODO confirm).

    Returns
    -------
    (tags, posts) : tuple of two tuples
        ``tags[i]`` is the first tag of document ``i`` and ``posts[i]`` is its
        inferred vector. Both are empty tuples when there are no documents
        (the previous ``zip(*...)`` unpacking raised ValueError in that case).
    """
    docs = taggedDocuments if max_docs is None else taggedDocuments[:max_docs]
    tags = tuple(doc.tags[0] for doc in docs)
    # Vectors are inferred in document order, one call per document.
    posts = tuple(model.infer_vector(doc.words) for doc in docs)
    return tags, posts

In [11]:
# Turn the tagged documents into (labels, inferred vectors) pairs,
# first for the training split, then for the validation split.
type_train, posts_train = data_splitter(model=model, taggedDocuments=taggedDocuments_train)
type_val, posts_val = data_splitter(model=model, taggedDocuments=taggedDocuments_val)

In [24]:
# Fit a logistic-regression classifier on the inferred training vectors.
# TODO Should we include hyperparam C?
# The 'sag' solver shuffles the data, so seed it for repeatable fits; the
# DummyClassifier baseline in this notebook already uses random_state=0.

logreg = LogisticRegression(n_jobs=-1, solver='sag', random_state=0)
logreg.fit(posts_train, type_train)

LogisticRegression(n_jobs=-1, solver='sag')

In [26]:
# Validate the (logreg) model on the validation vectors.
# The previous bare `logreg.score()` call raised TypeError because score()
# requires X and y, so the rest of the cell never ran on a fresh kernel.
type_pred = logreg.predict(posts_val)
print(classification_report(type_val, type_pred))
# Labels are strings, so an explicit `average` is required (the default
# 'binary' mode expects pos_label=1).
print('Testing F1 score: {}'.format(f1_score(type_val, type_pred, average='weighted')))

              precision    recall  f1-score   support

           E       0.86      0.86      0.86      3438
           I       0.86      0.86      0.86      3418

    accuracy                           0.86      6856
   macro avg       0.86      0.86      0.86      6856
weighted avg       0.86      0.86      0.86      6856

Testing F1 score: 0.8573514194308273


In [None]:
# Load the test split.
# NB: ONLY USE AFTER THE BEST HYPERPARAMETER HAS BEEN CHOSEN.
data_test = pd.read_json(path_or_buf="processed_test.json")

In [None]:
# Retrain the Doc2Vec model on the training + validation documents.
# A new combined list is built instead of extending taggedDocuments_train in
# place, so re-running this cell cannot append the validation set twice.
taggedDocuments_trainval = taggedDocuments_train + taggedDocuments_val
# total_examples has to match the corpus actually passed in; model.corpus_count
# still holds the count recorded by build_vocab() on the training set alone.
# NOTE(review): the vocabulary is not rebuilt here, so words that occur only in
# the validation documents are presumably ignored -- confirm this is acceptable.
model.train(
    taggedDocuments_trainval,
    total_examples=len(taggedDocuments_trainval),
    epochs=model.epochs,
)

In [None]:
# Build one TaggedDocument per test row: the 'posts' column supplies the
# words and the 'type' column supplies the single tag.
# Iterating the two columns directly avoids DataFrame.iterrows(), which builds
# a Series per row that the original loop never used, and the per-row .at
# lookups.
taggedDocuments_test = [
    TaggedDocument(words=post_words, tags=[post_type])
    for post_words, post_type in zip(data_test['posts'], data_test['type'])
]

# Infer a vector for every test document and collect the matching labels.
type_test, posts_test = data_splitter(model, taggedDocuments_test)

In [None]:
# Refit the logistic-regression classifier on the combined training and
# validation vectors.
# NOTE(review): when the cells run top to bottom, posts_train / posts_val were
# inferred before the Doc2Vec model was retrained and posts_test after it --
# confirm that mixing the two is intended.
features_trainval = posts_train + posts_val
labels_trainval = type_train + type_val
logreg.fit(features_trainval, labels_trainval)

# Evaluate the refitted classifier on the test vectors.
type_pred = logreg.predict(posts_test)
print(classification_report(y_true=type_test, y_pred=type_pred))

In [38]:
# Baseline: a seeded classifier that guesses labels uniformly at random,
# used as a reference point for the logistic-regression results.
dummy = DummyClassifier(random_state=0, strategy='uniform')
dummy.fit(X=posts_train + posts_val, y=type_train + type_val)

type_pred = dummy.predict(posts_test)
print(classification_report(y_true=type_test, y_pred=type_pred))

0.505513981882631
              precision    recall  f1-score   support

           E       0.50      0.50      0.50      2488
           I       0.52      0.51      0.51      2590

    accuracy                           0.51      5078
   macro avg       0.51      0.51      0.51      5078
weighted avg       0.51      0.51      0.51      5078

