In [1]:
import nltk, os, glob
import pandas as pd
from normalization import normalize_corpus, tokenize_text
import numpy as np
import codecs
from sklearn.datasets.base import Bunch
from sklearn.cross_validation import train_test_split
from feature_extractors import bow_extractor, tfidf_extractor
from feature_extractors import averaged_word_vectorizer
from feature_extractors import tfidf_weighted_averaged_word_vectorizer
import nltk
import gensim
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
import re, json
import warnings
# Suppress every warning for the rest of the session; no category or module
# is given, so library deprecation warnings are hidden as well.
warnings.filterwarnings('ignore')



In [2]:
# Project layout, expressed relative to the directory the notebook runs from.
rootDir = '../..'
dataDir = os.path.join(rootDir, 'data')
rawDir = os.path.join(dataDir, 'raw_data')

# Fail fast if either expected directory is missing.
assert os.path.exists(dataDir)
assert os.path.exists(rawDir)

# Column of the targets CSV that get_data() exposes as the label.
TARGET_FIELD = 'extrovert'

In [3]:
def get_data(whichData='train'):
    """Load the raw documents and the target column for one data split.

    Reads ``<whichData>Data.csv`` and ``<whichData>Targets.csv`` from
    ``dataDir`` (first column used as the index), then reads every file
    listed in the ``raw_text_file`` column from ``rawDir`` as UTF-8 text.
    Both CSV files are required, even when the caller only needs ``.data``.

    Args:
        whichData: Prefix of the CSV file names, e.g. ``'train'`` or ``'test'``.

    Returns:
        A ``Bunch`` with two attributes:
        ``data``   - NumPy array holding one raw document string per row of
                     the data CSV, in row order.
        ``target`` - the ``TARGET_FIELD`` column of the targets CSV.
    """
    records = pd.read_csv(os.path.join(dataDir, '%sData.csv' % whichData), index_col=0)
    targets = pd.read_csv(os.path.join(dataDir, '%sTargets.csv' % whichData), index_col=0)

    # Collect into a list and convert once: np.append copies the whole array
    # on every call, which is quadratic in the number of documents.
    texts = []
    for raw_file in records['raw_text_file']:
        # The context manager closes each handle deterministically.
        with open(os.path.join(rawDir, raw_file), encoding='utf-8') as handle:
            texts.append(handle.read())

    dataset = Bunch()
    dataset.data = np.array(texts)
    dataset.target = targets[TARGET_FIELD]
    return dataset


def prepare_datasets(corpus, labels, test_data_proportion=0.3):
    """Split documents and labels into a training part and a held-out part.

    The split is delegated to ``train_test_split`` with a fixed
    ``random_state`` of 42, so repeated calls on the same input give the
    same partition.

    Args:
        corpus: Sequence of documents.
        labels: Sequence of labels aligned with ``corpus``.
        test_data_proportion: Passed through as ``test_size``.

    Returns:
        Tuple ``(train corpus, test corpus, train labels, test labels)``.
    """
    split_options = {'test_size': test_data_proportion, 'random_state': 42}
    corpus_train, corpus_test, labels_train, labels_test = train_test_split(
        corpus, labels, **split_options)
    return corpus_train, corpus_test, labels_train, labels_test

def remove_empty_docs(corpus, labels):
    """Drop documents that are empty or whitespace-only, with their labels.

    Documents and labels are paired positionally with ``zip``; a pair is kept
    when ``doc.strip()`` is non-empty.

    Returns:
        Tuple ``(filtered_corpus, filtered_labels)`` of two plain lists in
        the original order.
    """
    kept_pairs = [(text, tag) for text, tag in zip(corpus, labels) if text.strip()]
    filtered_corpus = [text for text, _ in kept_pairs]
    filtered_labels = [tag for _, tag in kept_pairs]
    return filtered_corpus, filtered_labels

def get_metrics(true_labels, predicted_labels):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Precision, recall and F1 use ``average='weighted'``; every score is
    rounded to two decimals before printing. Nothing is returned.

    Args:
        true_labels: Ground-truth labels.
        predicted_labels: Labels predicted by a model, aligned with
            ``true_labels``.
    """
    weighted = {'average': 'weighted'}
    report = (
        ('Accuracy:', metrics.accuracy_score, {}),
        ('Precision:', metrics.precision_score, weighted),
        ('Recall:', metrics.recall_score, weighted),
        ('F1 Score:', metrics.f1_score, weighted),
    )
    for caption, scorer, options in report:
        value = scorer(true_labels, predicted_labels, **options)
        print(caption, np.round(value, 2))
    

def train_model(classifier, train_features, train_labels):
    """Fit ``classifier`` on the given features and labels.

    The estimator is fitted in place through its ``fit`` method. The object
    returned is the one that was passed in, not whatever ``fit`` returns.
    """
    fit = classifier.fit
    fit(train_features, train_labels)
    return classifier
    

def train_predict_evaluate_model(classifier, 
                                 train_features, train_labels, 
                                 test_features, test_labels):
    """Fit a classifier, predict on the test features and print the metrics.

    The classifier is fitted in place, so the first element of the returned
    tuple is the same object the caller passed in. Metrics are printed via
    ``get_metrics`` using ``test_labels`` as the ground truth.

    Returns:
        Tuple ``(classifier, predictions)``.
    """
    classifier.fit(train_features, train_labels)
    predicted = classifier.predict(test_features)
    get_metrics(test_labels, predicted)
    return classifier, predicted


## Model space exploration on trainData

In [6]:
# Progress banner for the exploration stage.
print('exploring model space on train data...')

exploring model space on train data...


In [7]:
# Load the raw documents and the TARGET_FIELD labels of the 'train' split.
# `dataset`, `corpus` and `labels` are rebound again in the cell that
# retrains on the full training data.
dataset = get_data(whichData='train')
corpus, labels = dataset.data, dataset.target
# len(corpus)
# print(len(labels))
# print(corpus[0][:30])

In [8]:
# Drop empty / whitespace-only documents together with their labels.
# Both names are rebound to the plain lists remove_empty_docs returns.
corpus, labels = remove_empty_docs(corpus, labels)
# print(corpus[0][:30])

In [9]:
print('split the training data into train and test corpus...')
# Hold out 30% of the *training* data for model comparison. The `test_*`
# names here are that held-out part, not the contents of testData.csv.
# prepare_datasets uses a fixed random_state, so the split is repeatable.
train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(corpus,labels,test_data_proportion=0.3)
# print(train_corpus[0][:30])

split the training data into train and test corpus...


In [10]:
# normalize the corpus
# normalize_corpus comes from the project's `normalization` module; what it
# does to the text is not visible from this notebook.
# NOTE: `norm_test_corpus` is reassigned later, in the cell that loads the
# real test data.
print('normalizing the corpus...')
norm_train_corpus = normalize_corpus(train_corpus)
norm_test_corpus = normalize_corpus(test_corpus)
# print(norm_train_corpus[0][:30])

normalizing the corpus...


In [11]:
# bag of words features
# bow_extractor returns a (vectorizer, training feature matrix) pair; the
# vectorizer is presumably fitted on the training split only - confirm in
# feature_extractors. The held-out split is transformed with that same
# vectorizer.
print('BOW feature extraction...')
bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)  
bow_test_features = bow_vectorizer.transform(norm_test_corpus) 

BOW feature extraction...


In [12]:
# tfidf features
# Same pattern as the bag-of-words cell: tfidf_extractor returns a
# (vectorizer, training feature matrix) pair and the held-out split is
# transformed with the returned vectorizer.
print('TFIDF feature extraction...')
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)  
tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus) 

TFIDF feature extraction...


In [13]:
# tokenize documents
# One token list per normalized document, for the training and held-out splits.
tokenized_train = list(map(nltk.word_tokenize, norm_train_corpus))
tokenized_test = list(map(nltk.word_tokenize, norm_test_corpus))

In [14]:
# build word2vec model
# Trained on the tokenized training split only.
# NOTE(review): size=500 is repeated as num_features=500 in the two
# word-vector feature cells below; presumably the values must stay equal.
print('building word2vec model...')
model = gensim.models.Word2Vec(tokenized_train, size=500, window=100, min_count=30, sample=1e-3) 

building word2vec model...


In [15]:
print('averaged word vector features...')
# averaged word vector features
# Built for both splits from the same word2vec `model`; num_features repeats
# the `size` passed to Word2Vec when the model was built.
avg_wv_train_features = averaged_word_vectorizer(corpus=tokenized_train,
                                                 model=model,
                                                 num_features=500)                   
avg_wv_test_features = averaged_word_vectorizer(corpus=tokenized_test,
                                                model=model,
                                                num_features=500)   

averaged word vector features...


In [16]:
print('tfidf weighted averaged word vector features...')
# tfidf weighted averaged word vector features
# `vocab` is the vocabulary_ of the tfidf vectorizer from the TFIDF cell.
# Both splits use it with their own tfidf matrix and the shared word2vec
# `model`.
vocab = tfidf_vectorizer.vocabulary_
tfidf_wv_train_features = tfidf_weighted_averaged_word_vectorizer(corpus=tokenized_train, 
                                                                  tfidf_vectors=tfidf_train_features, 
                                                                  tfidf_vocabulary=vocab, 
                                                                  model=model, 
                                                                  num_features=500)
tfidf_wv_test_features = tfidf_weighted_averaged_word_vectorizer(corpus=tokenized_test, 
                                                                 tfidf_vectors=tfidf_test_features, 
                                                                 tfidf_vocabulary=vocab, 
                                                                 model=model, 
                                                                 num_features=500)

tfidf weighted averaged word vector features...


In [17]:
print('initializing two kinds of classifiers: MNB and SVM for trying various types of features...')
# NOTE: despite the banner text, the scores printed below are computed on the
# 30% held-out split (the *_test_features / test_labels), not on the data the
# models were fitted on.
print('performance on taining data:')

# Hyperparameters shared by every SVM in this notebook.
SVM_PARAMS = dict(loss='hinge', n_iter=100, random_state=42)

# `mnb` and `svm` stay available to later cells as unfitted estimators.
mnb = MultinomialNB()
svm = SGDClassifier(**SVM_PARAMS)

# One row per experiment: (result key, classifier, train features, held-out
# features). Every experiment gets its OWN estimator instance. Refitting one
# shared instance would leave all the `*_model` names below pointing at the
# same object, holding only the last fit.
experiments = [
    # Multinomial Naive Bayes with bag of words features
    ('mnb_bow', MultinomialNB(), bow_train_features, bow_test_features),
    # Support Vector Machine with bag of words features
    ('svm_bow', SGDClassifier(**SVM_PARAMS), bow_train_features, bow_test_features),
    # Multinomial Naive Bayes with tfidf features
    ('mnb_tfidf', MultinomialNB(), tfidf_train_features, tfidf_test_features),
    # Support Vector Machine with tfidf features
    ('svm_tfidf', SGDClassifier(**SVM_PARAMS), tfidf_train_features, tfidf_test_features),
    # Support Vector Machine with averaged word vector features
    ('svm_avgwv', SGDClassifier(**SVM_PARAMS), avg_wv_train_features, avg_wv_test_features),
    # Support Vector Machine with tfidf weighted averaged word vector features
    ('svm_tfidfwv', SGDClassifier(**SVM_PARAMS), tfidf_wv_train_features, tfidf_wv_test_features),
]

exploration_results = {}
for experiment_no, (experiment, estimator, fit_features, eval_features) in enumerate(experiments):
    if experiment_no:
        print()  # blank line between consecutive metric reports
    exploration_results[experiment] = train_predict_evaluate_model(classifier=estimator,
                                           train_features=fit_features,
                                           train_labels=train_labels,
                                           test_features=eval_features,
                                           test_labels=test_labels)

# Expose each result under the names the rest of the notebook uses.
(mnb_bow_model, mnb_bow_predictions) = exploration_results['mnb_bow']
(svm_bow_model, svm_bow_predictions) = exploration_results['svm_bow']
(mnb_tfidf_model, mnb_tfidf_predictions) = exploration_results['mnb_tfidf']
(svm_tfidf_model, svm_tfidf_predictions) = exploration_results['svm_tfidf']
(svm_avgwv_model, svm_avgwv_predictions) = exploration_results['svm_avgwv']
(svm_tfidfwv_model, svm_tfidfwv_predictions) = exploration_results['svm_tfidfwv']

initializing two kinds of classifiers: MNB and SVM for trying various types of features...
performance on taining data:
Accuracy: 0.46
Precision: 0.76
Recall: 0.46
F1 Score: 0.32

Accuracy: 0.46
Precision: 0.76
Recall: 0.46
F1 Score: 0.32

Accuracy: 0.43
Precision: 0.18
Recall: 0.43
F1 Score: 0.26

Accuracy: 0.54
Precision: 0.68
Recall: 0.54
F1 Score: 0.49

Accuracy: 0.57
Precision: 0.33
Recall: 0.57
F1 Score: 0.42

Accuracy: 0.69
Precision: 0.69
Recall: 0.69
F1 Score: 0.67


In [18]:
# confusion matrix
# Computed for the SVM + bag-of-words predictions on the held-out split.
# `cm` is not displayed: the print below is commented out.
cm = metrics.confusion_matrix(test_labels, svm_bow_predictions)
# print(pd.DataFrame(cm, index=range(0,2), columns=range(0,2)))

#### We select svm_bow_model after exploration

## Train svm_bow_model on full trainData

In [19]:
# NOTE(review): in the output recorded under the exploration cell, the second
# report (SVM + bag of words) shows accuracy 0.46 and the last one (SVM +
# tfidf-weighted word vectors) shows 0.69. That does not match "performed the
# best" - confirm the model selection.
print('Selected SVM_BOW model as it performed the best. Training that model on full train data...')

Selected SVM_BOW model as it performed the best. Training that model on full train data...


In [20]:
# train the model on the whole training corpus
# Reloads and re-filters the train split, so `dataset`, `corpus` and `labels`
# are rebound. `bow_vectorizer` is rebound too, to the vectorizer
# bow_extractor returns for the full normalized corpus.
# This reuses the `svm` estimator object created in the exploration cell:
# train_model fits it in place and returns that same object, so
# `svm_bow_model` and `svm` refer to one estimator afterwards.
dataset = get_data(whichData='train')
corpus, labels = dataset.data, dataset.target
corpus, labels = remove_empty_docs(corpus, labels)
norm_corpus = normalize_corpus(corpus)
bow_vectorizer, bow_features = bow_extractor(norm_corpus)  
svm_bow_model = train_model(classifier=svm,train_features=bow_features, train_labels=labels)

## Submit predictions on testData

In [21]:
# Progress banner; the message itself warns that the following cells need
# testData.csv.
print('making predictions on test data...(will throw exception on redacted dataset as testData.csv is unavailable)')

making predictions on test data...(will throw exception on redacted dataset as testData.csv is unavailable)


In [22]:
# Read the test-split table; further down only its index is used, to label
# the rows of the predictions frame.
testDF = pd.read_csv(os.path.join(dataDir, 'testData.csv'), index_col=0)
# print(testDF.shape)
# print(testDF.head())

In [23]:
# Load and normalize the real test documents.
# get_data also reads '<split>Targets.csv' unconditionally, so this cell
# needs testTargets.csv in dataDir even though only `.data` is used here.
# Unlike the training path, empty documents are not filtered out.
# `norm_test_corpus` rebinds the name used earlier for the held-out split.
testdataset = get_data(whichData='test')
testcorpus = testdataset.data
norm_test_corpus = normalize_corpus(testcorpus)

In [24]:
# filter words in test corpus that are not contained in the training vocabulary
# Each normalized test document is tokenized, reduced to the tokens present
# in bow_vectorizer.vocabulary_, and re-joined with single spaces.
filtered_corpus = [
    ' '.join(token for token in tokenize_text(text)
             if token in bow_vectorizer.vocabulary_)
    for text in norm_test_corpus
]
# print(len(filtered_corpus))

In [25]:
# Vectorize the filtered test documents (rebinds `bow_test_features`, which
# earlier held the held-out split) and predict with the retrained model.
bow_test_features = bow_vectorizer.transform(filtered_corpus) 
# Rows are labelled with testDF's index and the single column is named after
# TARGET_FIELD.
df = pd.DataFrame(svm_bow_model.predict(bow_test_features), index=testDF.index)
df.columns = [TARGET_FIELD]
# Written to the current working directory. The ground truth read later has
# the same file name but lives in dataDir.
df.to_csv('testTargets.csv')

## Compute performance on testData

In [26]:
# Progress banner; the message itself warns that the following cells need
# the ground-truth testTargets.csv.
print('computing performance on test data...(will throw exception on redacted dataset as testTargets.csv is unavailable)')

computing performance on test data...(will throw exception on redacted dataset as testTargets.csv is unavailable)


In [27]:
# Predictions: the file written to the working directory by the previous
# section. Ground truth: the file of the same name inside dataDir.
y_predict = pd.read_csv('testTargets.csv', index_col=0)
y_truth = pd.read_csv(os.path.join(dataDir,'testTargets.csv'), index_col=0)

In [28]:
# print(y_predict.head())
# print(y_truth.head())

In [29]:
# Weighted-average F1 on the test data, rounded to 4 decimals. This is the
# value the last cell stores in performance.json.
score = np.round(metrics.f1_score(y_truth, y_predict, average='weighted'), 4)
# print(score)

In [30]:
print('performance on test data...')
# Use the shared reporting helper rather than a copy of its four print
# statements. It prints the same Accuracy / Precision / Recall / F1 Score
# lines (weighted averages, two decimals).
get_metrics(true_labels=y_truth, predicted_labels=y_predict)

performance on test data...
Accuracy: 0.62
Precision: 0.62
Recall: 0.62
F1 Score: 0.62


In [31]:
# save the score
# Build the payload under its own name instead of rebinding `score`.
# Overwriting the float with a dict made the cell non-idempotent: running it
# a second time wrapped the dict in another dict. float() turns the NumPy
# scalar into a plain Python float for JSON.
performance = {'f1': float(score)}
# write to score file
with open('performance.json', 'w') as outfile:
    json.dump(performance, outfile)