In [14]:
from utils.dataset import DataSet
from utils.generate_test_splits import split
from os import path
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import pylab as py
from scipy.sparse import hstack
from scipy.sparse import coo_matrix
from tqdm import tqdm
from scipy import sparse
import csv, random, numpy, score, os, re, nltk, scipy, gensim
from sklearn.neural_network import MLPClassifier
from sklearn import tree
from langdetect import detect
from sklearn.ensemble import RandomForestClassifier

In [15]:
# Load the stance/body dataset. The "Reading dataset" summary shown under this
# cell is presumably printed by the DataSet constructor (defined in utils.dataset).
dataset = DataSet()
# WordNet lemmatizer instance; none of the cells visible in this part of the
# notebook reference it.
lemmatizer = nltk.WordNetLemmatizer()

Reading dataset
Total stances: 49972
Total bodies: 1683


In [16]:
def get_bodies(data, articles=None):
	"""Look up the article body belonging to every stance record.

	Args:
		data: Iterable of stance records; each record must support
			``record['Body ID']``.
		articles: Optional mapping from body ID to article body. When omitted,
			the notebook-level ``dataset.articles`` is used, which is what the
			function always did before this parameter existed.

	Returns:
		A list with one body per record, in the same order as ``data``.

	Raises:
		Whatever the lookups raise (``KeyError`` for plain dicts) when a record
		has no 'Body ID' or the ID is missing from ``articles``.
	"""
	# Resolve the global only as a fallback so the function can be called (and
	# tested) with an explicit mapping instead of relying on hidden kernel state.
	if articles is None:
		articles = dataset.articles
	return [articles[record['Body ID']] for record in data]

In [17]:
def get_headlines(data):
	"""Collect the headline of every stance record.

	Used for the training, dev and test splits alike.

	Args:
		data: Iterable of stance records; each record must support
			``record['Headline']``. Any iterable works (list, tuple,
			generator), not only sequences that support ``len()`` and integer
			indexing.

	Returns:
		A list of headlines, in the same order as ``data``.
	"""
	return [record['Headline'] for record in data]

In [18]:
def extract_tfidf(training_headlines, training_bodies, dev_headlines, dev_bodies, test_headlines, test_bodies):
	"""Build tf-idf features for headline/body pairs of all three splits.

	Two independent vectorizers are used, one for bodies and one for headlines.
	Both are fitted on the training texts only and then applied unchanged to
	the dev and test texts.

	Args:
		training_headlines, training_bodies: Training texts used to fit the
			vectorizers.
		dev_headlines, dev_bodies: Dev texts, transformed only.
		test_headlines, test_bodies: Test texts, transformed only.

	Returns:
		A tuple ``(training_tfidf, dev_tfidf, test_tfidf)`` of sparse matrices.
		In each one the body tf-idf columns come first, followed by the
		headline tf-idf columns.
	"""
	# Shared settings: unigrams and bigrams, lowercased, English stop words removed.
	vectorizer_options = dict(ngram_range=(1, 2), lowercase=True, stop_words='english')

	# Fit on training text only, bodies first and then headlines.
	body_vec = TfidfVectorizer(**vectorizer_options)
	train_body_matrix = body_vec.fit_transform(training_bodies)

	headline_vec = TfidfVectorizer(**vectorizer_options)
	train_headline_matrix = headline_vec.fit_transform(training_headlines)

	# Dev and test text is projected onto the vocabularies learned above.
	dev_parts = [body_vec.transform(dev_bodies), headline_vec.transform(dev_headlines)]
	test_parts = [body_vec.transform(test_bodies), headline_vec.transform(test_headlines)]

	# Place body and headline features side by side for every data point.
	return (
		scipy.sparse.hstack([train_body_matrix, train_headline_matrix]),
		scipy.sparse.hstack(dev_parts),
		scipy.sparse.hstack(test_parts),
	)

In [19]:
def extract_features(train, dev, test):
	"""Turn the three data splits into tf-idf feature matrices.

	Args:
		train: Training stance records; the tf-idf vectorizers are fitted on
			the headlines and bodies of this split.
		dev: Dev stance records.
		test: Test stance records.

	Returns:
		A tuple ``(training_tfidf, dev_tfidf, test_tfidf)`` as produced by
		``extract_tfidf``.
	"""
	# Use the arguments that were passed in. The previous version ignored them
	# and silently read the notebook globals training_data / dev_data /
	# test_data, so calling it with any other split returned the wrong features.
	training_bodies = get_bodies(train)
	training_headlines = get_headlines(train)
	dev_bodies = get_bodies(dev)
	dev_headlines = get_headlines(dev)
	test_bodies = get_bodies(test)
	test_headlines = get_headlines(test)

	# Extract tfidf vectors
	print("\t-Extracting tfidf vectors..")
	return extract_tfidf(training_headlines, training_bodies, dev_headlines, dev_bodies, test_headlines, test_bodies)

## Loading Data

In [20]:
# Partition the dataset with the project's split helper; the result is indexed
# below by the keys 'training', 'dev' and 'test'.
data_splits = split(dataset)

In [21]:
# Pull the three splits out of data_splits under the names the later cells use.
training_data, dev_data, test_data = (
	data_splits['training'],
	data_splits['dev'],
	data_splits['test'],
)

In [22]:
# Fraction of the training split to keep; 1.0 uses every training example.
TRAIN_FRACTION = 1.0

# Slice from the untouched split in data_splits rather than from training_data
# itself, so re-running this cell with a fraction below 1.0 does not keep
# shrinking the training set.
N = int(len(data_splits['training']) * TRAIN_FRACTION)
training_data = data_splits['training'][:N]
print("\t-Training size:\t", len(training_data))
print("\t-Dev size:\t", len(dev_data))
print("\t-Test data:\t", len(test_data))

	-Training size:	 40106
	-Dev size:	 4835
	-Test data:	 5031


## Extracting Features

In [23]:
# Build the tf-idf feature matrices for all three splits; extract_features
# prints the progress line shown below this cell.
training_features, dev_features, test_features = extract_features(training_data, dev_data, test_data)

	-Extracting tfidf vectors..


## Random Forest Classifier

In [24]:
# Gold stance labels for each split, in the same order as the feature rows.
targets_tr = [sample['Stance'] for sample in training_data]
targets_dev = [sample['Stance'] for sample in dev_data]
targets_test = [sample['Stance'] for sample in test_data]

# Random forest with 10 trees and a fixed seed. The name `lr` is kept because
# the prediction cell refers to it, even though this is not a logistic regression.
lr = RandomForestClassifier(n_estimators=10, random_state=12345)
lr.fit(training_features, targets_tr)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=12345,
                       verbose=0, warm_start=False)

## Prediction

In [25]:
# Predict a stance label for every test example with the fitted classifier
# (`lr` is the random forest trained in the cell above).
y_pred = lr.predict(test_features)

## Evaluation Metrics

In [26]:
# Compare the gold test labels with the predictions. report_score comes from
# the project's `score` module; it presumably prints the confusion matrix and
# the score line shown below this cell. TODO confirm against score.py.
print("Confusion matrix")
score.report_score(targets_test, y_pred)

Confusion matrix
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    95     |     1     |    100    |    128    |
-------------------------------------------------------------
| disagree  |    20     |    21     |    25     |    23     |
-------------------------------------------------------------
|  discuss  |    68     |    12     |    652    |    265    |
-------------------------------------------------------------
| unrelated |    267    |     4     |    539    |   2811    |
-------------------------------------------------------------
Score: 1527.25 out of 2315.25	(65.96479861785984%)
