In [16]:
import json
import os
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import FeatureHasher
DATA_FOLDER = '../reuters-21578-json/data/full/'

In [2]:
def get_json_content_of_files_in_folder(path):
    json_files_collection = []
    json_documents = []
    for file in os.listdir(path):
        json_files_collection.append(json.load(open(path + file)))
        
    for json_files in json_files_collection:
        for json_file in json_files:
            json_documents.append(json_file)
        
    return json_documents

In [3]:
def filter_documents(documents):
    filtered_documents = []
    for document in documents:
        if 'topics' in document and 'body' in document:
            filtered_documents.append(document)
            
    return filtered_documents

In [35]:
def create_bag_of_words(documents):
    vectorizer = CountVectorizer(lowercase=True)
    document_list = []
    for document in documents:
        document_list.append(document['body'])
        
    return vectorizer.fit_transform(document_list).toarray()

In [31]:
def create_feature_hashing(documents):
    feature_hasher = FeatureHasher(n_features=1000, input_type='string')
    document_list = []
    for document in documents:
        document_list.append(document['body'])
        
    return feature_hasher.transform(document_list).toarray()

In [11]:
def get_y_from_documents(documents):
    y = []
    for document in documents:
        if 'earn' in document['topics']:
            y.append(True)
        else:
            y.append(False)
        
    return y

In [12]:
def devide_to_train_and_test_set(X, y, proc_train):
    proc_test = 100 - proc_train
    num_train = round(len(X) / 100 * proc_train)
    num_test = round(len(X) / 100 * proc_test)
    
    return (X[:num_train], y[:num_train], X[num_train:], y[num_train:])

## Preparing the documents

In [19]:
documents = filter_documents( get_json_content_of_files_in_folder(DATA_FOLDER) )

### Running the random forest classifier with bag of words

In [37]:
X_bag_of_words = create_bag_of_words(documents)
y_bag_of_words = get_y_from_documents(documents)
randomForestClassifier = RandomForestClassifier(n_estimators=50)
trainX, trainy, testX, testy = devide_to_train_and_test_set(X_bag_of_words, y_bag_of_words, 80)
randomForestClassifier.fit(trainX, trainy)
print('Score for bag of words: ', randomForestClassifier.score(testX, testy))

Score for bag of words:  0.956626506024


### Running the random forest classifier with feature hashing using 1000 buckets

In [38]:
X_feature_hashing = create_feature_hashing(documents)
y_feature_hashing = get_y_from_documents(documents)
randomForestClassifier = RandomForestClassifier(n_estimators=50)
trainX, trainy, testX, testy = devide_to_train_and_test_set(X_feature_hashing, y_feature_hashing, 80)
randomForestClassifier.fit(trainX, trainy)
print('Score for bag of words: ', randomForestClassifier.score(testX, testy))

Score for bag of words:  0.923373493976
