In [None]:
from sklearn import tree
import os # For file reading
import json # For json reading
import re

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [None]:
# Each display name is paired with the estimator it labels. ``names`` and
# ``classifiers`` are derived from this single table so the two lists stay
# index-aligned when they are zipped together further down.
_named_classifiers = [
    ("Nearest Neighbors", KNeighborsClassifier(3)),
    ("Linear SVM", SVC(kernel="linear", C=0.025)),
    ("RBF SVM", SVC(gamma=2, C=1)),
    ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0))),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    ("Random Forest",
     RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Neural Net", MLPClassifier(alpha=1)),
    ("AdaBoost", AdaBoostClassifier()),
    ("Naive Bayes", GaussianNB()),
    ("QDA", QuadraticDiscriminantAnalysis()),
]

names = [pair[0] for pair in _named_classifiers]
classifiers = [pair[1] for pair in _named_classifiers]


# Class labels stored in Y and compared against classifier predictions.
RELEVANT_STRING = 'relevant'
IRRELEVANT_STRING = 'not relevant'

# Training data: X collects one feature row per document, Y the matching label.
X, Y = [], []

# All data directories live under the current working directory. Every path
# keeps its trailing slash because file names are appended to it directly.
_cwd = os.path.abspath(os.curdir)

relevant_dir = _cwd + '/training/relevant/'
irrelevant_dir = _cwd + '/training/irrelevant/'

test_relevant_dir = _cwd + '/test/relevant/'
test_irrelevant_dir = _cwd + '/test/irrelevant/'

# Same location as test_irrelevant_dir.
file_dir = _cwd + '/test/irrelevant/'



def extract_info(file_data):
    """Build the feature vector for one parsed document.

    Args:
        file_data: Parsed JSON document. It must provide a ``'word_counts'``
            mapping that contains a ``'cybersecurity'`` entry.

    Returns:
        A one-element list holding the ``'cybersecurity'`` word count.
    """
    word_counts = file_data['word_counts']

    # Only the 'cybersecurity' count is used as a feature for now. The
    # 'privacy' count and the four-digit year from 'document_date' were other
    # candidates and are currently switched off.
    features = [word_counts['cybersecurity']]
    return features

#### Training

# Each training directory is paired with the label its documents receive.
# Relevant documents are loaded first, then the irrelevant ones.
for training_dir, label in ((relevant_dir, RELEVANT_STRING),
                            (irrelevant_dir, IRRELEVANT_STRING)):

    for entry in os.listdir(training_dir):

        # Sub-directories are not documents
        if os.path.isdir(os.path.join(training_dir, entry)):
            continue

        # Parse the JSON document and record its features with the label
        with open(training_dir + entry, 'r') as handle:
            document = json.load(handle)

            X.append(extract_info(document))
            Y.append(label)
            

        
#### Classification testing

def _load_test_features(test_dir):
    """Return one feature vector per evaluated file in ``test_dir``.

    Sub-directories are ignored. A file is also ignored when a file of the
    same name exists in one of the training directories, presumably so that
    documents the classifiers were fitted on do not take part in scoring.

    Args:
        test_dir: Directory holding the JSON test documents of one class.

    Returns:
        A list with the ``extract_info`` result of every evaluated file.
    """
    features = []

    for filename in os.listdir(test_dir):
        path = os.path.join(test_dir, filename)

        # Skip directories
        if os.path.isdir(path):
            continue

        # Skip files that are also part of the training set
        if (os.path.exists(os.path.join(relevant_dir, filename))
                or os.path.exists(os.path.join(irrelevant_dir, filename))):
            continue

        with open(path, 'r') as file:
            features.append(extract_info(json.load(file)))

    return features


# The test features do not depend on the classifier, so the files are read
# and parsed once instead of once per classifier.
relevant_features = _load_test_features(test_relevant_dir)
irrelevant_features = _load_test_features(test_irrelevant_dir)

# Accuracy is measured over the files that were actually evaluated. Counting
# raw directory entries would treat skipped entries as correct predictions.
evaluated = len(relevant_features) + len(irrelevant_features)

for name, clf in zip(names, classifiers):
    clf = clf.fit(X, Y)

    # A relevant document predicted as anything else is a false negative.
    # predict() is only called on non-empty batches because scikit-learn
    # rejects an input without samples.
    false_negative = 0
    if relevant_features:
        false_negative = sum(
            1 for predicted in clf.predict(relevant_features)
            if not predicted == RELEVANT_STRING
        )

    # An irrelevant document predicted as relevant is a false positive.
    false_positive = 0
    if irrelevant_features:
        false_positive = sum(
            1 for predicted in clf.predict(irrelevant_features)
            if predicted == RELEVANT_STRING
        )

    # With nothing to evaluate there is no meaningful accuracy; report NaN
    # rather than dividing by zero.
    if evaluated:
        accuracy = (evaluated - false_negative - false_positive) / evaluated
    else:
        accuracy = float('nan')

    print("For {}: Accuracy is {}. {} false negatives occured. {} false positives occured.".format(
        name,
        str(accuracy),
        str(false_negative),
        str(false_positive),
    ))
    
