In [None]:
from sklearn import tree
import os # For file reading
import json # For json reading
import re
import itertools

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [None]:
# Display names for the classifiers below. `names` and `classifiers` are
# parallel lists (paired with zip() in classify()), so keep them in the same
# order and of the same length.
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

# Classifier instances compared by classify(). The same instances are re-fitted
# on every call to classify().
classifiers = [
    KNeighborsClassifier(n_neighbors=2),
    SVC(kernel="linear", C=0.005),
    SVC(gamma=2, C=0.005),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=100),
    MLPClassifier(),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()
]


# Class labels: used as the training targets and compared against predictions.
RELEVANT_STRING = 'relevant'
IRRELEVANT_STRING = 'irrelevant'

# All data directories are resolved against the current working directory at
# the time this code runs. The trailing slash matters: code in this file builds
# file paths by plain string concatenation (e.g. `relevant_dir + filename`).

# Labelled training documents.
relevant_dir = os.path.abspath(os.curdir) + '/training/relevant/'
irrelevant_dir = os.path.abspath(os.curdir) + '/training/irrelevant/'

# Labelled held-out documents used to score the classifiers.
test_relevant_dir = os.path.abspath(os.curdir) + '/test/relevant/'
test_irrelevant_dir = os.path.abspath(os.curdir) + '/test/irrelevant/'

# Unlabelled documents classified by classify_whole_data_set().
file_dir = os.path.abspath(os.curdir) + '/files/'

# Candidate features. Each entry is used as a key into a document's
# 'word_counts' mapping (see extract_info()); test_classify_all_features()
# tries every non-empty combination of them.
FEATURES = [
    'privacy',
    'cyber security',
    'egovernment',
    'digital age',
    'digital technologies',
    'cybersecurity',
    'cyber',
    'information security',
    'cybercrime',
]


# Running bests across all classify() calls; classify() updates them through
# `global`. 9999 is a "nothing seen yet" sentinel for the error counts.
highest_accuracy = 0
lowest_false_negatives = 9999
lowest_false_positives = 9999


def extract_info(file_data, features):
    """Turn one parsed document into a feature vector.

    Args:
        file_data: Mapping with a 'word_counts' entry that maps feature names
            to their values.
        features: Feature names to look up, in the order they should appear
            in the result.

    Returns:
        A list with ``file_data['word_counts'][feature]`` for each feature.

    Raises:
        KeyError: If 'word_counts' or one of the requested features is missing.
    """
    vector = []
    for feature in features:
        # Looked up per feature on purpose: with no features requested,
        # file_data is never touched.
        vector.append(file_data['word_counts'][feature])
    return vector



def train(features):
    """Load the labelled training set (no model is fitted here).

    Every regular file in ``relevant_dir``, then in ``irrelevant_dir``, is
    parsed as JSON and reduced to a feature vector with ``extract_info``.

    Args:
        features: Sequence of feature names forwarded to ``extract_info``.

    Returns:
        A pair ``(X, Y)``: ``X`` is the list of feature vectors and ``Y`` the
        parallel list of labels (``RELEVANT_STRING`` or ``IRRELEVANT_STRING``).
    """
    labelled_dirs = (
        (relevant_dir, RELEVANT_STRING),
        (irrelevant_dir, IRRELEVANT_STRING),
    )

    X = []
    Y = []

    for directory, label in labelled_dirs:
        for entry in os.listdir(directory):

            # Sub-directories hold no samples
            if os.path.isdir(os.path.join(directory, entry)):
                continue

            with open(directory + entry, 'r') as handle:
                document = json.loads(handle.read())

                X.append(extract_info(document, features))
                Y.append(label)

    return X, Y


def _load_test_vectors(directory, features):
    """Return the feature vectors of the usable test files in *directory*.

    Sub-directories are ignored, and so is any file whose name also exists in
    one of the training directories.
    """
    vectors = []

    for filename in os.listdir(directory):
        path = os.path.join(directory, filename)

        # Skip directories
        if os.path.isdir(path):
            continue

        # Skip files that are also part of the training set
        if (os.path.exists(os.path.join(relevant_dir, filename))
                or os.path.exists(os.path.join(irrelevant_dir, filename))):
            continue

        with open(path, 'r') as file:
            file_data = json.loads(file.read())

        vectors.append(extract_info(file_data, features))

    return vectors


def classify(features):
    """Fit every classifier on the training set and score it on the test set.

    For each (name, classifier) pair the classifier is fitted on the data
    returned by ``train(features)`` and evaluated on the files in
    ``test_relevant_dir`` and ``test_irrelevant_dir``. Whenever a classifier
    matches or beats the best accuracy seen so far, the result is printed and
    the module-level running bests are updated.

    Args:
        features: Sequence of feature names used to build the feature vectors.

    Raises:
        ValueError: If no test file is left to evaluate.
    """
    global highest_accuracy
    global lowest_false_negatives
    global lowest_false_positives

    #### Classification testing
    X, Y = train(features)

    # The test vectors do not depend on the classifier, so read them once
    # instead of re-parsing every file for each classifier.
    relevant_vectors = _load_test_vectors(test_relevant_dir, features)
    irrelevant_vectors = _load_test_vectors(test_irrelevant_dir, features)

    # Count only the files that are actually evaluated. Using the raw directory
    # listing would score skipped entries (sub-directories, training-set
    # duplicates) as correct predictions and inflate the accuracy.
    total = len(relevant_vectors) + len(irrelevant_vectors)
    if total == 0:
        raise ValueError(
            "No test files to evaluate in {} or {}".format(
                test_relevant_dir, test_irrelevant_dir
            )
        )

    for name, clf in zip(names, classifiers):
        clf = clf.fit(X, Y)

        # A relevant document predicted as anything else is a false negative.
        false_negative = sum(
            1 for vector in relevant_vectors
            if clf.predict([vector])[0] != RELEVANT_STRING
        )
        # An irrelevant document predicted as relevant is a false positive.
        false_positive = sum(
            1 for vector in irrelevant_vectors
            if clf.predict([vector])[0] == RELEVANT_STRING
        )

        accuracy = (total - false_negative - false_positive) / total

        if accuracy >= highest_accuracy:

            highest_accuracy = accuracy

            print("New Equal/High ACC! For {}: Accuracy is {}. {} false negatives occured. {} false positives occured. Features: {}".format(
                name,
                str(accuracy),
                str(false_negative),
                str(false_positive),
                features,
            ))

            # The error-count records are only considered for runs that also
            # match or beat the best accuracy.
            if false_negative < lowest_false_negatives:
                lowest_false_negatives = false_negative
                print("And new lowest false negs: {}".format(false_negative))
            if false_positive < lowest_false_positives:
                lowest_false_positives = false_positive
                print("And new lowest false pos: {}".format(false_positive))
            

                       
def test_classify_all_features():
    """Run ``classify`` on every non-empty combination of ``FEATURES``.

    Combinations are visited by increasing size: all single features first,
    then all pairs, and so on up to the full feature list.
    """
    subset_sizes = range(1, len(FEATURES) + 1)
    every_subset = itertools.chain.from_iterable(
        itertools.combinations(FEATURES, size) for size in subset_sizes
    )

    for subset in every_subset:
        classify(subset)
            

            
def classify_whole_data_set(n=5):
    """Classify every file in ``file_dir`` and print the averaged class counts.

    A fresh ``MLPClassifier`` is fitted ``n`` times on the training set; each
    fitted model classifies all files in ``file_dir`` (sub-directories and
    files that also exist in a training directory are skipped). The relevant
    and irrelevant counts are averaged over the ``n`` runs and printed.

    Args:
        n: Number of independent fit-and-classify runs to average over.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got {!r}".format(n))

    features = ['cybersecurity', 'cybercrime', 'digital technologies', 'information security']

    # The training data and the documents to classify are the same for every
    # run, so load them once. Only the classifier changes between runs.
    X, Y = train(features)

    vectors = []
    for filename in os.listdir(file_dir):
        path = os.path.join(file_dir, filename)

        # Skip directories
        if os.path.isdir(path):
            continue

        # Skip training set files
        if (os.path.exists(os.path.join(relevant_dir, filename))
                or os.path.exists(os.path.join(irrelevant_dir, filename))):
            continue

        with open(path, 'r') as file:
            file_data = json.loads(file.read())

        vectors.append(extract_info(file_data, features))

    relevant = 0
    irrelevant = 0
    for _ in range(n):

        clf = MLPClassifier()
        clf.fit(X, Y)

        for vector in vectors:
            if clf.predict([vector])[0] == RELEVANT_STRING:
                relevant += 1
            else:
                irrelevant += 1

    # Average the per-run totals
    relevant = relevant / n
    irrelevant = irrelevant / n

    print("Classified main set!")
    print("{} relevant".format(relevant))
    print("{} irrelevant".format(irrelevant))

    
### RUN program

# Uncomment to classify the whole data set n times with the Neural Network
# classifier and the hand-picked features, printing the averaged counts.
# classify_whole_data_set(n=1)

# Score every classifier on the test set for every combination of the
# candidate features, printing each new best (or equal-best) accuracy.
test_classify_all_features()