In [4]:
import warnings
# Hide deprecation and future-change notices for the rest of the session so
# they do not clutter the output of the cells that follow.
warnings.simplefilter("ignore", DeprecationWarning)
warnings.simplefilter("ignore", FutureWarning)

In [5]:
from IPython.core.interactiveshell import InteractiveShell
# Ask IPython to display the value of every top-level expression in a cell,
# not only the last one (the IPython default for this option is "last_expr").
InteractiveShell.ast_node_interactivity = "all"

In [6]:
# General imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [7]:
    def __init__(self, name, X, y, features, samplingModel=None):
        """Record one dataset and the metadata that travels with it.

        name          -- label identifying the dataset.
        X             -- feature values (load_csv_data passes the normalized
                         feature array here).
        y             -- class values (load_csv_data passes a flattened 1-D
                         array here).
        features      -- labels of the feature columns held in X.
        samplingModel -- optional sampler associated with this dataset;
                         left as None when no sampler is supplied.

        The ``results`` attribute always starts out as None; presumably it
        is filled in later by code outside this method.
        """
        # Core data first, kept in the same attribute order as the arguments.
        self.name, self.X, self.y, self.features = name, X, y, features
        # Optional sampler plus an empty slot for results.
        self.samplingModel, self.results = samplingModel, None

In [9]:
class ModelClass:
    """Pair a display name with the model object it labels."""

    def __init__(self, name, actualModel):
        """Store both arguments unchanged on the instance.

        name        -- label used to identify the model.
        actualModel -- the model object itself (presumably a classifier
                       instance; this class does not inspect it).
        """
        self.name, self.actualModel = name, actualModel

In [10]:
# Names of the metric types produced by the classification models used in this
# project. The order of this list is the order in which the metrics are kept.
g_orderResults = [
    'Accuracy',
    'Precision',
    'Recall',
    'F_Measure',
    'Cross_Validation',
]

# Number of metric types collected. Functions use it to iterate over the
# columns of numpy arrays that hold the metrics named in g_orderResults.
g_numResults = len(g_orderResults)

# List that will hold the classification models used in this project.
g_modelClasses = list()

# Train/test splits that will be performed on the datasets used in this project.
# A 0.2 split puts 20% of the dataset in the test set and 80% in the training set.
g_splits = np.array((0.2, 0.4, 0.6, 0.8))

# Sampler models used to balance the datasets, one independent list per family:
# under-sampling models, over-sampling models, and models that combine under-
# and over-sampling.
g_underSampleModels, g_overSampleModels, g_comboSampleModels = [], [], []

In [11]:
# Pre: csv_file_path is the path of a readable CSV file whose first row holds the
#      column names and whose columns are laid out as (k1, f1, f2, ..., fn, c).
#      ignoreKey tells whether the first column (k1) is a key rather than a feature.
# Post: 1.) The CSV is read from disk into a pandas DataFrame.
#       2.) When ignoreKey is true the first column (k1) is left out of the features;
#           otherwise it is treated as a feature like the others.
#       3.) Every feature column (all columns except the last) is min-max scaled to
#           the range [0, 1] and stored in the X array. A column whose values are
#           all equal is scaled to 0 instead of dividing by zero.
#       4.) The last column is flattened into the 1-D y array (representing the class).
#       5.) The dataset name is csv_file_path with its file extension removed.
#       6.) A DatasetClass object is returned, representing the dataset.
# @returns: An instance of a DatasetClass object.
def load_csv_data(csv_file_path, ignoreKey):
    import os  # local import: only needed to strip the file extension below

    # read the data into a pandas data structure
    data = pd.read_csv(csv_file_path)

    # skip the first column (i.e. the key) if the ignoreKey flag is true
    startIndex = 1 if ignoreKey else 0

    # rip out the X values (everything except the last column, which is y)
    rawX = data.iloc[:, startIndex:-1].values

    # min-max normalize the X data, column by column
    minX = np.min(rawX, axis=0)
    rangeX = np.max(rawX, axis=0) - minX
    # A constant column has a range of 0; dividing by it would fill the column
    # with NaN. Using 1 as the divisor maps such a column to 0 instead.
    rangeX[rangeX == 0] = 1
    X = (rawX - minX) / rangeX

    # get the class as a single dimension matching the number of observations
    y = np.ravel(data.iloc[:, -1:].values)

    # drop the extension (".csv") from the path; the rest is the dataset name.
    # splitext only removes a real extension, so a path without ".csv" is not
    # truncated by four arbitrary characters.
    NameNoCSV = os.path.splitext(csv_file_path)[0]

    # stash everything in our custom DatasetClass
    dsc = DatasetClass(NameNoCSV, X, y, data.columns[startIndex:-1])

    return dsc