In [1]:
# Pre-Processing imports

import pandas as pd
import numpy as np
import glob
import os
import re
import seaborn as sns
import matplotlib.pyplot as plt
from ipywidgets import interact
%matplotlib inline

In [2]:
# ML imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Options

# np.set_printoptions(threshold=np.inf)

In [4]:
# Functions

# Train new model
def train_new_model(path, model):
    """Train ``model`` on every CSV found one folder level below ``path``.

    Each ``<path>/<subfolder>/<name>.csv`` file is read into a DataFrame and
    tagged with a class label extracted from its file path by regex (subject
    to change depending on the file naming convention). The frames are
    concatenated, the string labels are encoded as integers, the ``y`` and
    ``arrival_time`` columns are dropped from the features, and the rows are
    split 80% / 20% into training and testing sets with ``random_state=42``.
    The model is fitted on the training part and its test score is printed.

    Parameters
    ----------
    path : str or os.PathLike
        Directory whose immediate subfolders contain the CSV files.
    model : estimator
        Unfitted classifier exposing ``fit`` and ``score``, e.g.
        ``DecisionTreeClassifier()``, ``RandomForestClassifier()`` or ``SVC()``.

    Returns
    -------
    The fitted estimator (the value returned by ``model.fit``), so it can be
    reused for prediction after training.

    Raises
    ------
    ValueError
        If no CSV file matches ``<path>/*/*.csv``.
    """
    # Will grab any subfolders from path and their csv files.
    pattern = os.path.join(path, "*", "*.csv")
    # glob returns matches in arbitrary order; sorting keeps the row order
    # (and therefore the seeded train/test split) stable between runs.
    all_files = sorted(glob.glob(pattern))
    if not all_files:
        # Fail early with an actionable message instead of pandas'
        # "No objects to concatenate" further down.
        raise ValueError(f"No CSV files found matching {pattern!r}")

    # Matches specifics from the filename using regex
    # (subject to change depending on file naming convention).
    label_regex = re.compile(r'CORRECT|WRONG|[a-zA-Z]+(?=_[0-9]+_data)|(?<=IR_[0-9]_)[a-zA-Z]+')

    # Holder list: one labelled dataframe per csv file
    li = []
    for filename in all_files:
        # Reads individual csv files
        df = pd.read_csv(filename, index_col=None, header=0)
        # Adds target column for classification (all regex matches joined together)
        df['y'] = ''.join(label_regex.findall(filename))
        li.append(df)

    # Concats all data into one dataframe for training/testing
    frame = pd.concat(li, axis=0, ignore_index=True)

    # Target column is y; changes target from string to numeric
    y = LabelEncoder().fit_transform(frame['y'].to_numpy())
    # Everything except the label and 'arrival_time' is used as a feature
    X = frame.drop(['y', 'arrival_time'], axis=1).to_numpy()

    # Splits data into training and testing with 80% training and 20% testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

    clf = model.fit(X_train, y_train)
    print("Model: ",model,"\nScore: ", clf.score(X_test, y_test))
    return clf

In [5]:
# Path to data directory

# Change to desired path
path = r'C:\Users\conno\Desktop\Pastebles\data'
# Will grab any subfolders from path and their csv files.
# Sorted because glob returns matches in arbitrary order, and the row order
# of the combined data feeds the seeded train/test split further down.
all_files = sorted(glob.glob(path + "/*/*.csv"))

In [6]:
# Reads in all data and adds target column for classification

# Label pattern applied to each file path
# (subject to change depending on file naming convention)
label_pattern = re.compile(r'CORRECT|WRONG|[a-zA-Z]+(?=_[0-9]+_data)|(?<=IR_[0-9]_)[a-zA-Z]+')

# Holder list: one labelled dataframe per csv file
li = []

for filename in all_files:
    # Reads individual csv files
    df = pd.read_csv(filename, index_col=None, header=0)
    # Target column for classification: every regex match in the path, joined together
    df['y'] = ''.join(label_pattern.findall(filename))
    li.append(df)

# Concats all data into one dataframe for training/testing
frame = pd.concat(li, axis=0, ignore_index=True)

In [7]:
# Target column is y, kept as a single-column array of label strings
y_string = frame[['y']].to_numpy()
# Changes target from string to numeric (the encoder expects a flat 1-D array)
y = LabelEncoder().fit_transform(y_string.ravel())

# Features: every column except the label and 'arrival_time'
X = frame.drop(columns=['y', 'arrival_time']).to_numpy()

# Splits data into training and testing (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [8]:
# Fit a decision tree on the training split.
# random_state is fixed (same seed as the train/test split) because the tree
# randomly permutes features when choosing splits; without it the fitted tree,
# and the score reported below, can differ from run to run.
clf = DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)

In [9]:
# Score the fitted decision tree on the held-out 20% test split;
# left as the last expression so the notebook displays the value.
clf.score(X_test, y_test)

0.7456972111553785

In [10]:
# The step-by-step cells above are abstracted into train_new_model (defined in In [4]); usage follows

In [11]:
# Model Options are: 
# DecisionTreeClassifier()
# RandomForestClassifier()
# SVC()

# Path example:
# r'C:\Users\conno\Desktop\Pastebles\data'

In [12]:
# Example of training a new model with its associated testing score
# (the score is printed by train_new_model itself)
train_new_model(r'C:\Users\conno\Desktop\Pastebles\data', RandomForestClassifier())

Model:  RandomForestClassifier() 
Score:  0.8596414342629483
