In [1]:
import numpy as np
import pandas as pd
import glob
import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from joblib import dump

In [7]:
## Importing data ##
# NOTE(review): both paths are absolute and machine-specific, so this cell only
# runs where the files exist at exactly these locations.
records = pd.read_csv(
    r'C:\Users\User\Desktop\Stanley\time60\data\records60.txt',
    sep=",",
    header=None,
)
labels = pd.read_csv(
    r'C:\Users\User\Desktop\Stanley\time60\data\labels60.txt',
    sep=",",
    header=None,
)

# The files are read without a header row (header=None), so the column names
# are assigned here: eight named columns followed by Q1..Q9 (17 in total).
feature_names = ["Mean", "Stdev", "Over60", "O60U120", "Over120", "Skewness", "Kurtosis", "Entropy"]
feature_names += ["Q" + str(k) for k in range(1, 10)]
records.columns = feature_names
labels.columns = ['Label']

# Prints True only when no feature value is missing, then the shape of each frame.
print(records.notna().values.all(), records.shape, labels.shape)

False (5974, 17) (5974, 1)


In [8]:
## Processing nan values ##
# Collect the index labels of every row with a missing value in ANY feature
# column. Checking only "Kurtosis" would let a NaN in another column slip
# through to the model.
index = records.index[records.isna().any(axis=1)]
# Drop the same index labels from both frames so that features and labels stay
# aligned row-for-row (both frames share the index produced by read_csv).
records = records.drop(index)
labels = labels.drop(index)
# Expected to print True now, followed by the shapes (row counts must match).
print(records.notnull().values.all(), records.shape, labels.shape)

True (5973, 17) (5973, 1)


In [9]:
## Splitting training and validation sets ##
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
split = train_test_split(records, labels, test_size=0.2, random_state=42)
train, test, train_labels, test_labels = split

In [10]:
## Processing and reshaping data to fit the training models ##
# Each label frame has a single column; flatten it to a 1-D array so it can be
# passed as the target vector when fitting.
reshapedTrainLabels = train_labels.values.ravel()
reshapedTestLabels = test_labels.values.ravel()

In [11]:
## Training ##
# time.process_time() reports CPU time of this process (not wall-clock time);
# it is sampled before the forest is built and again after fitting.
start = time.process_time()

## Random forest with 500 trees, each limited to max_depth = 3; seeded for repeatability
forest_params = dict(n_estimators=500, max_depth=3, random_state=0)

## Build the model and fit it on the training data (fit returns the estimator itself)
model = RandomForestClassifier(**forest_params).fit(train, reshapedTrainLabels)

elapsed = time.process_time() - start
print("Time used:", elapsed)

Time used: 1.28125


In [13]:
## Validation ##
# Hard class predictions for the held-out rows.
rf_preds = model.predict(test)

# Column 1 of predict_proba is the probability of the second entry in
# model.classes_ (presumably the label 1.0 class -- confirm against classes_).
test_proba = model.predict_proba(test)
rf_probs = test_proba[:, 1]

# Same probability column for the training rows.
train_rf_probs = model.predict_proba(train)[:, 1]

# Both counts should be equal: one probability per held-out label.
print(len(rf_probs), len(test_labels))

1195 1195


In [14]:
# Side-by-side table of the true labels and the model's predictions for the
# held-out rows, on a fresh 0..n-1 index (rows are matched by position).
df = pd.DataFrame(
    {
        "Labels": test_labels.values.ravel(),
        "Prediction": rf_preds,
    }
)
df

Unnamed: 0,Labels,Prediction
0,1.0,1.0
1,0.0,0.0
2,1.0,1.0
3,0.0,0.0
4,1.0,1.0
5,0.0,0.0
6,1.0,1.0
7,0.0,0.0
8,0.0,0.0
9,1.0,1.0


In [15]:
# Accuracy: fraction of rows whose first column (true label) equals the second
# column (prediction). A single element-wise comparison replaces the previous
# row-by-row scalar .iloc loop; columns are still addressed by position.
matches = df.iloc[:, 0] == df.iloc[:, 1]
count = float(matches.sum())

# NOTE(review): a result of exactly 1.0 on held-out data is unusual for a
# depth-3 forest -- worth checking that no feature column encodes the label.
print(count / len(df))

1.0


In [16]:
# Persist the fitted forest to the current working directory; joblib.dump
# returns the list of file names it wrote, which the cell displays.
dump(model, filename='randomforest60.joblib')

['randomforest60.joblib']