In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing

In [2]:
data = pd.read_csv("../data/deep-scenario.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33530 entries, 0 to 33529
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Execution                      33530 non-null  int64  
 1   ScenarioID                     33530 non-null  object 
 2   Configuration_API_Description  33530 non-null  object 
 3   Attribute[TTC]                 33530 non-null  float64
 4   Attribute[DTO]                 33530 non-null  float64
 5   Attribute[Jerk]                33530 non-null  float64
 6   Attribute[COL]                 33530 non-null  bool   
 7   Attribute[COLT]                33530 non-null  object 
 8   Attribute[SAC]                 33530 non-null  float64
 9   reward                         33530 non-null  object 
 10  road                           33530 non-null  object 
 11  strategy                       33530 non-null  object 
 12  scenario                       33530 non-null 

In [3]:
class DataManipulation():
    def __init__(self):
        self.__data = None
        # self.__dataDesc = None


    def splitTrainTest(self, filename: str="", splitRatio: float=0.8) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Splitting the data.

        Params:
            filename: str, name of file to split
            splitRatio: float 0-1, % of data to be testing

        TODO
            Slå sammen road, scenario, strategy og reward til noe brukbart

        Returns:
            trainX, trainY, testingX, testingY.
        """
        if not 0 < splitRatio < 1: raise ValueError("SplitRatio must be between 0 and 1!")
        try:
            self.__data = pd.read_csv(filename)
        except:
            raise FileNotFoundError(f"The file does not exist.")

        # self.__dataDesc = pd.concat([self.__data.pop(feature) for feature in ["Execution","ScenarioID","Configuration_API_Description"]], axis=1)

        # Shuffle data
        self.__data = self.__data.sample(frac=1)

        temp = self.__data.copy()
        # Removing all non numeric values, might need to make string values into numbers
        temp = temp.drop(["Execution","ScenarioID","Configuration_API_Description"], axis=1)
        split = int(np.floor(len(self.__data)*splitRatio))
        print(f"splitting at {split}.")

        trainX, testX = temp[:split], temp[split:]

        trainY = pd.concat([trainX.pop(feature) for feature in ["Attribute[COL]","Attribute[COLT]","Attribute[SAC]"]], axis=1)
        testY = pd.concat([testX.pop(feature) for feature in ["Attribute[COL]","Attribute[COLT]","Attribute[SAC]"]], axis=1)

        return trainX, trainY, testX, testY


    def getCompleteRow(self, index: None):
        """
        Gets the origianl row from index(es).

        Params:
            index: int or list

        Return:
            Dataframe or Series
        """
        if index == None and self.__data != None:# and self.__dataDesc != None:
            return self.__dataDesc
        elif isinstance(index, list) or isinstance(index, int):
            return self.__data.loc[index]
        return ("Something went wrong.")


    def getOriginalPath(self, index: int):
        """
        Gets the complete path from where the row was originally collected together with its ScenarioID.

        Params:
            index: int

        Return:
            dict{"SenarioID": str, "path": str}
        """
        if isinstance(index, int):
            row = self.getCompleteRow(index)
            return {
                "ScenarioID": row["ScenarioID"],
                "path": f"{row['strategy']}-strategy/reward-{row['reward']}/{row['road']}-{row['scenario']}-scenario-attributes.csv"
                }
        raise ValueError("Index needs to be an integer!")


In [4]:
dm = DataManipulation()
trainX, trainY, testX, testY = dm.splitTrainTest("../data/deep-scenario.csv")
print(f"trainX:{trainX.shape}, trainY:{trainY.shape}, testX:{testX.shape}, testY:{testY.shape}")
# print(trainX.head())
# trainX.hist(bins=500, figsize=(20,15))

d = pd.concat([trainX, trainY], axis=1)
df = d.groupby(['strategy', 'Attribute[COL]']).size()
df = df.unstack()
# Very few collisions 
print(df)


splitting at 26824.
trainX:(26824, 7), trainY:(26824, 3), testX:(6706, 7), testY:(6706, 3)
Attribute[COL]  False  True 
strategy                    
greedy          10440    161
random          10726    123
rl_based         4817    557


In [80]:
class Predicter():
    """
    NOTE: Should try other models and should do something about TTC, some are < 15, many are 100000.
    """
    def __init__(self):
        self.model = MLPClassifier()


    def preProcess(self, x=None, y=None, targetCol="Attribute[COL]"):
        """
        Process x and y by transforming to np.array and mean scale x.
        
        Params:
            x: dataframe
            y: dataframe
            targetCol: str, name of column that is to be predicted
        
        Returns:
            x: np.array
            y: np.array
        """
        if isinstance(x, pd.DataFrame):
            # only accepts numeric values in training as of now
            for c in x.columns:
                if x[c].dtype != float:
                    x = x.drop(c, axis=1)
            x.loc[x["Attribute[TTC]"]==100000] = -1 # NOTE May be transformed to something else
            x = x.to_numpy()
            scaler = preprocessing.StandardScaler().fit(x) # Scaling the input
            x = scaler.transform(x)

        if isinstance(y, pd.DataFrame):
            if targetCol and targetCol in y.columns:
                y = y[targetCol]
                y.replace(False, 0, inplace=True)
                y.replace(True, 1, inplace=True)
                y = y.to_numpy()
            else:
                print("Wrong parameters was sent in!")
        
        return x, y
        

    def fit(self, x, y):
        """
        Train the model.

        Params:
            x: np.array, preprocessed training data
            y: np.array, preprocessed training truth
        """
        self.model.fit(x, y)


    def predict(self, x):
        """
        Params:
            x: Dataframe, what to predict

        Returns:
            predictions: np.array of 0 and 1
        """
        return self.model.predict(x)


    def getScore(self, predictions, truth):
        """"
        Shows the score of given predictions and ground truth in a confusion matrix.
        
        Prints as follows:
            True negative | False positive

            False negative | True positive

        Params:
            predictions: np.array
            truth: np.array
        
        Returns:
            cm: list[list]
        """
        _, truthProcessed = self.preProcess(y=truth)
        tot = 0
        cm = [[0, 0], [0, 0]]
        col = np.count_nonzero(truthProcessed == 1)
        
        for p, t in zip(predictions, truthProcessed):
            cm[t][p] += 1
            tot += 1
        print(f"Total: {tot}, number of collisions: {col}")
        print(f"\tTN: {cm[0][0]} \t| FP: {cm[0][1]} \n\tFN: {cm[1][0]} \t| TP: {cm[1][1]}")
        return cm

In [82]:
p = Predicter()

x, y = p.preProcess(trainX, trainY)
p.fit(x, y)

testXp, testYp = p.preProcess(testX, testY)
pred = p.predict(testXp)
score = p.getScore(pred, testYp)


Total: 6706, number of collisions: 209
	TN: 6159 	| FP: 338 
	FN: 92 	| TP: 117


In [73]:
p.getScore(pred, testY)

Total: 6706, number of collisions: 209
	TN: 6293 	| FP: 204 
	FN: 121 	| TP: 88
[[6293, 204], [121, 88]]
