In [12]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from threading import Thread
from pprint import pprint as pp
import time

In [13]:
data = pd.read_csv("../data/deep-scenario.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33530 entries, 0 to 33529
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Execution                      33530 non-null  int64  
 1   ScenarioID                     33530 non-null  object 
 2   Configuration_API_Description  33530 non-null  object 
 3   Attribute[TTC]                 33530 non-null  float64
 4   Attribute[DTO]                 33530 non-null  float64
 5   Attribute[Jerk]                33530 non-null  float64
 6   Attribute[COL]                 33530 non-null  bool   
 7   Attribute[COLT]                33530 non-null  object 
 8   Attribute[SAC]                 33530 non-null  float64
 9   reward                         33530 non-null  object 
 10  road                           33530 non-null  object 
 11  strategy                       33530 non-null  object 
 12  scenario                       33530 non-null 

In [14]:
class DataManipulation():
    def __init__(self, filename: str=""):
        self.data = filename


    @property
    def data(self):
        return self._data
    
    @data.setter
    def data(self, filename: str=""):
        try:
            self._data = pd.read_csv(filename)
        except:
            self._data = None
            raise FileNotFoundError(f"The file does not exist.")


    def addFromXML(self, filename: str="") -> None:
        """
        Reads more data from XML files, as of now, only speeds at six different timstamps.

        Params:
            filename: str, name of file read from
        """
        try:
            xmlDf = pd.read_csv(filename, index_col=0)
        except:
            raise FileNotFoundError(f"The file does not exist.")
        if isinstance(self.data, pd.DataFrame):
            self._data = self.data.merge(xmlDf, how="inner", on=["ScenarioID", "road", "reward", "scenario", "strategy"], copy=False)
        else:
            print("Something went wrong in 'addFromXML()'!")


    def splitTrainTest(self, splitRatio: float=0.8) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Splitting the data.

        Params:
            filename: str, name of file to split
            splitRatio: float 0-1, % of data to be testing

        TODO
            Slå sammen road, scenario, strategy og reward til noe brukbart

        Returns:
            trainX, trainY, testingX, testingY.
        """
        if not 0 < splitRatio < 1: raise ValueError("SplitRatio must be between 0 and 1!")

        # Shuffle data
        self._data = self.data.sample(frac=1, random_state=1)

        temp = self._data.copy()
        # Removing all non numeric values, might need to make string values into numbers
        temp = temp.drop(["Execution","ScenarioID","Configuration_API_Description"], axis=1)
        split = int(np.floor(len(self.data)*splitRatio))
        print(f"splitting at {split}.")

        trainX, testX = temp[:split], temp[split:]

        trainY = pd.concat([trainX.pop(feature) for feature in ["Attribute[COL]","Attribute[COLT]","Attribute[SAC]"]], axis=1)
        testY = pd.concat([testX.pop(feature) for feature in ["Attribute[COL]","Attribute[COLT]","Attribute[SAC]"]], axis=1)

        return trainX, trainY, testX, testY


    def getCompleteRow(self, index: None):
        """
        Gets the origianl row from index(es).

        Params:
            index: int or list

        Return:
            Dataframe or Series
        """
        if isinstance(index, list) or isinstance(index, int):
            return self._data.loc[index]
        return "Something went wrong."


    def getOriginalPath(self, index: int):
        """
        Gets the complete path from where the row was originally collected together with its ScenarioID.

        Params:
            index: int

        Return:
            dict{"SenarioID": str, "path": str}
        """
        if isinstance(index, int):
            row = self.getCompleteRow(index)
            return {
                "ScenarioID": row["ScenarioID"],
                "path": f"{row['strategy']}-strategy/reward-{row['reward']}/{row['road']}-{row['scenario']}-scenario-attributes.csv"
                }
        raise ValueError("Index needs to be an integer!")


In [15]:
dm = DataManipulation("../data/deep-scenario.csv")
# dm.data = ("../data/deep-scenario.csv")
dm.addFromXML("../data/dataFromXML.csv")

# print(min(dm.data["speed3"]))

trainX, trainY, testX, testY = dm.splitTrainTest()
print(min(trainX["speed3"]))
print(f"trainX:{trainX.shape}, trainY:{trainY.shape}, testX:{testX.shape}, testY:{testY.shape}")
print(trainX.head())
# trainX.hist(bins=500, figsize=(20,15))
max(trainX["Attribute[DTO]"])

d = pd.concat([trainX, trainY], axis=1)
df = d.groupby(['strategy', 'Attribute[COL]']).size()
df = df.unstack()
# Very few collisions 
print(df)
d.corr()



splitting at 26824.
0.0
trainX:(26824, 13), trainY:(26824, 3), testX:(6706, 13), testY:(6706, 3)
       Attribute[TTC]  Attribute[DTO]  Attribute[Jerk] reward   road  \
23853   100000.000000        6.434714             5.04    ttc  road2   
30032        0.695324        5.552715             5.74   jerk  road2   
11507   100000.000000        8.511993             5.90    ttc  road3   
17653        0.760641        7.845926            14.30    dto  road4   
16680        8.738029       14.960495             0.88    dto  road3   

       strategy     scenario  speed1  speed2  speed3  speed4  speed5  speed6  
23853    random   rain_night   5.547   4.660   4.401   4.228   3.986   3.746  
30032  rl_based  sunny_night   5.558   4.607   4.358   4.832   4.246   3.967  
11507    greedy   rain_night   0.908   0.001   0.789   0.687   0.001   1.000  
17653    random  sunny_night   5.127   6.324   6.615   8.165   8.313   8.683  
16680    random  sunny_night   8.075   8.280   8.360   8.458   8.566   8.75

Unnamed: 0,Attribute[TTC],Attribute[DTO],Attribute[Jerk],speed1,speed2,speed3,speed4,speed5,speed6,Attribute[COL],Attribute[SAC]
Attribute[TTC],1.0,0.100788,-0.144095,-0.253022,-0.240292,-0.193335,-0.147007,-0.108904,-0.084859,-0.047727,-0.048451
Attribute[DTO],0.100788,1.0,-0.03613,-0.07911,-0.064905,-0.033969,-0.00466,0.017686,0.032633,-0.011135,-0.008658
Attribute[Jerk],-0.144095,-0.03613,1.0,0.220794,0.207929,0.138693,0.079876,0.040943,0.017524,0.099308,0.095497
speed1,-0.253022,-0.07911,0.220794,1.0,0.965072,0.873227,0.7477,0.635589,0.54847,0.113364,0.108645
speed2,-0.240292,-0.064905,0.207929,0.965072,1.0,0.930704,0.817816,0.71463,0.633528,0.093444,0.090953
speed3,-0.193335,-0.033969,0.138693,0.873227,0.930704,1.0,0.935293,0.853828,0.778939,0.033763,0.077092
speed4,-0.147007,-0.00466,0.079876,0.7477,0.817816,0.935293,1.0,0.950989,0.888524,-0.041214,0.052189
speed5,-0.108904,0.017686,0.040943,0.635589,0.71463,0.853828,0.950989,1.0,0.960588,-0.101034,0.006146
speed6,-0.084859,0.032633,0.017524,0.54847,0.633528,0.778939,0.888524,0.960588,1.0,-0.132265,-0.024836
Attribute[COL],-0.047727,-0.011135,0.099308,0.113364,0.093444,0.033763,-0.041214,-0.101034,-0.132265,1.0,0.776937


In [16]:
class Predicter():
    """
    NOTE: Should try other models and should do something about TTC, some are < 15, many are 100000.
    """
    def __init__(self, solver: str="adam", learning_rate: str="constant", activation: str= "relu"):
        self.model = MLPClassifier(random_state=1, solver=solver, activation=activation, learning_rate=learning_rate)


    def preProcess(self, x=None, y=None, targetCol="Attribute[COL]"):
        """
        Process x and y by transforming to np.array and mean scale x.
        
        Params:
            x: dataframe
            y: dataframe
            targetCol: str, name of column that is to be predicted
        
        Returns:
            x: np.array
            y: np.array
        """
        if isinstance(x, pd.DataFrame):
            # only accepts numeric values in training as of now
            for c in x.columns:
                if x[c].dtype != float:
                    x = x.drop(c, axis=1)
            x.loc[x["Attribute[TTC]"] > 20, "Attribute[TTC]"] = -1 # NOTE May be transformed to something else
            x.loc[x["Attribute[DTO]"] > 20, "Attribute[DTO]"] = -1
            # print(x.columns)
            x = x.to_numpy()
            scaler = preprocessing.StandardScaler().fit(x) # Scaling the input
            x = scaler.transform(x)

        if isinstance(y, pd.DataFrame):
            if targetCol and targetCol in y.columns:
                y = y[targetCol]
                y.replace(False, 0, inplace=True)
                y.replace(True, 1, inplace=True)
                y = y.to_numpy()
            else:
                print("Wrong parameters was sent in!")
        
        return x, y
        

    def fit(self, x, y):
        """
        Train the model.

        Params:
            x: np.array, preprocessed training data
            y: np.array, preprocessed training truth
        """
        self.model.fit(x, y)


    def predict(self, x):
        """
        Get predictins from the model.

        Params:
            x: Dataframe, what to predict

        Returns:
            predictions: np.array of 0 and 1
        """
        return self.model.predict(x)


    def getCM(self, predictions, truth):
        """"
        Shows the score of given predictions and ground truth in a confusion matrix.
        
        Prints as follows:
            True negative | False positive

            False negative | True positive

        Params:
            predictions: np.array
            truth: np.array
        
        Returns:
            cm: list[list]
        """
        _, truthProcessed = self.preProcess(y=truth)
        tot = 0
        cm = [[0, 0], [0, 0]]
        # col = np.count_nonzero(truthProcessed == 1)
        # print(f"Total: {tot}, number of collisions: {col}")
        
        for p, t in zip(predictions, truthProcessed):
            cm[t][p] += 1
            tot += 1
        
        return cm
    

    @staticmethod
    def printScore(cm: list[list], modelInfo: list=None, printing: bool=True) -> dict:
        """
        Prints and returns different scoring metrics.

        Params:
            cm: list[list], 2x2 cononfusion matrix
            modelInfo: list, [solver, activation, learning_rate], variables for the model
            printing: bool, to print or not

        Returns:
            dict: accuracy, precision, recall, F1
        """
        def calc(top, bot):
            if bot == 0:
                return 0
            else:
                return top/bot
        s = {}
        s["accuracy"] = round(calc(cm[0][0]+cm[1][1], cm[0][0]+cm[0][1]+cm[1][0]+cm[1][1]), 2)
        s["precision"] = round(calc(cm[1][1], cm[0][1]+cm[1][1]), 2)
        s["recall"] = round(calc(cm[1][1], cm[1][1]+cm[1][0]), 2)
        s["f1"] = round(calc(2*s["precision"]*s["recall"], s["precision"]+s["recall"]), 2)
        if printing:
            if modelInfo:
                print(f"\nSolver: {modelInfo[0]}, activation: {modelInfo[1]}, learning rate: {modelInfo[2]}")
            print(f"\tTN: {cm[0][0]} \t| FP: {cm[0][1]} \n\tFN: {cm[1][0]} \t| TP: {cm[1][1]}")
            print(f"Accuracy: {s['accuracy']}")
            print(f"Precision: {s['precision']}")
            print(f"Recall: {s['recall']}")
            print(f"F1: {s['f1']}")
        return s


In [17]:
p = Predicter()

x, y = p.preProcess(trainX, trainY)
p.fit(x, y)

testXp, testYp = p.preProcess(testX, testY)
pred = p.predict(testXp)
cm = p.getCM(pred, testYp)
p.printScore(cm)


	TN: 6475 	| FP: 16 
	FN: 103 	| TP: 112
Accuracy: 0.98
Precision: 0.88
Recall: 0.52
F1: 0.65




{'accuracy': 0.98, 'precision': 0.88, 'recall': 0.52, 'f1': 0.65}

In [18]:
def his(column, name):
    plt.hist(column, bins=100)
    plt.title(name)
    plt.show()

names = ['Attribute[TTC]', 'Attribute[DTO]', 'Attribute[Jerk]', 'speed1', 'speed2', 'speed3', 'speed4', 'speed5', 'speed6']
# for i in range(len(x[0,:])):
#     print(names[i], min(x[:,i]), max(x[:,i]))
#     his(x[:,i], names[i])
    

In [19]:
solver = ["lbfgs", "sgd", "adam"]
activation = ["identity", "logistic", "tanh", "relu"]
learning_rate = ["constant", "invscaling", "adaptive"]

modelList = []

def useModel(s, l, a):
    m = Predicter(s, l, a)
    m.fit(x, y)
    pred = m.predict(testXp)
    # print(f"\nSolver: {s}, activation: {a}, learning rate: {l}")
    cm = m.getCM(pred, testYp)
    scores = m.printScore(cm, [s, a, l], False)
    modelList.append({"params": {"solver": s, "activation": a, "learning_rate": l}, "cm": cm, "scores": scores})

threadlist = []

for s in solver:
    for a in activation:
        for l in learning_rate:
            threadlist.append(Thread(target=useModel, args=(s, l, a)))

for t in threadlist:
    t.start()

start = time.time()
for i, t in enumerate(threadlist, start=1):
    t.join()
    print(f"Done with thread {i}. Used: {round(time.time()-start, 4)} seconds.")


Done with thread 1. Used: 4.2575 seconds.
Done with thread 2. Used: 4.5925 seconds.
Done with thread 3. Used: 4.5995 seconds.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

Done with thread 4. Used: 60.7025 seconds.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Done with thread 5. Used: 61.4725 seconds.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Done with thread 6. Used: 61.7535 seconds.
Done with thread 7. Used: 61.7555 seconds.
Done with thread 8. Used: 61.7615 seconds.
Done with thread 9. Used: 61.9225 seconds.
Done with thread 10. Used: 61.9275 seconds.
Done with thread 11. Used: 61.9295 seconds.
Done with thread 12. Used: 61.9305 seconds.
Done with thread 13. Used: 99.1229 seconds.
Done with thread 14. Used: 117.6812 seconds.
Done with thread 15. Used: 154.5839 seconds.
Done with thread 16. Used: 241.5757 seconds.
Done with thread 17. Used: 241.5797 seconds.
Done with thread 18. Used: 291.4583 seconds.
Done with thread 19. Used: 291.4613 seconds.
Done with thread 20. Used: 291.4613 seconds.
Done with thread 21. Used: 291.4613 seconds.
Done with thread 22. Used: 291.4643 seconds.
Done with thread 23. Used: 291.4643 seconds.




Done with thread 24. Used: 295.01 seconds.
Done with thread 25. Used: 295.01 seconds.
Done with thread 26. Used: 295.01 seconds.
Done with thread 27. Used: 295.01 seconds.




Done with thread 28. Used: 318.46 seconds.
Done with thread 29. Used: 318.56 seconds.
Done with thread 30. Used: 318.56 seconds.
Done with thread 31. Used: 318.56 seconds.
Done with thread 32. Used: 318.56 seconds.
Done with thread 33. Used: 318.56 seconds.
Done with thread 34. Used: 318.56 seconds.




Done with thread 35. Used: 318.816 seconds.
Done with thread 36. Used: 318.816 seconds.


In [21]:
sortedModels = sorted(modelList, key=lambda val: (val["cm"][1][1], val["scores"]["f1"]), reverse=True)
for m in sortedModels:
    pp(m)

{'cm': [[6463, 28], [91, 124]],
 'params': {'activation': 'relu',
            'learning_rate': 'adaptive',
            'solver': 'lbfgs'},
 'scores': {'accuracy': 0.98, 'f1': 0.68, 'precision': 0.82, 'recall': 0.58}}
{'cm': [[6463, 28], [91, 124]],
 'params': {'activation': 'relu',
            'learning_rate': 'invscaling',
            'solver': 'lbfgs'},
 'scores': {'accuracy': 0.98, 'f1': 0.68, 'precision': 0.82, 'recall': 0.58}}
{'cm': [[6463, 28], [91, 124]],
 'params': {'activation': 'relu',
            'learning_rate': 'constant',
            'solver': 'lbfgs'},
 'scores': {'accuracy': 0.98, 'f1': 0.68, 'precision': 0.82, 'recall': 0.58}}
{'cm': [[6452, 39], [92, 123]],
 'params': {'activation': 'tanh',
            'learning_rate': 'invscaling',
            'solver': 'lbfgs'},
 'scores': {'accuracy': 0.98, 'f1': 0.65, 'precision': 0.76, 'recall': 0.57}}
{'cm': [[6452, 39], [92, 123]],
 'params': {'activation': 'tanh',
            'learning_rate': 'constant',
            'solver':