In [1]:
import os
import pandas as pd
import numpy as np 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

# load data
import os
import pandas as pd
import numpy as np
# test models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import time
# feature importance plot
import matplotlib.pyplot as plt



another idea was to calculate features of the time series
* number of change of signs for each feature

In [7]:
def linearize(u):
    """
    Fonction pour mettre en 1 seule lignes le tableau de statistiques pd.describe()
    """
    all=[]
    for line in range(len(u)):
        all.append(u.iloc[line])
    return pd.concat(all, axis=0).T

def double_integrate(df, col):
    df.sort_values(by='t', inplace=True)
    velocities = df[col].cumsum()
    positions = velocities.cumsum()
    return positions


def load_data(directory='data/config1', drop_col='', drop_feat='', position=False, puissances=False, change_sign=False,n_segments=1):
    all_data=[]
    for filename in sorted(os.listdir(directory)):
        # lecture fichier
        f = os.path.join(directory, filename)
        data = pd.read_csv(f)
        # ajout des positions
        if position:
            data['pos_x'] = double_integrate(data,'raw_acceleration_x')
            data['pos_y'] = double_integrate(data,'raw_acceleration_y')
            data['pos_z'] = double_integrate(data,'raw_acceleration_z')
        # ajout des puissances
        if puissances:
            data['rotation_cubic_x'] = data['rotation_speed_x']**3
            data['rotation_cubic_y'] = data['rotation_speed_y']**3
            data['rotation_cubic_z'] = data['rotation_speed_z']**3
        # feature selection / engineering 
        if drop_col:
            data.drop(columns=drop_col, inplace=True)
        u = data.describe().T
        if drop_feat:
            u.drop(columns=drop_feat, inplace=True)
        number = pd.DataFrame(linearize(u)).T
        # time segments
        if n_segments >1:
            n = len(data)//n_segments   # nbe de points par quartiers
            quartiles_ =[]
            for i in range(n_segments):
                quart_ = data[i*n:(i+1)*n]#.drop(columns='t')
                int_ = quart_.describe().T
                if drop_feat:
                    int_.drop(columns=drop_feat, inplace=True)
                number = pd.concat([number,pd.DataFrame(linearize(int_)).T],axis=1)
        # changement de signes
        if change_sign:
            for feature in data.columns[1:-1]:  # on ne prend pas le temps ni le label
                col_name = 'sign_change_' + feature
                data[col_name] = data[feature].apply(lambda x: 1 if x >= 0 else -1)
                number[col_name] = (data[col_name] * data[col_name].shift(-1) < 0).sum()
        # get correct label
        if directory == 'data/h_config1-lcb':
            number['label'] = filename[2]
        elif directory == 'data/v_config1-lcb':
            number['label'] = int(filename[10])
        elif directory == 'data/groupe1_groupe2':
            number['groupe'] = filename[1]
            number['label'] = int(filename.split('_')[1])
        else :
            number['label'] = int(filename[0])
        # concatenate
        all_data.append(number)   
    return pd.concat(all_data)#, _.columns, u.columns

def test_models(to_test='to_test'):
    """
    Fonction permettant de tester 4 modèles pour la classification avec 10 split de cross-validation
    to_test : pd.DataFrame()
    """

    sc = StandardScaler()
    X=sc.fit_transform(to_test.drop("label", axis=1))
    y=to_test["label"].astype(int)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
    sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=42)
    models=[
        LogisticRegression(solver='liblinear'),
        DecisionTreeClassifier(),
        RandomForestClassifier()
    ]

    for model in models:
        cv =cross_validate(model,X_train,y_train,cv=10)
        print(f"{model} score :{round(cv['test_score'].mean(),2)}, time {round(cv['score_time'].mean(),4)}")
    return X, y, X_train, X_test, y_train, y_test

## On a 3 jeux de données déssinés differemments ; 
* horizontal (sur une tabe ), 
* vertical (sur un mur), 
* 3D (dans l'espace)

### Aquisition 'v'(vertical) en 2D contre un mur (1 groupe)

In [3]:
print ("\n--> Dataset Groupe5:")
to_test4 = load_data('data/v_config1-lcb',drop_col='t')
X, y, X_train, X_test, y_train, y_test = test_models(to_test4)


--> Dataset Groupe5:
LogisticRegression(solver='liblinear') score :0.93, time 0.0007
DecisionTreeClassifier() score :0.6, time 0.0003
RandomForestClassifier() score :0.92, time 0.0038


### Aquisition 'h' (horizontal) en 2D sur une table (2 groupes)
Le nombre de changement de signes de chacune des variables peut être calculé pour chacun des datasets

In [4]:
print ("\n--> Dataset Groupe3:")
to_test = load_data('data/group3/config_1',drop_col='t')
X, y, X_train, X_test, y_train, y_test = test_models(to_test)
to_test['groupe'] = '1'
print ("\n--> Dataset Groupe4:")
to_test2 = load_data('data/h_config1-lcb', drop_col='t')
X, y, X_train, X_test, y_train, y_test = test_models(to_test2)
to_test2['groupe'] = '2'
print ("\n--> Dataset combinés")
to_test3 = pd.concat([to_test,to_test2], axis=0)
X, y, X_train, X_test, y_train, y_test = test_models(to_test3.drop(columns='groupe'))



--> Dataset Groupe3:
LogisticRegression(solver='liblinear') score :0.87, time 0.0003
DecisionTreeClassifier() score :0.72, time 0.0003
RandomForestClassifier() score :0.95, time 0.0036

--> Dataset Groupe4:
LogisticRegression(solver='liblinear') score :0.88, time 0.0003
DecisionTreeClassifier() score :0.76, time 0.0003
RandomForestClassifier() score :0.94, time 0.0041

--> Dataset combinés
LogisticRegression(solver='liblinear') score :0.76, time 0.0008
DecisionTreeClassifier() score :0.68, time 0.0004
RandomForestClassifier() score :0.92, time 0.004


### Aquisition 'libre' en 3D dans toutes les directions (2 groupes)

In [9]:
print ("\n--> Dataset Groupe1:")
to_test5 = load_data('data/groupe1_groupe2',drop_col='t')
X, y, X_train, X_test, y_train, y_test = test_models(to_test5.loc[to_test5.groupe == '1'].drop(columns=['groupe']))
print ("\n--> Dataset Groupe2:")
to_test5 = load_data('data/groupe1_groupe2',drop_col='t')
X, y, X_train, X_test, y_train, y_test = test_models(to_test5.loc[to_test5.groupe == '2'].drop(columns=['groupe']))
print ("\n--> Dataset Groupe 1-2:")
to_test5 = load_data('data/groupe1_groupe2',drop_col='t')
X, y, X_train, X_test, y_train, y_test = test_models(to_test5.drop(columns=['groupe']))


--> Dataset Groupe1:
LogisticRegression(solver='liblinear') score :0.79, time 0.0002
DecisionTreeClassifier() score :0.47, time 0.0003
RandomForestClassifier() score :0.84, time 0.0036

--> Dataset Groupe2:
LogisticRegression(solver='liblinear') score :0.81, time 0.0003
DecisionTreeClassifier() score :0.54, time 0.0005
RandomForestClassifier() score :0.79, time 0.0039

--> Dataset Groupe 1-2:
LogisticRegression(solver='liblinear') score :0.56, time 0.0003
DecisionTreeClassifier() score :0.42, time 0.0003
RandomForestClassifier() score :0.69, time 0.004


* les score de datset de groupes pris individuellement sont toujours meilleurs que les les scores de datasest combinés

# Analyses des features 

### On peut regarder l'effet de chaque feature statistique (count,mean,std,...)

In [10]:
stats__ = to_test3.describe().index.tolist()
for _ in range(len(stats__)):
    stats_ = stats__.copy()
    removed_element = stats_[_]
    print(f"\n --> using only ** {removed_element} **")
    stats_.remove(removed_element)
    to_test5 = load_data('data/v_config1-lcb', drop_col='t',drop_feat=stats_)
    X, y, X_train, X_test, y_train, y_test = test_models(to_test5)




 --> using only ** count **
LogisticRegression(solver='liblinear') score :0.14, time 0.0003
DecisionTreeClassifier() score :0.16, time 0.0002
RandomForestClassifier() score :0.11, time 0.0038

 --> using only ** mean **
LogisticRegression(solver='liblinear') score :0.63, time 0.0002
DecisionTreeClassifier() score :0.56, time 0.0002
RandomForestClassifier() score :0.81, time 0.0038

 --> using only ** std **
LogisticRegression(solver='liblinear') score :0.63, time 0.0003
DecisionTreeClassifier() score :0.55, time 0.0003
RandomForestClassifier() score :0.77, time 0.0038

 --> using only ** min **
LogisticRegression(solver='liblinear') score :0.57, time 0.0002
DecisionTreeClassifier() score :0.59, time 0.0002
RandomForestClassifier() score :0.71, time 0.0039

 --> using only ** 25% **
LogisticRegression(solver='liblinear') score :0.57, time 0.0003
DecisionTreeClassifier() score :0.53, time 0.0002
RandomForestClassifier() score :0.73, time 0.0039

 --> using only ** 50% **
LogisticRegress

* la statistique COUNT ne permet pas de bonnes prédictions

### On peut ajouter des feautures supplémentaires et utiliser leurs 'means' (position / puissance)

In [11]:
stats__ = to_test3.describe().index.tolist()
stats__.remove('mean')

print("\njuste les means ")
to_test5 = load_data('data/v_config1-lcb', drop_col=['t'],drop_feat=stats__)
X, y, X_train, X_test, y_train, y_test = test_models(to_test5)

print("\nmeans + position")
to_test5 = load_data('data/v_config1-lcb', drop_col=['t'],drop_feat=stats__, position=True)
X, y, X_train, X_test, y_train, y_test = test_models(to_test5)

print("\nmeans + puissances")
to_test5 = load_data('data/v_config1-lcb', drop_col=['t'],drop_feat=stats__, puissances=True)
X, y, X_train, X_test, y_train, y_test = test_models(to_test5)

print("\nmeans + position + puissance")
to_test5 = load_data('data/v_config1-lcb', drop_col=['t'],drop_feat=stats__, position=True, puissances=True)
X, y, X_train, X_test, y_train, y_test = test_models(to_test5)



juste les means 
LogisticRegression(solver='liblinear') score :0.63, time 0.0002
DecisionTreeClassifier() score :0.53, time 0.0002
RandomForestClassifier() score :0.78, time 0.0037

means + position
LogisticRegression(solver='liblinear') score :0.74, time 0.0002
DecisionTreeClassifier() score :0.61, time 0.0002
RandomForestClassifier() score :0.84, time 0.0037

means + puissances
LogisticRegression(solver='liblinear') score :0.66, time 0.0002
DecisionTreeClassifier() score :0.59, time 0.0002
RandomForestClassifier() score :0.82, time 0.0038

means + chgmt sign + position + puissance
LogisticRegression(solver='liblinear') score :0.72, time 0.0002
DecisionTreeClassifier() score :0.65, time 0.0003
RandomForestClassifier() score :0.84, time 0.0038


* Faire du feature engineering permet d'augmenter la precision des predictions 
* les puissances n'aident pas à améliorer les scores de classification
### on peut aussi ajouter des features comme le nombre de changement de signes de chacune des variables

In [12]:

print("\nmeans + chgmt sign")
to_test5 = load_data('data/v_config1-lcb', drop_col=['t'],drop_feat=stats__, change_sign=True)
X, y, X_train, X_test, y_train, y_test = test_models(to_test5)

print("\nmeans + chgmt sign + position")
to_test5 = load_data('data/v_config1-lcb', drop_col=['t'],drop_feat=stats__, position=True, change_sign=True)
X, y, X_train, X_test, y_train, y_test = test_models(to_test5)



means + chgmt sign
LogisticRegression(solver='liblinear') score :0.68, time 0.0002
DecisionTreeClassifier() score :0.56, time 0.0002
RandomForestClassifier() score :0.82, time 0.0037

means + chgmt sign + position
LogisticRegression(solver='liblinear') score :0.78, time 0.0004
DecisionTreeClassifier() score :0.65, time 0.0002
RandomForestClassifier() score :0.84, time 0.0037


* l'ajout de la feauture changement de signe permet d'augmente encore la precision

### On teste alors l'effet de l'ajout de ces features en utilisant toutes les statistiques (count, mean, std, ...)

In [17]:
print("\nSans les variables supplémentaires (toutes les statistiques sauf count)")
to_test5 = load_data('data/v_config1-lcb', drop_col=['t'], drop_feat='count')
X, y, X_train, X_test, y_train, y_test = test_models(to_test5)

print("\nAvec chgmt sign + position (toutes les statistiques)")
to_test5 = load_data('data/v_config1-lcb', drop_col=['t'], drop_feat='count', position=True, change_sign=True)
X, y, X_train, X_test, y_train, y_test = test_models(to_test5)



Sans les variables supplémentaires (toutes les statistiques)
LogisticRegression(solver='liblinear') score :0.94, time 0.0003
DecisionTreeClassifier() score :0.64, time 0.0002
RandomForestClassifier() score :0.91, time 0.0037

Avec chgmt sign + position (toutes les statistiques)
LogisticRegression(solver='liblinear') score :0.95, time 0.0003
DecisionTreeClassifier() score :0.64, time 0.0003
RandomForestClassifier() score :0.94, time 0.0038


* L'ajout de ces nouvelles features (chgmt sign et position) combiné à toutes les statistiques augment drastiquement les scores de la regr logistique et du random forest classifier

### on teste sur les autres datasets

In [19]:
print ("\n--> Dataset 3D avec position et chgmt signe:")
to_test4 = load_data('data/groupe1_groupe2',drop_col='t', drop_feat='count', position=True, change_sign=True)
X, y, X_train, X_test, y_train, y_test = test_models(to_test4.drop(columns=['groupe']))

print ("\n--> Dataset horizontal avec position et chgmt signe:")
to_test = load_data('data/group3/config_1',drop_col='t', drop_feat='count', position=True, change_sign=True)
to_test2 = load_data('data/h_config1-lcb', drop_col='t', drop_feat='count', position=True, change_sign=True)
to_test3 = pd.concat([to_test,to_test2], axis=0)
X, y, X_train, X_test, y_train, y_test = test_models(to_test3)


--> Dataset 3D avec position et chgmt signe:
LogisticRegression(solver='liblinear') score :0.63, time 0.0003
DecisionTreeClassifier() score :0.42, time 0.0003
RandomForestClassifier() score :0.74, time 0.0038

--> Dataset horizontal avec position et chgmt signe:
LogisticRegression(solver='liblinear') score :0.79, time 0.0003
DecisionTreeClassifier() score :0.64, time 0.0003
RandomForestClassifier() score :0.93, time 0.0039


In [20]:
# --> Dataset Groupe 1-2 (3D)
# LinearRegression() score :0.06, time 0.0002
# LogisticRegression(solver='liblinear') score :0.56, time 0.0004
# DecisionTreeClassifier() score :0.44, time 0.0003
# RandomForestClassifier() score :0.73, time 0.0039

# --> Dataset combinés (horizontal)
# LinearRegression() score :-0.05, time 0.0002
# LogisticRegression(solver='liblinear') score :0.76, time 0.0003
# DecisionTreeClassifier() score :0.67, time 0.0004
# RandomForestClassifier() score :0.93, time 0.0042

Finalement, nous pouvons decouper la série temporelle en un nombre n de segments afin d'en extraire les statistiques comme precedemmend
# Stratification temporelle

In [21]:
feat2drop=['count', 'std', 'min', '25%', '50%', '75%', 'max']

print("\n1 segment de temps")
to_test_time1 = load_data(directory='data/h_config1-lcb', drop_col='t', drop_feat=feat2drop, position=False, puissances=False, change_sign=False,n_segments=1)
X, y, X_train, X_test, y_train, y_test = test_models(to_test_time1)
print("\n2 segment de temps")
to_test_time2 = load_data(directory='data/h_config1-lcb', drop_col='t', drop_feat=feat2drop, position=False, puissances=False, change_sign=False,n_segments=2)
X, y, X_train, X_test, y_train, y_test = test_models(to_test_time2)
print("\n3 segment de temps")
to_test_time3 = load_data(directory='data/h_config1-lcb', drop_col='t', drop_feat=feat2drop, position=False, puissances=False, change_sign=False,n_segments=3)
X, y, X_train, X_test, y_train, y_test = test_models(to_test_time3)
print("\n4 segment de temps")
to_test_time4 = load_data(directory='data/h_config1-lcb', drop_col='t', drop_feat=feat2drop, position=False, puissances=False, change_sign=False,n_segments=4)
X, y, X_train, X_test, y_train, y_test = test_models(to_test_time4)
print("\n8 segment de temps")
to_test_time8 = load_data(directory='data/h_config1-lcb', drop_col='t', drop_feat=feat2drop, position=False, puissances=False, change_sign=False,n_segments=8)
X, y, X_train, X_test, y_train, y_test = test_models(to_test_time8)


1 segment de temps
LogisticRegression(solver='liblinear') score :0.48, time 0.0002
DecisionTreeClassifier() score :0.58, time 0.0002
RandomForestClassifier() score :0.76, time 0.0039

2 segment de temps
LogisticRegression(solver='liblinear') score :0.7, time 0.0003
DecisionTreeClassifier() score :0.76, time 0.0003
RandomForestClassifier() score :0.86, time 0.0038

3 segment de temps
LogisticRegression(solver='liblinear') score :0.87, time 0.0003
DecisionTreeClassifier() score :0.78, time 0.0003
RandomForestClassifier() score :0.96, time 0.0039

4 segment de temps
LogisticRegression(solver='liblinear') score :0.84, time 0.0002
DecisionTreeClassifier() score :0.84, time 0.0002
RandomForestClassifier() score :0.93, time 0.0036

8 segment de temps
LogisticRegression(solver='liblinear') score :0.83, time 0.0003
DecisionTreeClassifier() score :0.76, time 0.0002
RandomForestClassifier() score :0.93, time 0.0036


* On atteind un maximum de precision avec 3 segments de temps
# Toutes les stats descriptives + 2 features engineered + 3 segments de temps

In [22]:
print("\n3 segment de temps et toutes les meilleur features (1 dataset vertical)")
to_test_final = load_data(directory='data/v_config1-lcb', drop_col='t', drop_feat='count', position=True, puissances=False, change_sign=True,n_segments=3)
X, y, X_train, X_test, y_train, y_test = test_models(to_test_final)


3 segment de temps et toutes les meilleur features (1 dataset vertical)
LogisticRegression(solver='liblinear') score :0.97, time 0.0014
DecisionTreeClassifier() score :0.71, time 0.0003
RandomForestClassifier() score :0.95, time 0.0036


In [24]:
print ("\n--> Dataset 3D (2 datasets) avec position et chgmt signe:")
to_test4 = load_data('data/groupe1_groupe2', drop_col='t', drop_feat='count', position=True, puissances=False, change_sign=True,n_segments=3)
X, y, X_train, X_test, y_train, y_test = test_models(to_test4.drop(columns=['groupe']))

print ("\n--> Dataset horizontal (2 datasets) avec position et chgmt signe:")
to_test = load_data('data/group3/config_1', drop_col='t', drop_feat='count', position=True, puissances=False, change_sign=True,n_segments=3)
to_test2 = load_data('data/h_config1-lcb', drop_col='t', drop_feat='count', position=True, puissances=False, change_sign=True,n_segments=3)
to_test3 = pd.concat([to_test,to_test2], axis=0)
X, y, X_train, X_test, y_train, y_test = test_models(to_test3)


--> Dataset 3D (2 datasets) avec position et chgmt signe:
LogisticRegression(solver='liblinear') score :0.73, time 0.0009
DecisionTreeClassifier() score :0.36, time 0.0004
RandomForestClassifier() score :0.76, time 0.0039

--> Dataset horizontal (2 datasets) avec position et chgmt signe:
LogisticRegression(solver='liblinear') score :0.89, time 0.001
DecisionTreeClassifier() score :0.67, time 0.0004
RandomForestClassifier() score :0.93, time 0.004


In [25]:
print("Tous les datsets 2D (3 datasets)")
to_test4 = pd.concat([to_test_final,to_test,to_test2], axis=0)
X, y, X_train, X_test, y_train, y_test = test_models(to_test4)


Tous les datsets 2D (3 datasets)
LogisticRegression(solver='liblinear') score :0.92, time 0.0009
DecisionTreeClassifier() score :0.62, time 0.0004
RandomForestClassifier() score :0.96, time 0.0044
