In [113]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Extraction et normalisation des données

on charge les données et on les met en forme

In [114]:

def variable_target(df):
    return df["trip_duration"]


def variable_predictive(df):
    variables = ["day", "hour", "PULocationID", "DOLocationID"]
    return df[variables]


def normalise(df):
    scale = StandardScaler()
    vp = variable_predictive(df)
    columns_to_scale = ["day", "hour", "PULocationID", "DOLocationID"]

    for column in columns_to_scale:
        vp[column] = pd.to_numeric(vp[column], errors='coerce')

    df_scaled = pd.DataFrame(scale.fit_transform(vp[columns_to_scale]), columns=columns_to_scale)
    return df_scaled


In [115]:
path = "Sources/result.csv"

df = pd.read_csv(path)

print(df.head())

      day  hour  PULocationID  DOLocationID  trip_duration
0  Friday     0           259            32          458.0
1  Friday     5           259            32          622.0
2  Friday     6           259            32         1028.0
3  Friday     7           259            32         1247.0
4  Friday     8           259            32         1079.0


On peut ici voir la forme de nos données avec 5 colonnes

On sépare ensuite nos données en 2 parties : une d'apprentissage et une de test

In [116]:
from sklearn.model_selection import train_test_split

def split_x_y(df):
    X = df.drop(['trip_duration'], axis=1)

    y = df['trip_duration']
    return train_test_split(X, y, test_size=0.33, random_state=42)

In [117]:
X_train, X_test, y_train, y_test = split_x_y(df)

print(X_train.shape, X_test.shape)
print(X_train.dtypes)

(5743, 4) (2829, 4)
day             object
hour             int64
PULocationID     int64
DOLocationID     int64
dtype: object


On peut voir les dimensions de nos 2 datasets

In [118]:
import category_encoders as ce

def encoding(X_train, X_test):
    encoder = ce.OrdinalEncoder(cols=['day', 'hour', 'PULocationID', 'DOLocationID'])
    X_train = encoder.fit_transform(X_train)
    X_test = encoder.transform(X_test)
    return X_train, X_test

In [119]:
X_train, X_test = encoding(X_train, X_test)
print(X_train.head())

      day  hour  PULocationID  DOLocationID
808     1     1             1             1
6113    2     2             1             1
958     3     3             1             1
3555    4     4             1             1
4013    5     5             1             1


On encode ensuite les valeurs pour avoir uniquement des entiers

In [120]:
from sklearn.ensemble import RandomForestClassifier

def predict(X_train, y_train, X_test, y_test, tolerance_seconds):
    rfc = RandomForestClassifier(random_state=0)
    rfc.fit(X_train, y_train)
    y_pred = rfc.predict(X_test)
    return custom_error_metric(y_test, y_pred, tolerance_seconds)


def custom_error_metric(y_true, y_pred, tolerance_seconds):
    errors = abs(y_true - y_pred)
    correct_predictions = errors <= tolerance_seconds
    accuracy = sum(correct_predictions) / len(correct_predictions)
    return accuracy


tolerance_seconds = 60

accuracy_with_tolerance = predict(X_train, y_train, X_test, y_test, tolerance_seconds)

print(f'Model accuracy with tolerance: {accuracy_with_tolerance:.4f}')

Model accuracy with tolerance: 0.2471


On effectue un premier test avec un résultat à plus ou moins 60 secondes

In [121]:
def predict_from_path(path, tolerance_seconds):
    df = pd.read_csv(path)
    X_train, X_test, y_train, y_test = split_x_y(df)
    X_train, X_test = encoding(X_train, X_test)
    return predict(X_train, y_train, X_test, y_test, tolerance_seconds)


tolerance_seconds = 120
accuracy_with_tolerance = predict_from_path(path, tolerance_seconds)
print(f'Model accuracy with tolerance: {accuracy_with_tolerance:.4f}')

Model accuracy with tolerance: 0.4662


On effectue un second test avec un résultat à plus ou moins 2 minutes

In [122]:
tolerance_seconds = 240
accuracy_with_tolerance = predict_from_path(path, tolerance_seconds)
print(f'Model accuracy with tolerance: {accuracy_with_tolerance:.4f}')

Model accuracy with tolerance: 0.7416


On effectue un troisième test avec un résultat à plus ou moins 4 minutes

In [123]:
def predictResult(X_train, y_train, X_test):
    rfc = RandomForestClassifier(random_state=0)
    rfc.fit(X_train, y_train)
    y_pred = rfc.predict(X_test)
    return y_pred


dico = {'day': [1], 'hour': [3], 'PULocationID': [1], 'DOLocationID': [1]}
dfTest = pd.DataFrame(dico)
print(f"{predictResult(X_train, y_train, dfTest)[0]/60} minutes pour arriver à destination")

9.616666666666667 minutes pour arriver à destination


Ici, on peut tester des valeurs et observer le temps prédit par l'algorithme