In [None]:
import numpy as np
import pandas as pd

import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import sys

%matplotlib inline

# Data Loading

In [None]:
# Load the training and test datasets from the input directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('../input/train.csv', '../input/test.csv'))

# Data Exploration

In [None]:
# Preview the first rows of the training set.
train.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the training set.
train.info()

In [None]:
# Preview the first rows of the test set.
test.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the test set.
test.info()

In [None]:
# Comparing the head() of both datasets shows they do not have the same
# number of columns: list the columns present in train but absent from test.
diff_col = list(set(train.columns) - set(test.columns))
diff_col

# Data preprocessing (outliers handling, etc.)

In [None]:
# Check for missing values: number of NaN entries in each column.
train.isna().sum()

In [None]:
# Check for duplicates: number of rows that repeat an earlier row.
train.duplicated().sum()

In [None]:
# First look at how outliers are distributed: one box per numeric column,
# drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(18, 7))
ax.set_title("Répartition des outliers")
train.boxplot(ax=ax)

In [None]:
# Scatter of the pickup coordinates, with the view restricted to a fixed
# longitude/latitude window so out-of-range points are left out of the frame.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))
ax.set_ylim(40.63, 40.85)
ax.set_xlim(-74.03, -73.77)
ax.scatter(
    train['pickup_longitude'],
    train['pickup_latitude'],
    s=0.0002,
    color='black',
    alpha=1,
)

In [None]:
# Given the outliers revealed above, keep only trips below a duration cap.
train = train[train['trip_duration'].lt(1500000)]

In [None]:
# Scatter plot of pickup coordinates to spot out-of-range points.
train.plot.scatter(x='pickup_longitude',y='pickup_latitude')

In [None]:
# Keep only rows whose pickup point lies east of longitude -85 and south of
# latitude 46; both conditions are applied in a single mask.
train = train.loc[(train['pickup_longitude'] > -85) & (train['pickup_latitude'] < 46)]

In [None]:
# Scatter plot of dropoff coordinates to spot out-of-range points.
train.plot.scatter(x='dropoff_longitude',y='dropoff_latitude')

In [None]:
# Keep only rows whose dropoff point lies east of longitude -80 and north of
# latitude 36; both conditions are applied in a single mask.
train = train.loc[(train['dropoff_longitude'] > -80) & (train['dropoff_latitude'] > 36)]

In [None]:
# Keep trips lasting more than 60 seconds and less than 24 hours, then add two
# feature columns:
#   - 'hour': hour of the day at which the passenger was picked up
#   - 'dist': Euclidean distance between pickup and dropoff, computed directly
#             on the latitude/longitude values (not a geodesic distance)
train = train[(train['trip_duration'] > 60) & (train['trip_duration'] < 3600 * 24)]

# assign() returns a new frame, so the new columns are never written into the
# filtered selection above. The hour is taken from the parsed timestamp in one
# vectorised call instead of splitting every string in a Python lambda.
train = train.assign(
    hour=pd.to_datetime(train['pickup_datetime']).dt.hour,
    dist=np.sqrt((train['pickup_latitude'] - train['dropoff_latitude']) ** 2
                 + (train['pickup_longitude'] - train['dropoff_longitude']) ** 2),
)

In [None]:
# Histogram (120 bins) of trip_duration restricted to values below 5000.
train.loc[train.trip_duration<5000,"trip_duration"].hist(bins=120)

In [None]:
# Keep only trips whose duration does not exceed 4000.
train = train.loc[train['trip_duration'].le(4000)]

# Feature engineering

In [None]:
# Columns fed to the model (features) and the column to predict (target).
feature_cols = [
    "passenger_count",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "hour",
    "dist",
    "vendor_id",
]
X = train[feature_cols]  # <-- features
y = train["trip_duration"]  # <-- target

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score

In [None]:
# Pre-training hold-out split (currently commented out)

# X_train, X_valid, y_train, y_valid = train_test_split(X,y, test_size=0.2, random_state=42)
# X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

In [None]:
#Choix du modèle
from sklearn.ensemble import RandomForestRegressor

# Training

In [None]:
# RandomForestRegressor is a natural choice for a regression problem.
# Hyper-parameters are gathered in one place; the seed keeps runs repeatable.
rf_params = {"n_estimators": 20, "random_state": 42}
m1 = RandomForestRegressor(**rf_params)
m1.fit(X, y)

In [None]:
# Splitter used by the cross-validation: 4 random shuffles of the data.
# Each split holds out 20% of the rows for validation and trains on the other
# 80%. The previous test_size=0.8 did the opposite (train on 20%, validate on
# 80%), so the reported score described a model fitted on a fifth of the data.
from sklearn.model_selection import ShuffleSplit

shuff = ShuffleSplit(n_splits=4, test_size=0.2, random_state=42)

In [None]:
# Validate with cross-validation for a more stable estimate than a single
# split. Each entry of m1_scores is the negated mean squared log error of one
# split.
m1_scores = cross_val_score(
    estimator=m1,
    X=X,
    y=y,
    scoring="neg_mean_squared_log_error",
    cv=shuff,
)

In [None]:
# Turn each split's score into an RMSLE (root mean squared log error) and
# average over the splits. The scores come back negated because of the
# "neg_mean..." scoring, hence the absolute value before the square root.
# The result goes into a new array instead of overwriting m1_scores, so
# re-running this cell does not apply the square root a second time.
m1_rmsle = np.sqrt(np.abs(m1_scores))
m1_rmsle.mean()

# Predictions

In [None]:
# Preview the test set before adding the engineered columns.
test.head()

In [None]:
# Add the 'hour' and 'dist' columns to the test set so the model can predict
# on it.
# 'hour' is read from the parsed timestamp in one vectorised call instead of
# splitting every string in a Python lambda.
test['hour'] = pd.to_datetime(test['pickup_datetime']).dt.hour

# 'dist' is the Euclidean distance between pickup and dropoff, computed
# directly on the latitude/longitude values (not a geodesic distance).
test['dist'] = np.sqrt((test['pickup_latitude'] - test['dropoff_latitude']) ** 2
                       + (test['pickup_longitude'] - test['dropoff_longitude']) ** 2)

In [None]:
# Select the model's input columns from the test set and store the predictions.
test_feature_cols = [
    "passenger_count",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "hour",
    "dist",
    "vendor_id",
]
X_test = test[test_feature_cols]
prediction = m1.predict(X_test)
prediction

## Submission

In [None]:
# Load the sample submission file.
submit = pd.read_csv('../input/sample_submission.csv')

In [None]:
# Preview the first rows of the sample submission.
submit.head()

In [None]:
# Build the submission frame: the test ids paired with the predicted
# trip_duration values.
submit_file = pd.DataFrame({'id': test['id'], 'trip_duration': prediction})

# Show a short preview with the notebook's rich display rather than printing
# the whole frame as text.
submit_file.head()

In [None]:
# Write the submission to CSV without the index column.
submit_file.to_csv('submission.csv', index=False)