In [150]:
# Installing pandas
# https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/
#
# pip is run as a module of `sys.executable`, i.e. with the same Python
# interpreter that is executing this notebook, rather than whichever `pip`
# happens to be first on the shell PATH.

import sys
!{sys.executable} -m pip install pandas



In [195]:
import pandas as pd

# Importing CSVs

# Every file is read with the same options: lines starting with '#' are
# treated as comments and skipped by the parser.
tickets_df, cities_df, providers_df, stations_df = (
    pd.read_csv(csv_path, comment='#')
    for csv_path in (
        './csv/ticket_data.csv',
        './csv/cities.csv',
        './csv/providers.csv',
        './csv/stations.csv',
    )
)

In [196]:
# Testing pandas
# Using https://pandas.pydata.org/pandas-docs/stable/user_guide/


# Function returning a dataframe with all the trips between these cities

def getTrips(tickets_df, origin_city = 628, destination_city = 453):
    """Return the rows of ``tickets_df`` that go from one city to another.

    Parameters
    ----------
    tickets_df : pandas.DataFrame
        Frame with at least the columns ``o_city`` and ``d_city``.
    origin_city, destination_city :
        Values compared for equality against ``o_city`` and ``d_city``.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels and columns.
    """
    leaves_from_origin = tickets_df['o_city'] == origin_city
    arrives_at_destination = tickets_df['d_city'] == destination_city

    # Both conditions must hold on the same row.
    return tickets_df.loc[leaves_from_origin & arrives_at_destination]

In [207]:
# Getting the minimum and the maximum price of the selected trip and the mean of the prices

def getPricesBounds(trips_df):
    """Summarise the ``price_in_cents`` column of ``trips_df``.

    Returns
    -------
    tuple
        ``(minimum, mean, maximum)`` of the column, each divided by 100.
    """
    cents = trips_df['price_in_cents']
    lowest, average, highest = cents.min(), cents.mean(), cents.max()

    # The column is expressed in cents; divide by 100 for the main unit.
    return lowest / 100, average / 100, highest / 100

In [154]:
# No installation needed for datetime
#
# `datetime` is part of the Python standard library, so `import datetime`
# works on every Python installation. `pip install datetime` does not install
# that module: PyPI project names are case-insensitive, so the command would
# fetch the unrelated third-party "DateTime" distribution instead.



In [250]:
import datetime

# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
# https://pandas.pydata.org/pandas-docs/stable/user_guide/timedeltas.html

# Adds a duration column, parsing the departure and arrival timestamps to datetime in order to compute it

def addDuration(trips_df):
    """Return ``trips_df`` extended with a ``duration_tdelt`` column.

    The duration of each row is ``arrival_ts - departure_ts`` after both
    columns have been parsed with ``pandas.to_datetime``.

    Parameters
    ----------
    trips_df : pandas.DataFrame
        Frame with the columns ``departure_ts`` and ``arrival_ts``.

    Returns
    -------
    pandas.DataFrame
        A new frame: every original column unchanged (the timestamp columns
        are NOT replaced by their parsed version) plus ``duration_tdelt``.
        ``trips_df`` itself is not modified.
    """
    # Parse each column directly rather than through DataFrame.apply: on a
    # frame with no rows, apply returns the columns unparsed, so the
    # subtraction below would not produce a timedelta column.
    departure_s = pd.to_datetime(trips_df['departure_ts'])
    arrival_s = pd.to_datetime(trips_df['arrival_ts'])

    # Series of durations, named after the column it becomes
    duration_s = (arrival_s - departure_s).rename('duration_tdelt')

    # Concatenate column-wise; rows are aligned on the index labels
    return pd.concat([trips_df, duration_s], axis = 1)

In [247]:
def getDurationBounds(trips_df):
    """Return the shortest duration in ``trips_df``.

    Parameters
    ----------
    trips_df : pandas.DataFrame
        Frame with a ``duration_tdelt`` column.

    Returns
    -------
    The minimum of the ``duration_tdelt`` column. Missing values are
    ignored; if the column has no valid value the result is ``NaT``.
    """
    # Select the column with []: `.loc['duration_tdelt']` would search for
    # that name among the ROW labels and raise KeyError.
    durations = trips_df['duration_tdelt']

    # Series.min() skips missing values and does not raise on an empty
    # column, unlike the builtin min().
    minDuration = durations.min()

    # TODO: also return the mean and the maximum (durations.mean(),
    # durations.max()) once callers are ready to unpack three values.
    return minDuration

In [256]:
# Trips for the city pair under study: origin 628, destination 453
myTrip = getTrips(tickets_df, origin_city=628, destination_city=453)

# Price summary of those trips (minimum, mean, maximum)
minPrice, avgPrice, maxPrice = getPricesBounds(myTrip)
print("For the trip from town A to town B, the prices are going from {:.2f}€ to {:.2f}€ and the average price is {:.2f}€".format(minPrice, maxPrice, avgPrice))

# Extend every selected trip with its duration
myTrip = addDuration(myTrip)

# Check the column types after the duration column was added
print(myTrip.dtypes)

# Duration summary: left disabled for now
#minDuration, avgDuration, maxDuration = getDurationBounds(myTrip)

For the trip from town A to town B, the prices are going from 10.00€ to 134.50€ and the average price is 20.31€
id                           int64
company                      int64
o_station                  float64
d_station                  float64
departure_ts                object
arrival_ts                  object
price_in_cents               int64
search_ts                   object
middle_stations             object
other_companies             object
o_city                       int64
d_city                       int64
duration_tdelt     timedelta64[ns]
dtype: object
