In [None]:
# Standard library
import sys

# Data extraction tools
import requests
from bs4 import BeautifulSoup

#Data wrangling tools
import pandas as pd
import numpy as np

#DataBase tools
import psycopg2
import psycopg2.extras as extras

# Data Extraction

In [None]:
# Module-level cache for the parsed HTML section of one year.
# soup() assigns it and single_month_links_df() reads it; it stays None until soup() has run.
year_html = None

In [None]:
def soup(year):
    """Download the TLC trip-record page and cache the section for ``year``.

    The page is fetched with ``requests`` and parsed with BeautifulSoup. The
    ``<div>`` whose id is ``faq<year>`` is stored in the module-level
    ``year_html`` variable, which ``single_month_links_df`` reads afterwards.
    Nothing is returned.

    Parameters
    ----------
    year : int or str
        Year to select; it is interpolated into the element id ``faq{year}``.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or the server answers with an HTTP
        error status.
    IndexError
        If the page has no ``<div id="faq<year>">`` section.
    """
    global year_html

    url = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

    print(f"Asking for http content from {url}")
    # The timeout keeps the cell from hanging forever on a stalled connection;
    # raise_for_status() stops an HTTP error page from being parsed as data.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    print(f"Filtering by year provided: {year}...")
    section = page.find("div", id=f"faq{year}")
    if section is None:
        # IndexError is what the former `find_all(...)[0]` lookup raised, so the
        # exception type is unchanged; only the message is now explicit.
        raise IndexError(f"no section with id 'faq{year}' found at {url}")

    year_html = section
    print("html generated")

In [None]:
def single_month_links_df(year, month):
    """Build a table of the download links listed under one month heading.

    Reads the module-level ``year_html`` section, so ``soup(year)`` must have
    been called beforehand. Month headings are the ``<b>`` elements of that
    section; the links are the anchors inside the first ``<ul>`` that follows
    the matching heading.

    Parameters
    ----------
    year :
        Not used by this function; kept so existing callers keep working.
    month : str
        Month heading to look up, compared against the whitespace-stripped
        text of each ``<b>`` element.

    Returns
    -------
    pandas.DataFrame
        Columns ``month`` (the requested month repeated on every row),
        ``links`` (each anchor's ``href``) and ``description`` (anchor text).

    Raises
    ------
    ValueError
        If ``month`` is not one of the headings found in ``year_html``.
    """
    bold_tags = year_html.find_all('b')
    months_available = [tag.get_text(strip=True) for tag in bold_tags]

    print(f"Checking month requested {month} vs months available in the html content...")
    if month not in months_available:
        raise ValueError(f"month provided does not match with available months, check spelling {months_available}")

    print(f"Month provided is valid. Filtering by month...")
    # Use the very <b> tag that passed the validation above. Searching again
    # by exact string could miss a heading with surrounding whitespace.
    heading = bold_tags[months_available.index(month)]
    # find_next is the current spelling of the deprecated findNext alias
    month_list = heading.find_next('ul')

    # Walk the <li> items once and derive both columns from the same anchors
    anchors = [item.a for item in month_list.find_all('li')]
    links = [anchor['href'] for anchor in anchors]
    description = [anchor.text for anchor in anchors]

    single_month_links = pd.DataFrame({'month': month, 'links': links, 'description': description})

    # Global pandas display option: show full urls instead of truncated cells
    pd.set_option('display.max_colwidth', None)

    return single_month_links

In [None]:
def all_months_links_df(year, *month):
    """Stack the link tables of several months into a single DataFrame.

    Parameters
    ----------
    year :
        Forwarded unchanged to ``single_month_links_df``.
    *month : str
        Month headings to collect, processed in the order given.

    Returns
    -------
    pandas.DataFrame
        The per-month tables concatenated with a fresh 0..n-1 index, or an
        empty DataFrame when no month is passed.
    """
    monthly_tables = [single_month_links_df(year, name) for name in month]

    # DataFrame.append was removed in pandas 2.0, and growing a frame inside a
    # loop copies it on every step. pd.concat does it once but rejects an
    # empty list, so keep the original empty-frame result for that case.
    if monthly_tables:
        links = pd.concat(monthly_tables, ignore_index=True)
    else:
        links = pd.DataFrame()

    print("Generating csv links table")

    return links

# Data Wrangling

In [None]:
def time_lapse(seconds):
    """Render a duration in seconds as whole days, hours and minutes.

    Any remainder smaller than one minute is dropped. The result looks like
    ``"1 days, 2 hours, 3 minutes"``.
    """
    # Peel off the largest unit first and carry the remainder down
    days, remainder = divmod(seconds, 24 * 60 * 60)
    hours, remainder = divmod(remainder, 60 * 60)
    minutes = remainder // 60

    return f"{days} days, {hours} hours, {minutes} minutes"

In [None]:
def _column_with_suffix(df, suffix):
    """Return the name of the first column of ``df`` whose name ends with ``suffix``."""
    for column in df.columns:
        if str(column).endswith(suffix):
            return column
    raise KeyError(f"no column ending with '{suffix}' found in {list(df.columns)}")


def csv_to_df(year, taxi_color, *month):
    """Download the monthly trip csv files and return one cleaned DataFrame.

    Steps: refresh the cached page section with ``soup(year)``, collect the
    links of the requested months, keep those whose description contains
    ``taxi_color``, read every csv, tag its rows with the month, concatenate,
    convert the text timestamp columns to datetimes and add two duration
    columns.

    Parameters
    ----------
    year : int or str
        Year passed to ``soup`` and to the link helpers.
    taxi_color : str
        Text that a link description must contain (passed to
        ``Series.str.contains``), e.g. ``'Yellow'``.
    *month : str
        Month names as they appear on the page, e.g. ``'January'``.

    Returns
    -------
    pandas.DataFrame
        All rows of the selected files, with ``month`` at position 1 as the
        month number, ``trip_duration_seconds`` at position 4 and
        ``trip_duration`` (text built by ``time_lapse``) at position 5.

    Raises
    ------
    ValueError
        If no link matches ``taxi_color`` for the requested months.
    KeyError
        If the data has no pickup/dropoff datetime column.
    """
    soup(year)  # refreshes the module-level year_html used by the link helpers

    print(f"list of months requested: {month}")
    links_table = all_months_links_df(year, *month)
    links_table = links_table[links_table['description'].str.contains(taxi_color)][['month','links']]
    print(links_table['links'])

    print("Starting parsing process:")
    monthly_frames = []
    # Take the month from the same row as the link. Pairing the i-th link with
    # the i-th requested month breaks as soon as a month contributes zero or
    # several matching files.
    for month_name, link in zip(links_table['month'], links_table['links']):
        print(f"Parsing csv {link}")
        df = pd.read_csv(link)
        df.insert(1, 'month', month_name)
        monthly_frames.append(df)
        print("Parsing completed")

    if not monthly_frames:
        raise ValueError(f"no csv link matching '{taxi_color}' found for months {month}")

    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    final_df = pd.concat(monthly_frames, ignore_index=True)

    # Every remaining text column is treated as a timestamp, as before. The two
    # known non-timestamp columns are skipped without failing when absent.
    not_timestamps = ('store_and_fwd_flag', 'month')
    text_columns = final_df.select_dtypes(include='object').columns
    for column in text_columns:
        if column in not_timestamps:
            continue
        # No explicit `format`: pandas >= 2.0 enforces it strictly, so the
        # former "%Y/%m/%d ..." pattern only parses slash-separated values.
        # NOTE(review): the files presumably hold ISO-like timestamps - confirm.
        final_df[column] = pd.to_datetime(final_df[column], yearfirst=True)

    final_df['month'] = pd.to_datetime(final_df['month'], format="%B").dt.month

    # Locate the timestamp pair by suffix instead of the hard-coded "tpep_"
    # prefix, so data sets using another prefix are handled too.
    pickup = _column_with_suffix(final_df, 'pickup_datetime')
    dropoff = _column_with_suffix(final_df, 'dropoff_datetime')

    # total_seconds() gives the same numbers on every pandas version, whereas
    # astype('timedelta64[s]') changed meaning between pandas 1.x and 2.x.
    trip_duration_seconds = (final_df[dropoff] - final_df[pickup]).dt.total_seconds().astype('int')
    trip_duration = trip_duration_seconds.apply(time_lapse)

    final_df.insert(4, 'trip_duration_seconds', trip_duration_seconds)
    final_df.insert(5, 'trip_duration', trip_duration)

    return final_df

In [None]:
# Download and clean the Yellow-taxi trips for January to March 2020.
yellowtaxis_2020 = csv_to_df(2020, 'Yellow', 'January', 'February', 'March')
# Relative path: the file is written to the current working directory.
name = "2020_yellow_jan_feb_mar.csv"
# index=False keeps the DataFrame's row index out of the csv file.
yellowtaxis_2020.to_csv(name, index=False)

# DataBase creation

In [None]:
# Create the "taxis" database on the local PostgreSQL server.
# NOTE(review): the credentials are hard-coded; consider reading them from
# environment variables before sharing this notebook.

# Connect to the maintenance database "postgres" to issue the CREATE statement
conn = psycopg2.connect(
   database="postgres", user='snowman', password='snowball', host='127.0.0.1', port= '5432'
)

# CREATE DATABASE cannot run inside a transaction block, so autocommit is required
conn.autocommit = True

# Query that creates the database
sql = '''CREATE database taxis'''

try:
    cursor = conn.cursor()
    try:
        cursor.execute(sql)
        print("Database created successfully !")
    except psycopg2.Error as error:
        # SQLSTATE 42P04 is duplicate_database: the cell was already run, so
        # there is nothing to do. Any other database error is re-raised.
        if error.pgcode != '42P04':
            raise
        print("Database taxis already exists, nothing to create.")
    finally:
        cursor.close()
finally:
    # Always release the connection, even when the statement fails
    conn.close()

# DataBase connection

In [None]:
# Connection settings for the "taxis" database; connect() unpacks this dict
# into psycopg2.connect(**params).
# NOTE(review): user and password are hard-coded; consider loading them from
# environment variables before sharing this notebook.
param_dic = {
    "host"      : "127.0.0.1",
    "database"  : "taxis",
    "user"      : "snowman",
    "password"  : "snowball"
}

In [None]:
def connect(params_dic):
    """ Connect to the PostgreSQL database server.

    Parameters
    ----------
    params_dic : dict
        Keyword arguments unpacked into ``psycopg2.connect``.

    Returns
    -------
    connection
        The open psycopg2 connection; the caller is responsible for closing it.

    Raises
    ------
    SystemExit
        With exit code 1 if the connection cannot be opened; the underlying
        error is printed first.
    """
    try:
        # connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        conn = psycopg2.connect(**params_dic)
    except Exception as error:
        # psycopg2 errors derive from Exception, so one clause covers them all
        print(error)
        # `sys` comes from the notebook's import cell; without that import this
        # line raised NameError instead of exiting.
        sys.exit(1)
    print('Connection succesful')

    return conn

# Creating Table in the DataBase

In [None]:
def creating_table(table_name: str):
    """Create the trips table in the database described by ``param_dic``.

    Opens a connection with ``connect(param_dic)``, runs one CREATE TABLE
    statement, commits it, and closes the connection. If the statement fails,
    the transaction is rolled back and the error propagates to the caller.

    Parameters
    ----------
    table_name : str
        Name of the table to create. It is interpolated into the SQL text as
        is, so it must come from a trusted source.
    """
    sql =f'''CREATE TABLE {table_name}(
       VendorID float,
       month int,
       tpep_pickup_datetime timestamp,
       tpep_dropoff_datetime timestamp,
       trip_duration_seconds int,
       trip_duration varchar,
       passenger_count float,
       trip_distance float,
       RatecodeID float,
       store_and_fwd_flag varchar,
       PULocationID float,
       DOLocationID float,
       payment_type float,
       fare_amount float,
       extra float,
       mta_tax float,
       tip_amount float,
       tolls_amount float,
       improvement_surcharge float,
       total_amount float,
       congestion_surcharge float
    );'''

    conn = connect(param_dic)
    try:
        # `with conn` commits when the block succeeds and rolls back when it
        # raises; `with conn.cursor()` closes the cursor on exit.
        with conn, conn.cursor() as cursor:
            print(f"Creating table {table_name}")
            cursor.execute(sql)
            print("Table created")
    finally:
        # The connection context manager ends the transaction but does not
        # close the connection, so release it explicitly.
        conn.close()

# DataBase insertion

In [24]:
def execute_batch(table: str, year: int, taxi_color: str, *month, page_size=100,):
    """Download the requested trip data and insert it into a PostgreSQL table.

    The data is built with ``csv_to_df(year, taxi_color, *month)`` and written
    with ``psycopg2.extras.execute_batch`` over a connection opened from
    ``param_dic``. The table is not created here; it must already exist with
    columns matching the DataFrame.

    Parameters
    ----------
    table : str
        Target table name, interpolated into the SQL text as is.
    year, taxi_color, *month :
        Forwarded unchanged to ``csv_to_df``.
    page_size : int, keyword-only
        Number of statements sent per round trip by
        ``psycopg2.extras.execute_batch``.

    Returns
    -------
    int or None
        ``1`` if the insert failed (the error is printed and the transaction
        rolled back), otherwise ``None``. The cursor and the connection are
        closed in both cases.
    """
    df = csv_to_df(year, taxi_color, *month)

    print("Converting DataFrame rows into tuples")
    tuples = [tuple(x) for x in df.to_numpy()]
    # Comma-separated dataframe columns
    cols = ','.join(df.columns)
    # One %s placeholder per column, so the statement follows the frame's
    # width instead of a hard-coded count of 21.
    placeholders = ','.join(['%s'] * len(df.columns))

    # NOTE(review): table and column names are inserted as plain text; only
    # call this with trusted names.
    query = f"INSERT INTO {table}({cols}) VALUES({placeholders})"

    conn = connect(param_dic)
    # conn.info.dbname reads as None once the connection is closed, so keep
    # the name for the final message.
    dbname = conn.info.dbname
    cursor = conn.cursor()
    try:
        try:
            print(f"Inserting values into the {table} table at {dbname} database. Please wait, this may take a few minutes...")
            extras.execute_batch(cursor, query, tuples, page_size)
            conn.commit()
        except (Exception, psycopg2.DatabaseError) as error:
            print("Error: %s" % error)
            conn.rollback()
            return 1
        print(f"Batch executed succesfully, all dataframe values inserted into the {table} table at {cursor.connection}")
    finally:
        # Runs on success and on failure, so the connection is never leaked
        print("Closing cursor object...")
        cursor.close()
        print("Cursor object closed.")
        print(f"Closing {dbname} database conection...")
        conn.close()
        print(f"{dbname} database conection closed.")
      

In [25]:
# Load the January to March 2020 Yellow-taxi trips into the 'yellowtaxis' table.
# The table must already exist: the creating_table() call inside execute_batch is commented out.
execute_batch('yellowtaxis', 2020, 'Yellow', 'January', 'February', 'March')

Asking for http content from https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page
Filtering by year provided: 2020...
html generated
list of months requested: ('January', 'February', 'March')
Checking month requested January vs months available in the html content...
Month provided is valid. Filtering by month...
Checking month requested February vs months available in the html content...
Month provided is valid. Filtering by month...
Checking month requested March vs months available in the html content...
Month provided is valid. Filtering by month...
Generating csv links table
0    https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-01.csv
4    https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-02.csv
8    https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-03.csv
Name: links, dtype: object
Starting parsing process:
Parsing csv https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-01.csv


  df = csv_to_df(year, taxi_color, *month)


Parsing completed
Parsing csv https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-02.csv
Parsing completed
Parsing csv https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-03.csv
Parsing completed
Converting DataFrame rows into tuples
Connecting to the PostgreSQL database...
Connection succesful
Inserting values into the yellowtaxis table at taxis database. Please wait, this may take a few minutes...
Batch executed succesfully, all dataframe values inserted into the yellowtaxis table at <connection object at 0x7ff56e494580; dsn: 'user=snowman password=xxx dbname=taxis host=127.0.0.1', closed: 0>
Closing cursor object...
Cursor object closed.
Closing taxis database conection...
None database conection closed.
