In [1]:
# Standard library
import os

# Data extraction tools
import requests
from bs4 import BeautifulSoup

#Data wrangling tools
import pandas as pd
import numpy as np

#DataBase tools
import psycopg2
import psycopg2.extras as extras

# Data Extraction

In [2]:
# Module-level slot for the parsed HTML section of the requested year.
# soup() assigns it (via `global`) and single_month_links_df() reads it.
year_html = None

In [3]:
def soup(year):
    """Download the TLC trip-record page and cache the section for ``year``.

    The page is fetched over HTTP and parsed with BeautifulSoup; the
    ``<div id="faq{year}">`` element is then stored in the module-level
    ``year_html`` variable, which ``single_month_links_df`` reads.

    Parameters
    ----------
    year : int or str
        Year to look up; it is interpolated into the div id ``faq{year}``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If the page contains no section for ``year``.
    """
    global year_html

    url = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

    print(f"Asking for http content from {url}")
    # Without a timeout a stalled server would hang the notebook indefinitely.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    print(f"Filtering by year provided: {year}...")
    section = page.find("div", id=f"faq{year}")
    if section is None:
        # Previously this surfaced as a bare "IndexError: list index out of range".
        raise ValueError(f"No section with id 'faq{year}' found at {url}; check the year provided")

    year_html = section
    print("html generated")

In [4]:
def single_month_links_df(year, month):
    """Build a table with the download links published for one month.

    Reads the section cached in the module-level ``year_html`` (set by
    ``soup``), locates the ``<b>`` heading whose text equals ``month`` and
    collects the anchors of the ``<ul>`` that follows it.

    Parameters
    ----------
    year : int or str
        Not used by this function; kept so existing callers keep working.
    month : str
        Month name exactly as it appears in the page headings.

    Returns
    -------
    pandas.DataFrame
        Columns ``month``, ``links`` (href of each anchor) and
        ``description`` (anchor text).

    Raises
    ------
    RuntimeError
        If ``soup`` has not been called yet (``year_html`` is still ``None``).
    ValueError
        If ``month`` is not one of the headings, or no link list follows it.
    """
    if year_html is None:
        raise RuntimeError("No html content loaded: call soup(year) before asking for month links")

    # Index the headings by their stripped text so that validation and lookup
    # use the same key (a raw-string lookup misses headings with padding).
    month_headers = {}
    for header in year_html.find_all(['b']):
        month_headers.setdefault(header.get_text(strip=True), header)
    months_available = list(month_headers)

    print(f"Checking month requested {month} vs months available in the html content...")
    if month not in month_headers:
        raise ValueError(f"month provided does not match with available months, check spelling {months_available}")

    print("Month provided is valid. Filtering by month...")
    filter_by_month = month_headers[month].find_next('ul')
    if filter_by_month is None:
        raise ValueError(f"No list of links found after the {month} heading")

    # Skip list items without a usable anchor instead of failing on them.
    anchors = [
        item.a for item in filter_by_month.find_all('li')
        if item.a is not None and item.a.has_attr('href')
    ]
    links = [anchor['href'] for anchor in anchors]
    description = [anchor.text for anchor in anchors]

    single_month_links = pd.DataFrame({'month': month, 'links': links, 'description': description})

    # Kept from the original: show full URLs when the table is printed.
    pd.set_option('display.max_colwidth', None)

    return single_month_links

In [5]:
def all_months_links_df(year, *month):
    """Stack the link tables of several months into one DataFrame.

    Parameters
    ----------
    year : int or str
        Forwarded to ``single_month_links_df``.
    *month : str
        Month names to look up, in the order they should appear.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-month tables with a fresh
        0..n-1 index; an empty DataFrame when no month is given.
    """
    monthly_tables = [single_month_links_df(year, name) for name in month]

    # DataFrame.append was removed in pandas 2.0 (and copied the whole frame on
    # every iteration); a single concat is the supported replacement.
    # pd.concat rejects an empty list, so keep the "no months -> empty frame" result.
    if monthly_tables:
        links = pd.concat(monthly_tables, ignore_index=True)
    else:
        links = pd.DataFrame()

    print("Generating csv links table")

    return links

# Data Wrangling

def csv_to_df(year, taxi_color ,month):
    """Load one month of trip records for one taxi colour into a DataFrame.

    NOTE: a later cell of this notebook defines ``csv_to_df`` again with a
    ``*month`` signature; when the cells run top to bottom that definition
    replaces this one.

    Parameters
    ----------
    year : int or str
        Year passed to ``soup`` to load the page section.
    taxi_color : str
        Text searched for in the link descriptions (e.g. ``'Yellow'``).
    month : str
        Month name as listed on the page.

    Returns
    -------
    pandas.DataFrame
        The csv content, with the text columns other than
        ``store_and_fwd_flag`` converted to datetimes.

    Raises
    ------
    ValueError
        If no link description contains ``taxi_color``.
    """
    soup(year)

    print(f"Month requested: {month}")
    month_links = single_month_links_df(year, month)

    matching = month_links.loc[month_links['description'].str.contains(taxi_color), 'links']
    if matching.empty:
        raise ValueError(f"No {taxi_color} link found for {month} {year}")
    # Positional access: the boolean filter keeps the original row labels, so
    # label 0 only exists when the very first link happens to match.
    link = matching.iloc[0]
    print(f"Reading csv {link}")

    month_df = pd.read_csv(link)
    object_columns = month_df.select_dtypes(include='object').columns.to_list()
    # Exclude the flag column by name rather than assuming it is the last one.
    datetime_columns = [col for col in object_columns if col != 'store_and_fwd_flag']
    for col in datetime_columns:
        # The timestamps shown in this notebook's output look like
        # "2020-01-01 00:28:15": dash separators plus a time part.
        month_df[col] = pd.to_datetime(month_df[col], yearfirst=True, format="%Y-%m-%d %H:%M:%S")

    return month_df

In [6]:
def csv_to_df(year, taxi_color ,*month):
    """Download the trip-record csv of several months into one DataFrame.

    Parameters
    ----------
    year : int or str
        Year passed to ``soup`` to load the page section.
    taxi_color : str
        Text searched for in the link descriptions (e.g. ``'Yellow'``).
    *month : str
        Month names as listed on the page.

    Returns
    -------
    pandas.DataFrame
        All months stacked with a fresh index. A ``month`` column is inserted
        at position 1, and every text column other than ``month`` and
        ``store_and_fwd_flag`` is converted to datetimes.

    Raises
    ------
    ValueError
        If no month is given or no link description contains ``taxi_color``.
    """
    if not month:
        raise ValueError("At least one month must be provided")

    soup(year)

    print(f"list of months requested: {month}")
    links_table = all_months_links_df(year, *month)
    links_table = links_table[links_table['description'].str.contains(taxi_color)][['month','links']]
    print(links_table)

    if links_table.empty:
        raise ValueError(f"No {taxi_color} csv link found for months {month} of year {year}")

    print("Starting parsing process:")
    monthly_frames = []
    # Take the month label from the same row as the link. Indexing the `month`
    # argument by loop position mislabels rows whenever a month yields zero or
    # several matching links.
    for month_name, link in zip(links_table['month'], links_table['links']):
        print(f"Parsing csv {link}")
        df = pd.read_csv(link)
        df.insert(1, 'month', month_name)
        monthly_frames.append(df)
        print("Parsing completed")

    # DataFrame.append was removed in pandas 2.0; concatenate once at the end.
    final_df = pd.concat(monthly_frames, ignore_index=True)

    # Text columns that are not timestamps; filtering (instead of list.remove)
    # also copes with files that lack one of them.
    text_only_columns = {'store_and_fwd_flag', 'month'}
    datetime_columns = [
        col for col in final_df.select_dtypes(include='object').columns
        if col not in text_only_columns
    ]

    for col in datetime_columns:
        # The timestamps shown in this notebook's output look like
        # "2020-01-01 00:28:15", so the format uses dash separators.
        final_df[col] = pd.to_datetime(final_df[col], yearfirst=True, format="%Y-%m-%d %H:%M:%S")

    return final_df

In [7]:
# Load the yellow-taxi trips of the first quarter of 2020 and display them.
yellowtaxis_2020 = csv_to_df(
    2020,
    'Yellow',
    'January', 'February', 'March',
)
yellowtaxis_2020

Asking for http content from https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page
Filtering by year provided: 2020...
html generated
list of months requested: ('January', 'February', 'March')
Checking month requested January vs months available in the html content...
Month provided is valid. Filtering by month...
Checking month requested February vs months available in the html content...
Month provided is valid. Filtering by month...
Checking month requested March vs months available in the html content...
Month provided is valid. Filtering by month...
Generating csv links table
      month  \
0   January   
4  February   
8     March   

                                                                    links  
0  https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-01.csv  
4  https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-02.csv  
8  https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-03.csv  
Starting parsing process:
Parsing csv h

  if (await self.run_code(code, result,  async_=asy)):


Parsing completed
Parsing csv https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-02.csv
Parsing completed
Parsing csv https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-03.csv
Parsing completed


Unnamed: 0,VendorID,month,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1.0,January,2020-01-01 00:28:15,2020-01-01 00:33:03,1.0,1.20,1.0,N,238,239,1.0,6.00,3.0,0.5,1.47,0.0,0.3,11.27,2.5
1,1.0,January,2020-01-01 00:35:39,2020-01-01 00:43:04,1.0,1.20,1.0,N,239,238,1.0,7.00,3.0,0.5,1.50,0.0,0.3,12.30,2.5
2,1.0,January,2020-01-01 00:47:41,2020-01-01 00:53:52,1.0,0.60,1.0,N,238,238,1.0,6.00,3.0,0.5,1.00,0.0,0.3,10.80,2.5
3,1.0,January,2020-01-01 00:55:23,2020-01-01 01:00:14,1.0,0.80,1.0,N,238,151,1.0,5.50,0.5,0.5,1.36,0.0,0.3,8.16,0.0
4,2.0,January,2020-01-01 00:01:58,2020-01-01 00:04:16,1.0,0.00,1.0,N,193,193,2.0,3.50,0.5,0.5,0.00,0.0,0.3,4.80,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15711649,,March,2020-03-31 23:21:00,2020-03-31 23:33:00,,5.43,,,137,87,,30.47,0.0,0.5,0.00,0.0,0.3,33.77,2.5
15711650,,March,2020-03-31 23:57:00,2020-04-01 00:26:00,,13.21,,,137,71,,37.97,0.0,0.5,0.00,0.0,0.3,41.27,2.5
15711651,,March,2020-03-31 23:22:01,2020-03-31 23:43:52,,12.14,,,137,32,,37.10,0.0,0.0,0.00,0.0,0.3,39.90,2.5
15711652,,March,2020-03-31 23:18:53,2020-03-31 23:32:21,,7.01,,,137,159,,20.07,0.0,0.0,0.00,0.0,0.3,22.87,2.5


In [10]:
# Trip duration: drop-off timestamp minus pick-up timestamp.
diff = yellowtaxis_2020['tpep_dropoff_datetime'] - yellowtaxis_2020['tpep_pickup_datetime']

# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail when run a second time; overwrite the values in that case.
if 'time_lapse' in yellowtaxis_2020.columns:
    yellowtaxis_2020['time_lapse'] = diff
else:
    yellowtaxis_2020.insert(3, 'time_lapse', diff)

In [13]:
# Show only the duration column, longest trips first.
# The recorded output of this cell ends with negative durations (drop-off
# earlier than pick-up), so the raw data needs cleaning before this is used.
(
    yellowtaxis_2020
    .loc[:, ['time_lapse']]
    .sort_values(by='time_lapse', ascending=False)
)

Unnamed: 0,time_lapse
4049543,5 days 22:05:07
4704348,3 days 20:29:55
5800493,3 days 14:11:05
5812471,3 days 14:10:19
944735,2 days 21:28:00
...,...
15685698,-1 days +23:59:07
540040,-2 days +01:49:38
7343280,-2 days +00:19:02
8497823,-7 days +01:29:20


# Database creation

In [None]:
# Create the "taxis" database through the default "postgres" maintenance database.
# The password is read from the PGPASSWORD environment variable so that no
# credential is stored in the notebook. psycopg2 ignores keyword arguments whose
# value is None, so an unset variable simply leaves the password unspecified.
conn = psycopg2.connect(
    database="postgres",
    user='postgres',
    password=os.environ.get("PGPASSWORD"),
    host='127.0.0.1',
    port='5432',
)

try:
    # CREATE DATABASE cannot run inside a transaction block.
    conn.autocommit = True

    #Preparing query to create a database
    sql = '''CREATE database taxis'''

    with conn.cursor() as cursor:
        # Check first so that re-running the notebook does not fail on an
        # already existing database.
        cursor.execute("SELECT 1 FROM pg_database WHERE datname = %s", ("taxis",))
        if cursor.fetchone() is None:
            cursor.execute(sql)
            print("Database created successfully........")
        else:
            print("Database taxis already exists, nothing to create")
finally:
    # Close the connection even when one of the statements above fails.
    conn.close()

# Database connection

In [None]:
# Connection settings handed to connect().
# The password comes from the PGPASSWORD environment variable rather than being
# stored in the notebook; psycopg2 ignores keyword arguments whose value is None.
param_dic = {
    "host"      : "127.0.0.1",
    "database"  : "taxis",
    "user"      : "postgres",
    "password"  : os.environ.get("PGPASSWORD"),
}

In [None]:
def connect(params_dic):
    """ Connect to the PostgreSQL database server

    Parameters
    ----------
    params_dic : dict
        Keyword arguments forwarded unchanged to ``psycopg2.connect``.

    Returns
    -------
    connection
        The open psycopg2 connection; the caller is responsible for closing it.

    Raises
    ------
    Exception
        Whatever ``psycopg2.connect`` raised, re-raised after being printed.
    """
    # connect to the PostgreSQL server
    print('Connecting to the PostgreSQL database...')
    try:
        conn = psycopg2.connect(**params_dic)
    except Exception as error:
        print(error)
        # The original called sys.exit(1) here, but `sys` is never imported in
        # this notebook, so the failure path raised NameError. Re-raising keeps
        # the real error and its traceback visible.
        raise
    print('Connection succesful')

    return conn

# Creating the table in the database

In [None]:
def creating_table(table_name: str):
    """Create the trip-record table in the database described by ``param_dic``.

    Parameters
    ----------
    table_name : str
        Name of the table to create. It is interpolated into the SQL text
        as-is, so only pass trusted names.

    Raises
    ------
    psycopg2.Error
        If the statement fails, for example because the table already exists.
    """
    conn = connect(param_dic)

    sql =f'''CREATE TABLE {table_name}(
       VendorID float,
       month varchar,
       tpep_pickup_datetime timestamp,
       tpep_dropoff_datetime timestamp,
       passenger_count float,
       trip_distance float,
       RatecodeID float,
       store_and_fwd_flag varchar,
       PULocationID float,
       DOLocationID float,
       payment_type float,
       fare_amount float,
       extra float,
       mta_tax float,
       tip_amount float,
       tolls_amount float,
       improvement_surcharge float,
       total_amount float,
       congestion_surcharge float
    );'''

    try:
        # Read the name while the connection is still open; the original read
        # it after close() for the final message.
        db_name = conn.info.dbname
        cursor = conn.cursor()
        try:
            cursor.execute(sql)
            print("Table created")
            conn.commit()
        finally:
            cursor.close()
        print("Cursor object closed")
    finally:
        # Always release the connection, including when the statement fails.
        conn.close()
    print(f"{db_name} database conection closed")

In [None]:
# Create the destination table for the yellow-taxi records.
creating_table(table_name='yellowtaxis')

# Database insertion

In [None]:
def execute_batch(table: str, year: int, taxi_color: str, *month, page_size=100,):
    """Download trip records and insert them into a PostgreSQL table.

    The data is obtained with ``csv_to_df(year, taxi_color, *month)`` and
    written with ``psycopg2.extras.execute_batch`` over a connection opened
    from ``param_dic``.

    Parameters
    ----------
    table : str
        Destination table. It is interpolated into the SQL text as-is, so
        only pass trusted names.
    year : int
        Year forwarded to ``csv_to_df``.
    taxi_color : str
        Taxi colour forwarded to ``csv_to_df``.
    *month : str
        Month names forwarded to ``csv_to_df``.
    page_size : int, optional
        Number of statements sent per round trip (default 100).

    Returns
    -------
    int or None
        ``1`` when the insertion failed and was rolled back, ``None`` on success.
    """
    df = csv_to_df(year, taxi_color, *month)

    print("Converting DataFrame rows into tuples")
    # Lazy row iterator: execute_batch consumes it page by page, so the whole
    # frame is never duplicated as a list of tuples in memory.
    tuples = df.itertuples(index=False, name=None)
    # Comma-separated dataframe columns
    cols = ','.join(list(df.columns))
    # One placeholder per column instead of a hard-coded run of 19.
    placeholders = ','.join(['%s'] * len(df.columns))
    # SQL query to execute
    query = "INSERT INTO %s(%s) VALUES(%s)" % (table, cols, placeholders)

    conn = connect(param_dic)
    try:
        cursor = conn.cursor()
        try:
            print(f"Inserting values into the {table} table at {conn.info.dbname} database. Please wait, this may take a few minutes...")
            extras.execute_batch(cursor, query, tuples, page_size)
            conn.commit()
        except Exception as error:
            print("Error: %s" % error)
            conn.rollback()
            return 1
        else:
            print(f"Batch executed succesfully, all dataframe values inserted into the {table} table at {cursor.connection}")
        finally:
            cursor.close()
        print("Cursor object closed")
    finally:
        # The original left the connection open on the error path.
        conn.close()
    print(f"Conection {conn} closed")
      

In [None]:
# Download the January-March 2020 yellow-taxi files and load them into "yellowtaxis".
execute_batch(
    'yellowtaxis',
    2020,
    'Yellow',
    'January', 'February', 'March',
)