In [None]:
# Standard library
import os

# Data extraction tools
import requests
from bs4 import BeautifulSoup

#Data wrangling tools
import pandas as pd
import numpy as np

#DataBase tools
import psycopg2
import psycopg2.extras as extras

# Data Extraction

In [None]:
year_html = None  # module-level cache: assigned by soup(year), read by single_month_links_df

In [None]:
def soup(year):
    """Download the TLC trip-record page and cache the section for ``year``.

    The ``<div id="faq{year}">`` element of the page is stored in the
    module-level ``year_html`` variable, which ``single_month_links_df`` reads.
    Nothing is returned.

    Parameters
    ----------
    year : int or str
        Interpolated into the element id ``faq{year}`` that is looked up.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page contains no ``<div>`` with id ``faq{year}``. In that case
        ``year_html`` keeps whatever value it had before the call.
    """
    global year_html

    url = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

    print(f"Asking for http content from {url}")
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    print(f"Filtering by year provided: {year}...")
    section = page.find("div", id=f"faq{year}")
    if section is None:
        raise ValueError(f"no section with id 'faq{year}' found at {url}")
    year_html = section
    print(f"html generated")

In [None]:
def single_month_links_df(year, month):
    """Return the download links listed under one month heading.

    Reads the module-level ``year_html`` element (assigned by ``soup``), finds
    the ``<b>`` heading whose stripped text equals ``month`` and collects the
    anchors of the ``<ul>`` that follows it.

    Parameters
    ----------
    year
        Not used by this function: the section searched is whatever ``soup``
        last stored in ``year_html``. Kept so existing calls keep working.
    month : str
        Heading text to look up, spelled as it appears on the page.

    Returns
    -------
    pandas.DataFrame
        Columns ``month`` (the requested month repeated), ``links`` (``href``
        of each anchor) and ``description`` (anchor text).

    Raises
    ------
    RuntimeError
        If ``year_html`` is still ``None`` (``soup`` has not been called).
    ValueError
        If ``month`` is not among the headings, or no ``<ul>`` follows it.
    """
    if year_html is None:
        raise RuntimeError("year_html is empty: call soup(year) before requesting month links")

    # Pair each heading with its stripped text once, so the validation and the
    # lookup below use the same normalisation of the heading text.
    headings = [(tag.get_text(strip=True), tag) for tag in year_html.find_all('b')]
    months_available = [label for label, _ in headings]

    print(f"Checking month requested {month} vs months available in the html content...")
    if month not in months_available:
        raise ValueError(f"month provided does not match with available months, check spelling {months_available}")

    print(f"Month provided is valid. Filtering by month...")
    heading = next(tag for label, tag in headings if label == month)
    filter_by_month = heading.find_next('ul')
    if filter_by_month is None:
        raise ValueError(f"no list of links follows the {month} heading")

    # Skip list items that carry no usable anchor instead of crashing on them.
    anchors = [item.a for item in filter_by_month.find_all('li')]
    anchors = [anchor for anchor in anchors if anchor is not None and anchor.get('href')]
    links = [anchor['href'] for anchor in anchors]
    description = [anchor.text for anchor in anchors]

    single_month_links = pd.DataFrame({'month': month, 'links': links, 'description': description})

    # Side effect kept from the original: changes the global pandas display
    # option for column width so long links are not truncated when shown.
    pd.set_option('display.max_colwidth', None)

    return single_month_links

In [None]:
def all_months_links_df(year, *month):
    """Stack the link tables of several months into one DataFrame.

    Calls ``single_month_links_df(year, m)`` for every ``m`` in ``month``, in
    the order given, and concatenates the results with a fresh 0..n-1 index.

    Parameters
    ----------
    year
        Forwarded unchanged to ``single_month_links_df``.
    *month : str
        Month names to collect.

    Returns
    -------
    pandas.DataFrame
        The concatenated per-month tables, or an empty DataFrame when no month
        is passed.
    """
    frames = [single_month_links_df(year, name) for name in month]

    # DataFrame.append no longer exists in pandas 2.x; concatenating once is
    # also cheaper than growing a frame inside the loop. pd.concat rejects an
    # empty list, so the "no months" case is handled explicitly.
    links = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    print("Generating csv links table")

    return links

# Data Wrangling

def csv_to_df(year, taxi_color, month):
    """Load one month of trip records for one taxi colour into a DataFrame.

    Refreshes the cached page section with ``soup(year)``, picks the first
    link of ``month`` whose description contains ``taxi_color`` and reads it
    with ``pandas.read_csv``.

    NOTE(review): a later cell defines ``csv_to_df`` again with a ``*month``
    signature; once both cells have run, that later definition is the one
    bound to this name.

    Parameters
    ----------
    year
        Passed to ``soup`` and ``single_month_links_df``.
    taxi_color : str
        Pattern searched for in the link descriptions (``str.contains``).
    month : str
        Month heading to read.

    Returns
    -------
    pandas.DataFrame
        The CSV content, with every object-dtype column except the last one
        converted by ``pandas.to_datetime``.

    Raises
    ------
    ValueError
        If no link description of that month contains ``taxi_color``.
    """
    soup(year)  # refresh the module-level year_html for the requested year

    print(f"Month requested: {month}")
    month_links = single_month_links_df(year, month)

    matching = month_links.loc[month_links['description'].str.contains(taxi_color), 'links']
    if matching.empty:
        raise ValueError(f"no link description contains {taxi_color!r} for {month} {year}")
    # Positional access: filtered rows keep their original labels, so the
    # first match is only labelled 0 when it was also the first row overall.
    link = matching.iloc[0]
    print(f"Reading csv {link}")

    month_df = pd.read_csv(link)
    # All object-dtype columns but the last are converted to datetimes.
    # NOTE(review): assumes the last object column is the only non-date text
    # column and that the dates match the given format — confirm against data.
    # Slicing (instead of pop) also copes with a file that has no such column.
    date_columns = month_df.select_dtypes(include='object').columns.to_list()[:-1]
    for column in date_columns:
        month_df[column] = pd.to_datetime(month_df[column], yearfirst=True, format="%Y/%m/%d")

    return month_df

In [None]:
def csv_to_df(year, taxi_color, *month):
    """Load several months of trip records for one taxi colour.

    Refreshes the cached page section with ``soup(year)``, collects the links
    of every requested month with ``all_months_links_df``, keeps those whose
    description contains ``taxi_color`` and concatenates the CSV files.

    Parameters
    ----------
    year
        Passed to ``soup`` and ``all_months_links_df``.
    taxi_color : str
        Pattern searched for in the link descriptions (``str.contains``).
    *month : str
        One or more month headings to read.

    Returns
    -------
    pandas.DataFrame
        All matching files stacked with a fresh index (empty when no link
        matches), with every object-dtype column except the last one converted
        by ``pandas.to_datetime``.

    Raises
    ------
    ValueError
        If no month is given.
    """
    if not month:
        raise ValueError("at least one month is required")

    soup(year)  # refresh the module-level year_html for the requested year

    print(f"list of months requested: {month}")
    # The original called create_links_table, a name this notebook never
    # defines; all_months_links_df is the helper with that (year, *month) shape.
    links_table = all_months_links_df(year, *month)
    links_table = links_table[links_table['description'].str.contains(taxi_color)]['links']
    print(links_table)

    frames = []
    for elem in links_table:
        print(f"Reading csv {elem}")
        frames.append(pd.read_csv(elem))
        print("Completed")

    # DataFrame.append no longer exists in pandas 2.x; concatenate once instead.
    final_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # All object-dtype columns but the last are converted to datetimes.
    # NOTE(review): assumes the last object column is the only non-date text
    # column and that the dates match the given format — confirm against data.
    # Slicing (instead of pop) also copes with a frame that has no such column.
    date_columns = final_df.select_dtypes(include='object').columns.to_list()[:-1]
    for elem in date_columns:
        final_df[elem] = pd.to_datetime(final_df[elem], yearfirst=True, format="%Y/%m/%d")

    # To persist the result, call .to_csv(...) on the returned frame.
    return final_df

In [None]:
#yellowtaxis_2020_january = csv_to_df(2020, 'Yellow', 'January')
#yellowtaxis_2020_january

In [None]:
#tuples = [tuple(x) for x in yellowtaxis_2020_january.to_numpy()]
#tuples[:2]

In [None]:
# NOTE(review): `asdafa` is not defined anywhere in this notebook, so this cell
# raises NameError and stops "Run All" before the database cells below —
# presumably a deliberate stop; confirm, then delete it or replace it with an
# explicit guard.
asdafa

# DataBase creation

In [None]:
# Connect to the default `postgres` database in order to create `taxis`.
# The password is read from the PGPASSWORD environment variable so that no
# secret is stored in the notebook.
conn = psycopg2.connect(
    database="postgres",
    user="postgres",
    password=os.environ.get("PGPASSWORD"),
    host="127.0.0.1",
    port="5432",
)

# PostgreSQL refuses CREATE DATABASE inside a transaction block, so statements
# on this connection must be committed one by one.
conn.autocommit = True

# Query that creates the database
sql = '''CREATE database taxis'''

try:
    cursor = conn.cursor()
    # Skip creation when the database already exists so the cell can be re-run.
    cursor.execute("SELECT 1 FROM pg_database WHERE datname = %s", ("taxis",))
    if cursor.fetchone() is None:
        cursor.execute(sql)
        print("Database created successfully........")
    else:
        print("Database taxis already exists, nothing to create")
finally:
    # Always release the connection, even when a statement fails.
    conn.close()

In [None]:
# DataBase connection

In [None]:
# Connection settings for the `taxis` database, passed to connect().
# The password is read from the PGPASSWORD environment variable (None when it
# is unset) so that no secret is stored in the notebook.
# NOTE(review): a literal password was previously committed here — rotate it.
param_dic = {
    "host"      : "127.0.0.1",
    "database"  : "taxis",
    "user"      : "postgres",
    "password"  : os.environ.get("PGPASSWORD"),
}

In [None]:
def connect(params_dic):
    """Connect to the PostgreSQL database server.

    Parameters
    ----------
    params_dic : dict
        Keyword arguments forwarded to ``psycopg2.connect``.

    Returns
    -------
    The connection object returned by ``psycopg2.connect``.

    Raises
    ------
    SystemExit
        With exit status 1 when the connection attempt fails. The underlying
        error is printed first and chained as the cause.
    """
    try:
        # connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        conn = psycopg2.connect(**params_dic)
    except Exception as error:
        print(error)
        # Same effect as sys.exit(1), but without needing `sys`, which this
        # notebook never imports (the original call raised NameError instead).
        raise SystemExit(1) from error
    print('Connection succesful')

    return conn

In [None]:
# Open the module-level connection to the `taxis` database; creating_table() uses this `conn`.
conn = connect(param_dic)

# Creating Table in the DataBase

In [None]:
def creating_table():
    """Create the ``yellowtaxis`` table, then close the module-level connection.

    Works on the module-level ``conn``. On success the statement is committed
    and ``conn`` is closed, so ``connect(param_dic)`` must be called again
    before any further database work. On failure the transaction is rolled
    back, ``conn`` is left open and the error is re-raised. The cursor is
    closed in both cases.
    """
    cursor = conn.cursor()

    sql ='''CREATE TABLE yellowtaxis(
       VendorID float,
       tpep_pickup_datetime timestamp,
       tpep_dropoff_datetime timestamp,
       passenger_count float,
       trip_distance float,
       RatecodeID float,
       store_and_fwd_flag varchar,
       PULocationID float,
       DOLocationID float,
       payment_type float,
       fare_amount float,
       extra float,
       mta_tax float,
       tip_amount float,
       tolls_amount float,
       improvement_surcharge float,
       total_amount float,
       congestion_surcharge float
    );'''

    try:
        cursor.execute(sql)
        conn.commit()
        print("Table created")
    except Exception:
        # Leave the connection in a clean state instead of an aborted transaction.
        conn.rollback()
        raise
    finally:
        cursor.close()
        print("Cursor object closed")

    conn.close()
    print("conn closed")

In [None]:
# Actually invoke the function: the bare name only displayed the function
# object and never created the table.
creating_table()

# DataBase insertion

In [4]:
def execute_batch(#conn,
                  #table: str,
                  #page_size=100,
                  year: int,
                  taxi_color: str,
                  *month):
    """Stub: prints the tuple of requested months and returns None.

    ``year`` and ``taxi_color`` are accepted but not used by the active code.
    The insertion logic is not executed: it sits in the triple-quoted string
    below, which is an inert expression statement.

    NOTE(review) before enabling that draft:
    - it reads ``conn``, ``table`` and ``page_size``, which are commented out
      of the signature above;
    - it passes ``month`` to ``csv_to_df`` as one tuple rather than unpacking
      it with ``*month``;
    - the INSERT statement hard-codes 18 placeholders and interpolates the
      table and column names directly into the SQL text.
    """
    
    print(month)
    # Draft implementation, disabled by being wrapped in a string literal.
    '''
    df = csv_to_df(year, taxi_color, month)
    
    print("Converting DataFrame rows into tuples")
    tuples = [tuple(x) for x in df.to_numpy()]
    # Comma-separated dataframe columns
    cols = ','.join(list(df.columns))
    # SQL quert to execute
    query  = "INSERT INTO %s(%s) VALUES(%%s,%%s,%%s,%%s,%%s,%%s,%%s,%%s,%%s,%%s,%%s,%%s,%%s,%%s,%%s,%%s,%%s,%%s)" % (table, cols)
    cursor = conn.cursor()
    try:
        print(f"Inserting values into the {table} table")
        extras.execute_batch(cursor, query, tuples, page_size)
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        return 1
    print(f"Batch executed succesfully, all dataframe values inserted into the {table} table at {cursor.connection}")
    cursor.close()
    
    '''
    


In [3]:
# With the current stub this only prints the tuple of month names.
# NOTE(review): the NameError recorded below comes from running this cell
# (execution count 3) before the cell that defines execute_batch (count 4).
execute_batch(2020, 'Yellow', 'January', 'February', 'March')

NameError: name 'execute_batch' is not defined

In [None]:
def execute_multiple_batch(year: int, taxi_color: str, *months, conn, df, table, page_size=100):
    """Placeholder, not implemented: the body is ``pass``, so a call returns None.

    ``conn``, ``df`` and ``table`` follow ``*months`` and have no default, so
    they must be passed by keyword; ``page_size`` defaults to 100.
    """
    pass