<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [14]</a>'.</span>

# Papermill Report Generator

In [1]:
import os
import pandas as pd
import numpy as np
import plotnine as pn
import seaborn as sns
import datetime as dt
import matplotlib.pyplot as plt
import pdfkit



In [2]:
#Check Dataframe Utility function
def check_df(dataframe, sample=False):
    
    print(f"Dataframe Shape: {dataframe.shape} with rows: {dataframe.shape[0]} and columns: {dataframe.shape[1]}")
    print(f"\nDF Columns: \n{list(dataframe.columns)}")
    if sample == True:
        print(f"\nData:\n{dataframe.head(5)}")
    
    return None

In [3]:
#Define the default parameters
analysis = "listings"

In [4]:
# Parameters
analysis = "listings"


In [5]:
#Import the data
def import_data(analysis, folder_path=None):
    
    if not folder_path:
        folder_path = os.path.abspath(".")
        data_dir = 'data'
        folder_path = os.path.join(folder_path, data_dir)
        
    if analysis == 'listings':
        filename = 'listings.csv'
    elif analysis == 'reviews':
        filename = 'reviews.csv'
    elif analysis == 'calendar':
        filename = 'calendar.csv'
    
    filepath = os.path.join(folder_path, filename)
    df = pd.read_csv(filepath)
    check_df(df)
    
    return df

In [6]:
## Data cleaning Listings

@np.vectorize
def remove_dollar(label: str):
    return float(label.replace('$','').replace(',',''))

if analysis == 'listings':
    
    #Import dei dati
    df = import_data(analysis)
    
    # Selezioniamo solo alcune delle colonne
    listings = df[[
        'id','name','longitude','latitude',
        'listing_url',
        'instant_bookable',
        'host_response_time',
        'review_scores_rating',
        'property_type',
        'room_type','accommodates',
        'bathrooms','bedrooms','beds','reviews_per_month','amenities',
        'number_of_reviews',
        'price'
    ]]
    #listings['price'] = remove_dollar(listings['price'])
    listings = listings.assign(price = remove_dollar(listings.price))
    listings[['price']]
    print("Listings dataset readed and parsed")
    df_clean = listings.copy()

Dataframe Shape: (27647, 74) with rows: 27647 and columns: 74

DF Columns: 
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'description', 'neighborhood_overview', 'picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated', 'has_availability', '

In [7]:
## Data cleaning Reviews

if analysis == 'reviews':
    
    #Import dei dati
    df = import_data(analysis)
    
    #Date to datetime
    reviews = df.assign(date = pd.to_datetime(df['date']))
    reviews['year'] = reviews['date'].dt.year
    reviews['month'] = reviews['date'].dt.month
    reviews = reviews.sort_values(['year', 'month'], ascending=False)
    print("Reviews dataset readed and parsed")
    df_clean = reviews.copy()
    
    

In [8]:
## Data cleaning Calendar

if analysis == 'calendar':
    
    # Import dei dati
    df = import_data(analysis)
    
    calendar = df.assign(date = pd.to_datetime(df['date']))
    calendar = calendar.assign(
        price          = pd.to_numeric(calendar.price.str.replace('$','').str.replace(',','')),
        # adjusted_price = pd.to_numeric(calendar.adjusted_price.str.replace('$','').str.replace(',','')),
    )
    calendar['year'] = pd.DatetimeIndex(calendar['date']).year
    calendar['month'] = pd.DatetimeIndex(calendar['date']).month
    calendar = calendar.sort_values(['year', 'month'], ascending=False)
    calendar['available'] = calendar.available.map({
        't': True,
        'f': False
    })
    print("Calendar dataset readed and parsed")
    df_clean = calendar.copy()

# 2. Generate analysis and plots

In [9]:
# Simple Analysis Generation
if analysis == 'listings':
    room_type_count = (
                df_clean.groupby("room_type", dropna=False)
                .id.count()
                .reset_index()
                .rename(columns={"id": "listing_count"})
            )
    night_price = df_clean.agg({"price": [np.mean]})
    night_price_room = df_clean.groupby("room_type").agg(
                {"price": [np.mean]}
            )
elif analysis == 'reviews':
    pass
elif analysis == 'calendar':
    pass

In [10]:
# Simply Plot Generation
if analysis == 'listings':
    fig1 = (
            pn.ggplot(df_clean)
            + pn.aes(x='room_type', fill='room_type')
            + pn.geom_bar()
            + pn.theme(axis_text_x=pn.element_text(angle=45, hjust=1))
        )
    fig1_path = os.path.join(os.path.abspath('.'),'plot1.png')
    fig1.save(filename=fig1_path)
    
    fig2 = (
            pn.ggplot(df_clean)
            + pn.aes(x="price")
            + pn.geom_histogram(fill="blue", colour="black", bins=30)
            + pn.xlim(0, 200)
        )
    fig2_path = os.path.join(os.path.abspath('.'),'plot2.png')
    fig2.save(filename=fig2_path)
    
elif analysis == 'reviews':
    pass
elif analysis == 'calendar':
    pass





# 3. Creating the final PDF Report

In [11]:
# Defining start and send date for the analysis
today = str(dt.date.today()).replace('-', '/')


In [12]:
# HTML template to add our data and plots
report_template = f'''
<!DOCTYPE html>
    <html>
      <head>
        <meta charset='utf-8'>
        <title>PythonBiellaGroup Report Example</title>
        <link rel='stylesheet' href='report.css'>
          <style>
          h1 {{
          font-family: Arial;
          font-size: 300%;
          }}
          h2 {{
          font-family: Arial;
          font-size: 200%;
          }}
          @page {{
          size: 7in 9.25in;
          margin: 27mm 16mm 27mm 16mm;
          }}
          </style>                       
      </head>
      <h1 align="center">Analysis for: {analysis}</h1>
      <h2 align="center">Report date: {today}</h2>      
        
      <figure>
        <img src="{fig1_path}" width="1200" height="600">
      </figure>
      <figure>
        <img src="{fig2_path}" width="1200" height="600">
      </figure>      
    </html>
'''

In [13]:
# Save HTML string to file
html_report = os.path.join(os.path.abspath("."),f"{analysis.split(',')[0].replace(' ','_')}_report.html")
with open(html_report, "w") as r:
    r.write(report_template)

Be carefull! To use pdfkit with html report export to pdf you need to install on your machine: `wkhtmltopdf`
- https://stackoverflow.com/questions/27673870/cant-create-pdf-using-python-pdfkit-error-no-wkhtmltopdf-executable-found

<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [14]:
# Use pdfkit to create the pdf report from the 
pdfkit.from_file(html_report, os.path.join(os.path.abspath("."),f"{analysis.split(',')[0].replace(' ', '_')}_report.pdf"))

OSError: wkhtmltopdf reported an error:
Loading pages (1/6)
[>                                                           ] 0%[======>                                                     ] 10%Warning: Blocked access to file                                   
Warning: Blocked access to file /Users/jeydi/Dropbox/Progetti/PERSONALI/docker-starter-kit/python/streamlit-example/plot1.png
Warning: Blocked access to file /Users/jeydi/Dropbox/Progetti/PERSONALI/docker-starter-kit/python/streamlit-example/plot2.png
[======>                                                     ] 11%Error: Failed to load about:blank, with network status code 301 and http status code 0 - Protocol "about" is unknown
Error: Failed to load about:blank, with network status code 301 and http status code 0 - Protocol "about" is unknown
Error: Failed to load about:blank, with network status code 301 and http status code 0 - Protocol "about" is unknown
[============================================================] 100%Counting pages (2/6)                                               
[============================================================] Object 1 of 1Resolving links (4/6)                                                       
[============================================================] Object 1 of 1Loading headers and footers (5/6)                                           
Printing pages (6/6)
[>                                                           ] Preparing[============================================================] Page 1 of 1Done                                                                      
Exit with code 1 due to network error: ProtocolUnknownError
