## **PVSyst Report Scraper:**

Search for specific measures in PVSyst Report PDF Files.
<br><ul>This version looks for the DC Power (kWp), Specific Production (kWh/kWp), Produced Energy* and Performance Ratio (%) in the reports.
<br><sub>*Since there are many possible configurations for each PVSyst report, a column with the calculated energy was added to the final dataframe.</sub></ul>

### Import

In [None]:
import re
from pathlib import Path
from PyPDF2 import PdfReader
import pandas as pd
import os
from tkinter.filedialog import askdirectory

### Set Report Directory
Define the directory where the PDF files are located.

In [None]:
# Option 1 - choose the report folder through a folder-selection dialog.
# The chosen path is wrapped in a Path and stored in `directory`, which the
# following cells use to glob for PDF files.
your_path = askdirectory(title='Open Directory')
directory = Path(your_path)
# Echo the selected folder as the cell output
directory

In [None]:
# Option 2 - set the report folder by pasting its path below.
# This cell also assigns `directory`, so whichever of the two directory
# cells is executed last decides the folder used by the following cells.
your_path = r'\\location' # Replace `\\location` with the actual folder path
directory = Path(your_path)
# Echo the configured folder as the cell output
directory

### Check for files inside directory

In [None]:
# Collect the name (without the folder part) of every PDF located directly
# inside `directory`. `Path.name` is used instead of splitting the path on
# '\\' so the result is correct regardless of the path separator in use.
files_list = [pdf_path.name for pdf_path in directory.glob('*.pdf')]

print('Count of files:', len(files_list))
# Echo the list of file names as the cell output
files_list

### Scraper

In [None]:
# Regular expressions used to locate each measure in the extracted text.
# All of them are raw strings so the regex escapes (\d, \s, \.) are never
# interpreted as (invalid) Python string escapes.
pattern_kwp = r'(\d+)\s+kWp' # DC power
pattern_sp = r'Specific production (\d+) kWh/kWp/year' # Specific production (EN)
pattern_sp1 = r'Specific prod. (\d+) kWh/kWp/year' # Specific production abbreviated (EN)
pattern_sp_pt = r'Produção específica (\d+) kWh/kWp/ano' # Specific production (PT)
pattern_ener = r'Produced Energy (\d+)' # Produced Energy (EN)
pattern_ener_gw_pt = r'Energia produzida (\d+)' # Produced Energy (PT)
pattern_pr = r'\d+\.\d+\s*%' # Performance Ratio


def _first_match(patterns, text, group=1):
    '''
    Try each regex in `patterns`, in order, against `text` and return the
    requested `group` of the first one that matches.
    Returns the placeholder string 'error' when none of them matches.
    '''
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(group)
    return 'error'


# One list per output column; they are only ever appended to together, so
# they always have the same length.
files = []
dc_pwr = []
spec_prod = []
ener = []
pr = []
# Names of the files that could not be read or parsed
errors = []

# Loop through all the PDF files in the directory
for full_file_path in directory.glob('*.pdf'):
    # Resolve the name BEFORE the try block so the except handler always
    # records the file that actually failed (never a stale or undefined name).
    file_name = full_file_path.name
    try:
        with open(full_file_path, 'rb') as file:
            pdf_reader = PdfReader(file)
            # DC power is searched on page 0, every other measure on
            # pages 1 and 2. A report with fewer than three pages raises
            # here and is recorded in `errors`.
            text0 = pdf_reader.pages[0].extract_text()
            text = pdf_reader.pages[1].extract_text() + pdf_reader.pages[2].extract_text()

        # Patterns are tried in priority order: English, abbreviated
        # English, then Portuguese.
        row = (
            _first_match([pattern_kwp], text0),
            _first_match([pattern_sp, pattern_sp1, pattern_sp_pt], text),
            _first_match([pattern_ener, pattern_ener_gw_pt], text),
            # The performance ratio keeps the whole match, '%' included
            _first_match([pattern_pr], text, group=0),
        )
    except Exception:
        # Best-effort scraping: a file that cannot be processed must not
        # stop the run; it is listed in `errors` for manual inspection.
        errors.append(file_name)
        continue

    # Only reached when the whole file was processed, so a failure can never
    # leave the column lists with different lengths.
    files.append(file_name)
    dc_pwr.append(row[0])
    spec_prod.append(row[1])
    ener.append(row[2])
    pr.append(row[3])

result_df = pd.DataFrame(
    data=list(zip(files, dc_pwr, spec_prod, ener, pr)),
    columns=['File', 'DC_Power', 'Specific_Production', 'Injected_Energy', 'PR'],
)
# kWp * kWh/kWp/year / 1000 -> MWh/year.
# errors='coerce' turns the 'error' placeholders into NaN, so one report with
# a missing measure no longer aborts the calculation for every other report.
result_df['Calculated_Energy'] = (
    pd.to_numeric(result_df['DC_Power'], errors='coerce')
    * pd.to_numeric(result_df['Specific_Production'], errors='coerce')
    / 1000
)

In [None]:
# Display the table of scraped measures built by the scraper cell
result_df

In [None]:
# Display the file names recorded by the scraper's exception handler
errors

In [None]:
def open_errors(error_list, dir, limit=10) -> None:
    '''
    Open externally (through os.startfile) the files that the scraper
    could not process.

    error_list: names of the files to open, relative to `dir`.
    dir: folder that contains those files.
    limit: maximum number of files to open (10 by default).
    '''
    for position, failed_name in enumerate(error_list):
        if position >= limit:
            # Cap reached: no entry from here on would be opened
            break
        target = Path(f"{dir}\\{failed_name}")
        os.startfile(target)

# Open the files listed in `errors` (up to the default limit of 10)
open_errors(errors, directory)
