In [1]:
import dask.dataframe as dd
from dask.distributed import Client
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import pyarrow.compute as pc
import gc
from decimal import Decimal  # Add this import statement



In [2]:
# Ask for the two run parameters. Each answer must parse as an integer;
# int() raises ValueError otherwise.
period_text = input("Enter the simulation period: ")
speriod = int(period_text)

sample_text = input("Enter the number of samples: ")
samples = int(sample_text)

In [3]:
# Folder that holds the input Parquet files.
# NOTE(review): machine-specific absolute path; adjust it when running elsewhere.
folder_path = r'D:\RISHIN\13_ILC_TASK1\input\PARQUET_FILES'

# Full paths of every '.parquet' entry in the folder.
# os.listdir() returns names in arbitrary order, so the list is sorted: the first
# entry is used further down to derive the country for the output folder name,
# and that choice should be the same on every run.
parquet_files = sorted(
    os.path.join(folder_path, f)
    for f in os.listdir(folder_path)
    if f.endswith('.parquet')
)

In [4]:
# Root folder under which the result tree is created.
# Paths pasted from a file manager often arrive wrapped in quotes and/or padded
# with whitespace; strip both so the value is usable with os.path.join as-is.
output_folder_path = input("Enter the output folder path: ").strip().strip('"\'').strip()


In [6]:

# Create the output folder tree. The top-level folder is named after the country,
# taken from the first LocationName value of the first Parquet file.
if parquet_files:
    # Only a single value is needed, so read just the LocationName column one row
    # at a time instead of converting a 1000-row batch of every column to pandas.
    parquet_file = pq.ParquetFile(parquet_files[0])
    first_batch = next(
        (
            batch
            for batch in parquet_file.iter_batches(batch_size=1, columns=['LocationName'])
            if batch.num_rows
        ),
        None,
    )
    if first_batch is None:
        # Without any row there is no LocationName to name the folder after, so
        # stop with a clear error rather than silently creating nothing.
        raise ValueError(
            f"{parquet_files[0]} contains no rows; cannot determine the country."
        )

    # The country is the part of LocationName before the first '_'.
    location_name = first_batch.to_pydict()['LocationName'][0]
    country = location_name.split('_')[0]

    # Define the main folder path
    main_folder_path = os.path.join(output_folder_path, f'ILC2024_EUWS_PLA_WI_EP_{country}_EUR_Losses')

    # Three nesting levels: <subfolder>/<nested_folder>/<innermost_folder>
    subfolders = ['EP', 'PLT', 'STATS']
    nested_folders = ['Lob', 'Portfolio']
    innermost_folders = ['GR', 'GU']

    # os.makedirs creates missing parents, so creating every leaf directory
    # builds the main folder and all intermediate levels as well.
    for subfolder in subfolders:
        for nested_folder in nested_folders:
            for innermost_folder in innermost_folders:
                os.makedirs(
                    os.path.join(main_folder_path, subfolder, nested_folder, innermost_folder),
                    exist_ok=True,
                )

    print(f"Folders created successfully at {main_folder_path}")
else:
    print("No Parquet files found in the specified folder.")

Folders created successfully at D:\RISHIN\ILC_TEST\ILC2024_EUWS_PLA_WI_EP_BE_EUR_Losses


In [7]:
# For EP LOB GU 

In [13]:
# Aggregate every Parquet file on its own (sum of Loss per EventId / PeriodId /
# LobName) and stack the per-file results into a single Table.
# The same key may still appear once per file after concatenation, so the result
# holds partial sums that need a further group-by before use.
final_grouped_tables = []

for file in parquet_files:
    # Load only the columns the aggregation needs; the remaining columns of the
    # file would just cost memory and read time.
    table = pq.read_table(file, columns=['EventId', 'PeriodId', 'LobName', 'Loss'])

    # Sum the Loss column grouped by EventId, PeriodId, and LobName.
    # pyarrow names the aggregate column '<column>_<function>', i.e. 'Loss_sum'.
    grouped_table = table.group_by(['EventId', 'PeriodId', 'LobName']).aggregate([('Loss', 'sum')])

    # Rename the aggregate to Sum_Loss by *name*, then fix the column order
    # explicitly. A positional rename would mislabel the data whenever pyarrow
    # emits the aggregate column before the key columns.
    # NOTE(review): the key/aggregate output order is believed to have differed
    # between pyarrow releases - confirm against the installed version.
    grouped_table = grouped_table.rename_columns(
        ['Sum_Loss' if name == 'Loss_sum' else name for name in grouped_table.column_names]
    ).select(['EventId', 'PeriodId', 'LobName', 'Sum_Loss'])

    final_grouped_tables.append(grouped_table)

# Concatenate all grouped tables
final_table = pa.concat_tables(final_grouped_tables)


KeyboardInterrupt: 

In [9]:
# Bring the concatenated per-file aggregates into pandas for inspection.
final_df = final_table.to_pandas()

# Flag every row whose (EventId, PeriodId, LobName) key occurs more than once;
# keep=False marks all members of a duplicated group, not only the repeats.
key_columns = ['EventId', 'PeriodId', 'LobName']
duplicate_records = final_df.duplicated(subset=key_columns, keep=False)

# Summing the boolean flags counts the flagged rows.
num_duplicates = duplicate_records.sum()
print(f"Number of duplicate records: {num_duplicates}")

Number of duplicate records: 760


In [11]:

# Rows whose (EventId, PeriodId, LobName) key is shared with at least one other row.
dup_key = ['EventId', 'PeriodId', 'LobName']
duplicate_mask = final_df.duplicated(subset=dup_key, keep=False)
duplicate_rows_df = final_df.loc[duplicate_mask]

# Distinct-value count per column of the full frame, then the duplicated rows.
print(final_df.nunique())

print(duplicate_rows_df)

EventId       27633
PeriodId     248280
LobName           5
Sum_Loss    7333970
dtype: int64
          EventId  PeriodId LobName      Sum_Loss
146301   53856487      9495    AUTO  3.171431e+07
146361   53884900      3306    AUTO  7.319426e+07
146363   53871525      3321    AUTO  3.847942e+07
146382   53884900      3306     COM  1.409573e+08
146392   53871525      3321     COM  7.672723e+07
...           ...       ...     ...           ...
7296249  53863133    153629     COM  8.515380e+07
7296254  53879975    153637     AGR  2.619929e+05
7296269  53877262    153626     IND  3.586264e+04
7296270  53877262    153626    SPER  4.707738e+04
7296308  53863133    153629    AUTO  2.649277e+07

[760 rows x 4 columns]


In [12]:
# Show the duplicated-key rows; as the last expression of the cell the frame is
# rendered by the notebook instead of being printed as plain text.
duplicate_rows_df

Unnamed: 0,EventId,PeriodId,LobName,Sum_Loss
146301,53856487,9495,AUTO,3.171431e+07
146361,53884900,3306,AUTO,7.319426e+07
146363,53871525,3321,AUTO,3.847942e+07
146382,53884900,3306,COM,1.409573e+08
146392,53871525,3321,COM,7.672723e+07
...,...,...,...,...
7296249,53863133,153629,COM,8.515380e+07
7296254,53879975,153637,AGR,2.619929e+05
7296269,53877262,153626,IND,3.586264e+04
7296270,53877262,153626,SPER,4.707738e+04


In [10]:
# Number of distinct values in each column of the concatenated per-file aggregates.
final_df.nunique()

EventId       27633
PeriodId     248280
LobName           5
Sum_Loss    7333970
dtype: int64

In [None]:



# Perform final grouping and sorting.
# The concatenated table can hold the same (EventId, PeriodId, LobName) key more
# than once, so the partial sums are summed again here.
final_grouped_table = final_table.group_by(['EventId', 'PeriodId', 'LobName']).aggregate([('Sum_Loss', 'sum')])

# pyarrow names the aggregate column '<column>_<function>', hence 'Sum_Loss_sum'.
sorted_final_table = final_grouped_table.sort_by([('Sum_Loss_sum', 'descending')])

# The Table is now ready for the next instructions
dataframe_1 = sorted_final_table.to_pandas()

# Split the sorted frame per LobName value. Each filter is evaluated once, and a
# LobName without rows gets no entry and no variable.
lob_frames = {}
for lob_name in ('AGR', 'AUTO', 'COM', 'IND', 'SPER', 'FRST', 'GLH'):
    lob_rows = dataframe_1[dataframe_1['LobName'] == lob_name]
    if not lob_rows.empty:
        lob_frames[lob_name] = lob_rows

# Also publish each non-empty frame under its established name (daf_AGR,
# daf_AUTO, ...) so code that refers to those variables keeps working.
globals().update({f'daf_{lob_name}': lob_rows for lob_name, lob_rows in lob_frames.items()})
