In [2]:
import dask.dataframe as dd
from dask.distributed import Client
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import pyarrow.compute as pc
import gc


In [3]:
from concurrent.futures import ThreadPoolExecutor
import shutil



In [4]:
# Simulation period and number of samples. Their product sets the per-period
# rate 1 / (speriod * samples) used by the EP and AAL calculations below.
speriod = int(input("Enter the simulation period: "))
samples = int(input("Enter the number of samples: "))

# Fail fast: a zero or negative value would otherwise only surface after the heavy
# aggregation, as a ZeroDivisionError or as meaningless return periods.
if speriod <= 0 or samples <= 0:
    raise ValueError(
        f"speriod and samples must be positive integers, got speriod={speriod}, samples={samples}"
    )

In [5]:
# Define the folder containing the input Parquet files
folder_path = r'D:\RISHIN\13_ILC_TASK1\input\PARQUET_FILES'

# List all Parquet files in the folder. os.listdir returns entries in arbitrary
# order, so sort them: the first file is used to detect the country and the file
# index is embedded in intermediate file names, both of which should be
# reproducible from one run to the next.
parquet_files = sorted(
    os.path.join(folder_path, entry)
    for entry in os.listdir(folder_path)
    if entry.endswith('.parquet')
)

In [6]:
# Root directory under which the ILC2024_..._Losses output tree is created.
output_folder_path = input("Enter the output folder path: ")


In [179]:

# Detect the country code from the first input file and create the output tree:
#   <output>/ILC2024_EUWS_PLA_WI_EP_<country>_EUR_Losses/{EP,PLT,STATS}/{Lob,Portfolio}/{GR,GU}
if parquet_files:
    # Only the very first LocationName value is needed, so read a single-row batch
    # of that one column instead of materialising 1000 rows of every column.
    first_file = pq.ParquetFile(parquet_files[0])
    first_batch = next(first_file.iter_batches(batch_size=1, columns=['LocationName']), None)

    if first_batch is None or first_batch.num_rows == 0:
        # Previously this case produced no folders and no message at all.
        print(f"No rows found in {parquet_files[0]}; cannot determine the country.")
    else:
        # The country code is the part of LocationName before the first underscore
        location_name = first_batch.column('LocationName')[0].as_py()
        country = location_name.split('_')[0]

        # Define the main folder path
        main_folder_path = os.path.join(output_folder_path, f'ILC2024_EUWS_PLA_WI_EP_{country}_EUR_Losses')

        # Define subfolders
        subfolders = ['EP', 'PLT', 'STATS']
        nested_folders = ['Lob', 'Portfolio']
        innermost_folders = ['GR', 'GU']

        # os.makedirs creates missing parents, so creating each leaf is enough
        for subfolder in subfolders:
            for nested_folder in nested_folders:
                for innermost_folder in innermost_folders:
                    os.makedirs(
                        os.path.join(main_folder_path, subfolder, nested_folder, innermost_folder),
                        exist_ok=True,
                    )

        print(f"Folders created successfully at {main_folder_path}")
else:
    print("No Parquet files found in the specified folder.")

Folders created successfully at D:\RISHIN\Rough\ILC2024_EUWS_PLA_WI_EP_BE_EUR_Losses


In [None]:
# For EP LOB GU 

In [15]:


# Per-file partial sums are collected here and combined after the loop
final_grouped_tables = []
group_keys = ['EventId', 'PeriodId', 'LobName']

# Process each Parquet file individually
for file in parquet_files:
    # Only the grouping keys and Loss are used, so skip every other column on read
    table = pq.read_table(file, columns=group_keys + ['Loss'])

    # Sum the Loss column per (EventId, PeriodId, LobName)
    grouped_table = table.group_by(group_keys).aggregate([('Loss', 'sum')])

    # Rename the aggregate by name rather than by position: where the aggregated
    # column sits relative to the key columns has differed between pyarrow
    # releases, so a positional rename can silently mislabel columns.
    grouped_table = grouped_table.rename_columns(
        ['Sum_Loss' if name == 'Loss_sum' else name for name in grouped_table.column_names]
    )

    final_grouped_tables.append(grouped_table)

# Concatenate all per-file results
final_table = pa.concat_tables(final_grouped_tables)

# Re-aggregate so that keys present in more than one file are merged, then rank
# the event losses from largest to smallest
final_grouped_table = final_table.group_by(group_keys).aggregate([('Sum_Loss', 'sum')])
sorted_final_table = final_grouped_table.sort_by([('Sum_Loss_sum', 'descending')])

# Arrow table with columns EventId, PeriodId, LobName, Sum_Loss_sum
dataframe_1 = sorted_final_table

In [16]:
# Convert the sorted Arrow table to pandas. Guarded so that re-running this cell
# after the conversion is a no-op instead of an AttributeError.
if isinstance(dataframe_1, pa.Table):
    dataframe_1 = dataframe_1.to_pandas()


In [28]:
# Split the event losses into one frame per line of business. A `daf_<LOB>`
# variable exists only for LOBs that actually have rows; the export step skips
# LOBs whose variable is missing.
for lob_code in ('AGR', 'AUTO', 'COM', 'IND', 'SPER', 'FRST', 'GLH'):
    lob_rows = dataframe_1[dataframe_1['LobName'] == lob_code]  # filter once, not twice
    if lob_rows.empty:
        # Drop a frame left over from an earlier run (e.g. another input set) so
        # that stale data is never exported for a LOB that is absent now.
        globals().pop(f'daf_{lob_code}', None)
    else:
        globals()[f'daf_{lob_code}'] = lob_rows

In [25]:

def _ep_curve(period_losses, loss_col, tce_prefix, rate):
    """Rank per-period losses and attach rate, return-period and TCE columns.

    Parameters
    ----------
    period_losses : pandas.DataFrame
        One row per period with the loss measure in ``loss_col``.
    loss_col : str
        Name of the loss column to rank on.
    tce_prefix : str
        Prefix for the three derived columns ``<prefix>_1``, ``<prefix>_2`` and
        ``<prefix>_Final``.
    rate : float
        Rate assigned to every row; its running total is ``cumrate``.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``loss_col`` descending with the added columns
        ``rate``, ``cumrate``, ``RPs`` and the three TCE columns. No rounding is applied.
    """
    curve = period_losses.sort_values(by=loss_col, ascending=False)

    curve['rate'] = rate
    curve['cumrate'] = curve['rate'].cumsum()
    curve['RPs'] = 1 / curve['cumrate']

    # Loss step to the next-ranked row times the mean of the two cumulative rates
    curve[f'{tce_prefix}_1'] = ((curve[loss_col] - curve[loss_col].shift(-1)) *
                                (curve['cumrate'] + curve['cumrate'].shift(-1)) * 0.5)

    # Running total of the steps above the current row, scaled by the row's return period
    curve[f'{tce_prefix}_2'] = curve[f'{tce_prefix}_1'].shift().cumsum() * curve['RPs']

    curve[f'{tce_prefix}_Final'] = curve[f'{tce_prefix}_2'] + curve[loss_col]
    return curve


def _ep_long_format(selected, value_columns):
    """Melt the given loss columns into EPType/Loss rows with the output column order."""
    melted = selected.melt(id_vars=['RPs', 'LobId', 'LobName'], value_vars=value_columns,
                           var_name='EPType', value_name='Loss')
    melted = melted.rename(columns={'RPs': 'ReturnPeriod'})
    return melted[['EPType', 'Loss', 'ReturnPeriod', 'LobId', 'LobName']]


def process_and_save_parquet(dataframe_1, parquet_file_path, speriod, samples):
    """Build the OEP / AEP / TCE table for one line of business and save it as Parquet.

    Parameters
    ----------
    dataframe_1 : pandas.DataFrame
        Event losses with at least the columns ``PeriodId``, ``LobName`` and
        ``Sum_Loss_sum``. The frame is not modified.
    parquet_file_path : str
        Destination of the Parquet file.
    speriod, samples : int
        Every ranked period gets the rate ``1 / (speriod * samples)``.

    The written table has the columns ``EPType``, ``Loss``, ``ReturnPeriod``,
    ``LobId`` and ``LobName``. It holds one row per EP type for every ranked period
    whose return period (rounded to 8 decimals) is one of the reported return
    periods, ordered by EP type (OEP, AEP, TCE-OEP, TCE-AEP) and then by return
    period descending.

    Raises
    ------
    ZeroDivisionError
        If ``speriod * samples`` is zero.
    """
    rate = 1 / (speriod * samples)
    per_period = dataframe_1.groupby(['PeriodId', 'LobName'], as_index=False)

    # dataframe_2: largest single event loss of each period (becomes OEP)
    dataframe_2 = _ep_curve(
        per_period.agg({'Sum_Loss_sum': 'max'}).rename(columns={'Sum_Loss_sum': 'Max_Loss'}),
        'Max_Loss', 'TCE_OEP', rate,
    )

    # dataframe_3: total loss of each period (becomes AEP)
    dataframe_3 = _ep_curve(
        per_period.agg({'Sum_Loss_sum': 'sum'}).rename(columns={'Sum_Loss_sum': 'S_Sum_Loss'}),
        'S_Sum_Loss', 'TCE_AEP', rate,
    )

    # Return periods to report, compared as floats after rounding to 8 decimals
    rps_values = [float(x) for x in [10000, 5000, 1000, 500, 250, 200, 100, 50, 25, 10, 5, 2]]
    decimal_places = 8

    # Mapping of LobName to LobId (ids are stored as strings)
    lobname_to_lobid = {
        'AGR': "1",
        'AUTO': "2",
        'COM': "3",
        'IND': "4",
        'SPER': "5",
        'FRST': "6",
        'GLH': "7"
    }

    # Select all reported return periods with one mask instead of growing a frame
    # with pd.concat inside a loop. Rows are ranked by loss, so they already come
    # out in descending return-period order.
    fdataframe_2 = (
        dataframe_2[np.round(dataframe_2['RPs'], decimal_places).isin(rps_values)]
        .rename(columns={'Max_Loss': 'OEP', 'TCE_OEP_Final': 'TCE-OEP'})
        .assign(LobId=lambda frame: frame['LobName'].map(lobname_to_lobid))
    )
    fdataframe_3 = (
        dataframe_3[np.round(dataframe_3['RPs'], decimal_places).isin(rps_values)]
        .rename(columns={'S_Sum_Loss': 'AEP', 'TCE_AEP_Final': 'TCE-AEP'})
        .assign(LobId=lambda frame: frame['LobName'].map(lobname_to_lobid))
    )

    final_df_2 = _ep_long_format(fdataframe_2, ['OEP', 'TCE-OEP'])
    final_df_3 = _ep_long_format(fdataframe_3, ['AEP', 'TCE-AEP'])

    # Concatenate the two DataFrames
    final_df_EP_LOB_GU = pd.concat([final_df_2, final_df_3], ignore_index=True)

    # An ordered categorical makes the sort follow this order instead of the alphabet
    new_ep_type_order = ["OEP", "AEP", "TCE-OEP", "TCE-AEP"]
    final_df_EP_LOB_GU['EPType'] = pd.Categorical(final_df_EP_LOB_GU['EPType'],
                                                  categories=new_ep_type_order, ordered=True)

    # Sort by EPType, then by ReturnPeriod descending within each EPType
    final_df_EP_LOB_GU = (
        final_df_EP_LOB_GU
        .sort_values(by=['EPType', 'ReturnPeriod'], ascending=[True, False])
        .reset_index(drop=True)
    )

    # Save the result as a Parquet file
    final_df_EP_LOB_GU.to_parquet(parquet_file_path, index=False)

    print(f"Parquet file saved successfully at {parquet_file_path}")



In [26]:
# All seven EP/Lob/GU outputs live in the same directory and differ only in the
# numeric suffix (_0 .. _6) at the end of the file name.
ep_lob_gu_dir = os.path.join(main_folder_path, 'EP', 'Lob', 'GU')

(pq_file_path_1, pq_file_path_2, pq_file_path_3, pq_file_path_4,
 pq_file_path_5, pq_file_path_6, pq_file_path_7) = (
    os.path.join(ep_lob_gu_dir, f'ILC2024_EUWS_PLA_WI_EP_{country}_EUR_EP_Lob_GU_{suffix}.parquet')
    for suffix in range(7)
)




In [29]:
# Export the EP table of every line of business that has data. Each entry pairs
# the name of the per-LOB frame with its output file.
lob_exports = [
    ('daf_AGR', pq_file_path_1),
    ('daf_AUTO', pq_file_path_2),
    ('daf_COM', pq_file_path_3),
    ('daf_IND', pq_file_path_4),
    ('daf_SPER', pq_file_path_5),
    ('daf_FRST', pq_file_path_6),
    ('daf_GLH', pq_file_path_7),
]

for frame_name, target_path in lob_exports:
    # Look the frame up explicitly instead of wrapping the call in
    # `except NameError: pass`, which would also hide a NameError raised for any
    # other reason (for example inside process_and_save_parquet itself).
    lob_frame = globals().get(frame_name)
    if lob_frame is None:
        print(f"Skipping {frame_name}: no data for this line of business")
        continue
    process_and_save_parquet(lob_frame, target_path, speriod, samples)

Parquet file saved successfully at D:\RISHIN\Rough\ILC2024_EUWS_PLA_WI_EP_BE_EUR_Losses\EP\Lob\GU\ILC2024_EUWS_PLA_WI_EP_BE_EUR_EP_Lob_GU_0.parquet
Parquet file saved successfully at D:\RISHIN\Rough\ILC2024_EUWS_PLA_WI_EP_BE_EUR_Losses\EP\Lob\GU\ILC2024_EUWS_PLA_WI_EP_BE_EUR_EP_Lob_GU_1.parquet
Parquet file saved successfully at D:\RISHIN\Rough\ILC2024_EUWS_PLA_WI_EP_BE_EUR_Losses\EP\Lob\GU\ILC2024_EUWS_PLA_WI_EP_BE_EUR_EP_Lob_GU_2.parquet
Parquet file saved successfully at D:\RISHIN\Rough\ILC2024_EUWS_PLA_WI_EP_BE_EUR_Losses\EP\Lob\GU\ILC2024_EUWS_PLA_WI_EP_BE_EUR_EP_Lob_GU_3.parquet
Parquet file saved successfully at D:\RISHIN\Rough\ILC2024_EUWS_PLA_WI_EP_BE_EUR_Losses\EP\Lob\GU\ILC2024_EUWS_PLA_WI_EP_BE_EUR_EP_Lob_GU_4.parquet


In [221]:
# Now for EP Portfolio GU

In [234]:


# EP table (OEP / AEP / TCE-OEP / TCE-AEP) at portfolio level: losses are summed
# per event and period without splitting by line of business.

# Per-file partial sums are collected here and combined after the loop
final_grouped_tables = []
event_keys = ['EventId', 'PeriodId']

# Process each Parquet file individually
for file in parquet_files:
    # Only the grouping keys and Loss are used, so skip every other column on read
    table = pq.read_table(file, columns=event_keys + ['Loss'])

    # Sum the Loss column per (EventId, PeriodId)
    grouped_table = table.group_by(event_keys).aggregate([('Loss', 'sum')])

    # Rename the aggregate by name rather than by position: where the aggregated
    # column sits relative to the key columns has differed between pyarrow
    # releases, so a positional rename can silently mislabel columns.
    grouped_table = grouped_table.rename_columns(
        ['Sum_Loss' if name == 'Loss_sum' else name for name in grouped_table.column_names]
    )

    final_grouped_tables.append(grouped_table)

# Concatenate all per-file results
final_table = pa.concat_tables(final_grouped_tables)

# Re-aggregate so that keys present in more than one file are merged, then rank
# the event losses from largest to smallest
final_grouped_table = final_table.group_by(event_keys).aggregate([('Sum_Loss', 'sum')])
sorted_final_table = final_grouped_table.sort_by([('Sum_Loss_sum', 'descending')])
dataframe_1 = sorted_final_table.to_pandas()

# dataframe_2: largest single event loss of each period (becomes OEP), ranked descending
dataframe_2 = (
    dataframe_1.groupby(['PeriodId'], as_index=False)
    .agg({'Sum_Loss_sum': 'max'})
    .rename(columns={'Sum_Loss_sum': 'Max_Loss'})
    .sort_values(by='Max_Loss', ascending=False)
)

# dataframe_3: total loss of each period (becomes AEP), ranked descending
dataframe_3 = (
    dataframe_1.groupby(['PeriodId'], as_index=False)
    .agg({'Sum_Loss_sum': 'sum'})
    .rename(columns={'Sum_Loss_sum': 'S_Sum_Loss'})
    .sort_values(by='S_Sum_Loss', ascending=False)
)

# Every ranked period carries the same rate; the running total gives the cumulative
# rate and its reciprocal the return period. No rounding is applied here.
dataframe_2['rate'] = (1 / (speriod * samples))
dataframe_2['cumrate'] = dataframe_2['rate'].cumsum()
dataframe_2['RPs'] = (1 / dataframe_2['cumrate'])

# Loss step to the next-ranked row times the mean of the two cumulative rates
dataframe_2['TCE_OEP_1'] = ((dataframe_2['Max_Loss'] - dataframe_2['Max_Loss'].shift(-1)) *
                            (dataframe_2['cumrate'] + dataframe_2['cumrate'].shift(-1)) * 0.5)

# Running total of the steps above the current row, scaled by the row's return period
dataframe_2['TCE_OEP_2'] = (dataframe_2['TCE_OEP_1'].shift().cumsum() * dataframe_2['RPs'])

dataframe_2['TCE_OEP_Final'] = (dataframe_2['TCE_OEP_2'] + dataframe_2['Max_Loss'])

# Same derivation for the per-period totals
dataframe_3['rate'] = (1 / (speriod * samples))
dataframe_3['cumrate'] = dataframe_3['rate'].cumsum()
dataframe_3['RPs'] = (1 / dataframe_3['cumrate'])

dataframe_3['TCE_AEP_1'] = ((dataframe_3['S_Sum_Loss'] - dataframe_3['S_Sum_Loss'].shift(-1)) *
                            (dataframe_3['cumrate'] + dataframe_3['cumrate'].shift(-1)) * 0.5)

dataframe_3['TCE_AEP_2'] = (dataframe_3['TCE_AEP_1'].shift().cumsum() * dataframe_3['RPs'])

dataframe_3['TCE_AEP_Final'] = (dataframe_3['TCE_AEP_2'] + dataframe_3['S_Sum_Loss'])

# Return periods to report, compared as floats after rounding to 8 decimals
rps_values = [float(x) for x in [10000, 5000, 1000, 500, 250, 200, 100, 50, 25, 10, 5, 2]]
decimal_places = 8

# Select all reported return periods with one mask instead of growing a frame with
# pd.concat inside a loop. Rows are ranked by loss, so they already come out in
# descending return-period order.
fdataframe_2 = (
    dataframe_2[np.round(dataframe_2['RPs'], decimal_places).isin(rps_values)]
    .rename(columns={'Max_Loss': 'OEP', 'TCE_OEP_Final': 'TCE-OEP'})
)
fdataframe_3 = (
    dataframe_3[np.round(dataframe_3['RPs'], decimal_places).isin(rps_values)]
    .rename(columns={'S_Sum_Loss': 'AEP', 'TCE_AEP_Final': 'TCE-AEP'})
)

# Reshape the AEP columns to long format: one row per (return period, EPType)
columns_to_keep_3 = ['RPs']
columns_to_melt_3 = ['AEP', 'TCE-AEP']
melted_df_3 = fdataframe_3.melt(id_vars=columns_to_keep_3, value_vars=columns_to_melt_3,
                                var_name='EPType', value_name='Loss')
melted_df_3 = melted_df_3.rename(columns={'RPs': 'ReturnPeriod'})
final_df_3 = melted_df_3[['EPType', 'Loss', 'ReturnPeriod']]

# Reshape the OEP columns the same way
columns_to_keep_2 = ['RPs']
columns_to_melt_2 = ['OEP', 'TCE-OEP']
melted_df_2 = fdataframe_2.melt(id_vars=columns_to_keep_2, value_vars=columns_to_melt_2,
                                var_name='EPType', value_name='Loss')
melted_df_2 = melted_df_2.rename(columns={'RPs': 'ReturnPeriod'})
final_df_2 = melted_df_2[['EPType', 'Loss', 'ReturnPeriod']]

# Concatenate the two DataFrames
final_df_EP_Portfolio_GU = pd.concat([final_df_2, final_df_3], ignore_index=True)

# An ordered categorical makes the sort follow this order instead of the alphabet
new_ep_type_order = ["OEP", "AEP", "TCE-OEP", "TCE-AEP"]
final_df_EP_Portfolio_GU['EPType'] = pd.Categorical(final_df_EP_Portfolio_GU['EPType'], categories=new_ep_type_order, ordered=True)

# Sort by EPType, then by ReturnPeriod descending within each EPType
final_df_EP_Portfolio_GU = final_df_EP_Portfolio_GU.sort_values(by=['EPType', 'ReturnPeriod'], ascending=[True, False]).reset_index(drop=True)

main_folder_path = os.path.join(output_folder_path, f'ILC2024_EUWS_PLA_WI_EP_{country}_EUR_Losses')

# Define the file path for the Parquet file
parquet_file_path = os.path.join(main_folder_path, 'EP', 'Portfolio', 'GU', f'ILC2024_EUWS_PLA_WI_EP_{country}_EUR_EP_Portfolio_GU_0.parquet')

# Save the result as a Parquet file
final_df_EP_Portfolio_GU.to_parquet(parquet_file_path, index=False)

print(f"Parquet file saved successfully at {parquet_file_path}")



Parquet file saved successfully at D:\RISHIN\Rough\ILC2024_EUWS_PLA_WI_EP_BE_EUR_Losses\EP\Portfolio\GU\ILC2024_EUWS_PLA_WI_EP_BE_EUR_EP_Portfolio_GU_0.parquet


In [None]:
#now for stats LOB GU 

In [154]:
# LobName -> LobId lookup (ids are stored as strings). It is defined in this cell
# because the only other definition in the notebook is a local variable inside
# process_and_save_parquet, which is not visible here and left this cell failing
# with a NameError on a clean run.
lobname_to_lobid = {
    'AGR': "1",
    'AUTO': "2",
    'COM': "3",
    'IND': "4",
    'SPER': "5",
    'FRST': "6",
    'GLH': "7"
}

# Per-file AAL contributions are collected here and combined after the loop
aggregated_tables = []

# Process each Parquet file individually
for file in parquet_files:
    # Only LobName and Loss are used, so skip every other column on read
    table = pq.read_table(file, columns=['LobName', 'Loss'])

    # Sum the Loss column per LobName
    grouped = table.group_by('LobName').aggregate([('Loss', 'sum')])

    # AAL contribution of this file: total loss divided by speriod and by samples
    loss_sum = grouped.column('Loss_sum').to_numpy()
    aal = loss_sum / speriod / samples
    grouped = grouped.append_column('AAL', pa.array(aal))

    # Keep only the columns needed downstream
    aggregated_tables.append(grouped.select(['LobName', 'AAL']))

# Concatenate the per-file tables
final_table = pa.concat_tables(aggregated_tables)

# Add up the per-file contributions of each LobName and order the rows by LobName
final_grouped = final_table.group_by('LobName').aggregate([('AAL', 'sum')])
final_grouped = final_grouped.sort_by('LobName')

# Convert the final grouped Table to a Pandas DataFrame
final_df = final_grouped.to_pandas()

final_df['LobId'] = final_df['LobName'].map(lobname_to_lobid)

final_df_STATS_Lob = final_df.rename(columns={'AAL_sum': 'AAL'})

# 'Std' and 'CV' are not computed; they are written as NaN placeholders
final_df_STATS_Lob['Std'] = np.nan
final_df_STATS_Lob['CV'] = np.nan

# Reorder the columns to match the output format
final_df_STATS_Lob = final_df_STATS_Lob[['AAL', 'Std', 'CV', 'LobId', 'LobName']]




In [158]:
# Rebuild the output root, then write the per-LOB statistics into STATS/Lob/GU
main_folder_path = os.path.join(output_folder_path, f'ILC2024_EUWS_PLA_WI_EP_{country}_EUR_Losses')

parquet_file_path = os.path.join(
    main_folder_path,
    'STATS',
    'Lob',
    'GU',
    f'ILC2024_EUWS_PLA_WI_EP_{country}_EUR_STATS_Lob_GU_0.parquet',
)
final_df_STATS_Lob.to_parquet(parquet_file_path, index=False)
print(f"Parquet file saved successfully at {parquet_file_path}")

Parquet file saved successfully at D:\RISHIN\Rough\ILC2024_EUWS_PLA_WI_EP_BE_EUR_Losses\STATS\Lob\GU\ILC2024_EUWS_PLA_WI_EP_BE_EUR_STATS_Lob_GU_0.parquet


In [None]:
#now for STATS Portfolio GU

In [9]:

# Per-file AAL contributions are collected here and added up after the loop
aggregated_tables = []

# Process each Parquet file individually
for file in parquet_files:
    # Only LobName and Loss are used, so skip every other column on read
    table = pq.read_table(file, columns=['LobName', 'Loss'])

    # Sum the Loss column per LobName
    grouped = table.group_by('LobName').aggregate([('Loss', 'sum')])

    # AAL contribution of this file: total loss divided by speriod and by samples
    loss_sum = grouped.column('Loss_sum').to_numpy()
    aal = loss_sum / speriod / samples
    grouped = grouped.append_column('AAL', pa.array(aal))

    # Keep only the columns needed downstream
    aggregated_tables.append(grouped.select(['LobName', 'AAL']))

# Concatenate the per-file tables
final_table = pa.concat_tables(aggregated_tables)

# Convert the final table to a Pandas DataFrame
final_df = final_table.to_pandas()

# Portfolio AAL: the sum over every LobName and every file
total_aal = final_df['AAL'].sum()

# Single-row result; 'Std' and 'CV' are not computed and stay NaN
final_df_STATS_Portfolio = pd.DataFrame({
    'AAL': [total_aal],
    'Std': [np.nan],
    'CV': [np.nan],
})


In [14]:
# Rebuild the output root, then write the portfolio statistics into STATS/Portfolio/GU
main_folder_path = os.path.join(output_folder_path, f'ILC2024_EUWS_PLA_WI_EP_{country}_EUR_Losses')

parquet_file_path = os.path.join(
    main_folder_path,
    'STATS',
    'Portfolio',
    'GU',
    f'ILC2024_EUWS_PLA_WI_EP_{country}_EUR_STATS_Portfolio_GU_0.parquet',
)
final_df_STATS_Portfolio.to_parquet(parquet_file_path, index=False)
print(f"Parquet file saved successfully at {parquet_file_path}")

Parquet file saved successfully at D:\RISHIN\Rough\ILC2024_EUWS_PLA_WI_EP_BE_EUR_Losses\STATS\Portfolio\GU\ILC2024_EUWS_PLA_WI_EP_BE_EUR_STATS_Portfolio_GU_0.parquet


In [None]:
#PLT GU Lob

In [8]:
# Directory to store intermediate results
intermediate_dir = os.path.join(main_folder_path, 'PLT', 'Lob', 'GU', f'ILC2024_EUWS_PLA_WI_EP_{country}_EUR_PLT_Lob_GU')
os.makedirs(intermediate_dir, exist_ok=True)

group_by_columns = ['PeriodId', 'EventId', 'EventDate', 'LossDate', 'Region', 'Peril', 'Weight', 'LobId', 'LobName']

# Remember exactly which files this run writes. Listing the directory afterwards
# would also pick up *.parquet files left behind by an earlier, interrupted run,
# and their losses would then be counted twice in the final aggregation.
intermediate_files = []

# Process each Parquet file in chunks and write intermediate results to disk
for i, file in enumerate(parquet_files):
    parquet_file = pq.ParquetFile(file)
    for j, batch in enumerate(parquet_file.iter_batches()):
        table = pa.Table.from_batches([batch])
        # Partial sum of Loss per key combination within this batch only
        grouped_table = table.group_by(group_by_columns).aggregate([('Loss', 'sum')])
        intermediate_file = os.path.join(intermediate_dir, f"intermediate_{i}_{j}.parquet")
        pq.write_table(grouped_table, intermediate_file)
        intermediate_files.append(intermediate_file)

# Read the intermediate results back and combine them; the partial sums still have
# to be aggregated once more across batches and files.
intermediate_tables = [pq.read_table(staged_path) for staged_path in intermediate_files]
combined_grouped_table = pa.concat_tables(intermediate_tables)





In [21]:
# Inspect the column names and types of the combined partial sums; the aggregated
# column is named 'Loss_sum' at this stage.
print(combined_grouped_table.schema)

PeriodId: int64 not null
EventId: int64 not null
EventDate: timestamp[ns] not null
LossDate: timestamp[ns] not null
Region: string
Peril: string
Weight: double not null
LobId: int64 not null
LobName: string
Loss_sum: double


In [22]:
# NOTE(review): this cell repeats the PLT Lob GU staging cell a few cells above;
# one of the two can be dropped.

# Directory to store intermediate results
intermediate_dir = os.path.join(main_folder_path, 'PLT', 'Lob', 'GU', f'ILC2024_EUWS_PLA_WI_EP_{country}_EUR_PLT_Lob_GU')
os.makedirs(intermediate_dir, exist_ok=True)

group_by_columns = ['PeriodId', 'EventId', 'EventDate', 'LossDate', 'Region', 'Peril', 'Weight', 'LobId', 'LobName']

# Remember exactly which files this run writes. Listing the directory afterwards
# would also pick up *.parquet files left behind by an earlier, interrupted run,
# and their losses would then be counted twice in the final aggregation.
intermediate_files = []

# Process each Parquet file in chunks and write intermediate results to disk
for i, file in enumerate(parquet_files):
    parquet_file = pq.ParquetFile(file)
    for j, batch in enumerate(parquet_file.iter_batches()):
        table = pa.Table.from_batches([batch])
        # Partial sum of Loss per key combination within this batch only
        grouped_table = table.group_by(group_by_columns).aggregate([('Loss', 'sum')])
        intermediate_file = os.path.join(intermediate_dir, f"intermediate_{i}_{j}.parquet")
        pq.write_table(grouped_table, intermediate_file)
        intermediate_files.append(intermediate_file)

# Read the intermediate results back and combine them; the partial sums still have
# to be aggregated once more across batches and files.
intermediate_tables = [pq.read_table(staged_path) for staged_path in intermediate_files]
combined_grouped_table = pa.concat_tables(intermediate_tables)





In [16]:
# Merge the per-batch partial sums into one row per key combination.
# NOTE(review): the next cell recomputes this exact aggregation, so this cell looks redundant.
final_grouped_table = combined_grouped_table.group_by(group_by_columns).aggregate([('Loss_sum', 'sum')])


In [18]:
# Merge the per-batch partial sums into one row per key combination and rank the
# rows by loss, largest first
final_grouped_table = combined_grouped_table.group_by(group_by_columns).aggregate([('Loss_sum', 'sum')])
final_grouped_table = final_grouped_table.sort_by([('Loss_sum_sum', 'descending')])

# Rename the aggregate by name rather than by position: where the aggregated
# column sits relative to the key columns has differed between pyarrow releases,
# so a positional rename can silently mislabel columns.
final_grouped_table = final_grouped_table.rename_columns(
    ['Loss' if name == 'Loss_sum_sum' else name for name in final_grouped_table.column_names]
)

# Convert the result to a Pandas DataFrame
df_grouped = final_grouped_table.to_pandas()

# Delete intermediate files
for file in intermediate_files:
    try:
        os.remove(file)
    except FileNotFoundError:
        # Already gone, e.g. because this cell was run before
        print(f"File not found: {file}")

# Remove the intermediate directory; os.rmdir only succeeds on an empty directory
try:
    os.rmdir(intermediate_dir)
except FileNotFoundError:
    print(f"Directory not found: {intermediate_dir}")
except OSError:
    print(f"Directory not empty or other error: {intermediate_dir}")

File not found: D:\RISHIN\Rough\ILC2024_EUWS_PLA_WI_EP_BE_EUR_Losses\PLT\Lob\GU\ILC2024_EUWS_PLA_WI_EP_BE_EUR_PLT_Lob_GU\intermediate_0_0.parquet
File not found: D:\RISHIN\Rough\ILC2024_EUWS_PLA_WI_EP_BE_EUR_Losses\PLT\Lob\GU\ILC2024_EUWS_PLA_WI_EP_BE_EUR_PLT_Lob_GU\intermediate_0_1.parquet
File not found: D:\RISHIN\Rough\ILC2024_EUWS_PLA_WI_EP_BE_EUR_Losses\PLT\Lob\GU\ILC2024_EUWS_PLA_WI_EP_BE_EUR_PLT_Lob_GU\intermediate_0_10.parquet
File not found: D:\RISHIN\Rough\ILC2024_EUWS_PLA_WI_EP_BE_EUR_Losses\PLT\Lob\GU\ILC2024_EUWS_PLA_WI_EP_BE_EUR_PLT_Lob_GU\intermediate_0_100.parquet
File not found: D:\RISHIN\Rough\ILC2024_EUWS_PLA_WI_EP_BE_EUR_Losses\PLT\Lob\GU\ILC2024_EUWS_PLA_WI_EP_BE_EUR_PLT_Lob_GU\intermediate_0_101.parquet
File not found: D:\RISHIN\Rough\ILC2024_EUWS_PLA_WI_EP_BE_EUR_Losses\PLT\Lob\GU\ILC2024_EUWS_PLA_WI_EP_BE_EUR_PLT_Lob_GU\intermediate_0_102.parquet
File not found: D:\RISHIN\Rough\ILC2024_EUWS_PLA_WI_EP_BE_EUR_Losses\PLT\Lob\GU\ILC2024_EUWS_PLA_WI_EP_BE_EUR_PLT_Lob

In [19]:
# Rebuild the output root and the target file inside PLT/Lob/GU
main_folder_path = os.path.join(output_folder_path, f'ILC2024_EUWS_PLA_WI_EP_{country}_EUR_Losses')
parquet_file_path = os.path.join(
    main_folder_path,
    'PLT',
    'Lob',
    'GU',
    f'ILC2024_EUWS_PLA_WI_EP_{country}_EUR_PLT_Lob_GU_0.parquet',
)

# Column layout of the exported file
ordered_columns = [
    'PeriodId', 'EventId', 'EventDate', 'LossDate', 'Loss',
    'Region', 'Peril', 'Weight', 'LobId', 'LobName',
]

df_grouped = df_grouped.loc[:, ordered_columns]
df_grouped.to_parquet(parquet_file_path, index=False)
print(f"Parquet file saved successfully at {parquet_file_path}")

Parquet file saved successfully at D:\RISHIN\Rough\ILC2024_EUWS_PLA_WI_EP_BE_EUR_Losses\PLT\Lob\GU\ILC2024_EUWS_PLA_WI_EP_BE_EUR_PLT_Lob_GU_0.parquet


In [12]:
# Display the exported PLT table (pandas truncates the rendering of large frames).
df_grouped

Unnamed: 0,PeriodId,EventId,EventDate,LossDate,Loss,Region,Peril,Weight,LobId,LobName
762942,207557,53895970,2020-05-30,2020-05-30,2.250859e+10,EU,WS,0.000004,5,SPER
2732636,149148,53891598,2020-05-13,2020-05-13,2.197889e+10,EU,WS,0.000004,5,SPER
3072717,4312,53893301,2020-11-10,2020-11-10,2.160561e+10,EU,WS,0.000004,5,SPER
5920325,154312,53893301,2020-11-10,2020-11-10,2.092983e+10,EU,WS,0.000004,5,SPER
5640539,199148,53891598,2020-05-13,2020-05-13,1.993543e+10,EU,WS,0.000004,5,SPER
...,...,...,...,...,...,...,...,...,...,...
2690504,165828,53876702,2020-10-21,2020-10-21,1.870107e-03,EU,WS,0.000004,2,AUTO
5869052,130081,53865125,2020-12-14,2020-12-14,1.669423e-03,EU,WS,0.000004,1,AGR
3477318,82560,53881107,2020-10-25,2020-10-25,8.474226e-04,EU,WS,0.000004,2,AUTO
84398,212031,53872002,2020-12-16,2020-12-16,6.278728e-04,EU,WS,0.000004,1,AGR


In [None]:
# Section: PLT - Portfolio - GU

In [1]:
# Country code interpolated into the output folder and file names built below.
# NOTE(review): hard-coded here, whereas an earlier cell derives `country` from the
# first LocationName value — confirm this manual override is intended.
country="BE"

In [7]:
# Build the portfolio-level GU PLT: sum Loss over all input Parquet files, grouped by
# the period/event keys below, and save the result as a single Parquet file.
#
# Reads : output_folder_path, country, parquet_files (defined in earlier cells)
# Writes: <main folder>/PLT/Portfolio/GU/ILC2024_EUWS_PLA_WI_EP_<country>_EUR_PLT_Portfolio_GU_0.parquet
# Leaves: df_grouped (pandas DataFrame with the columns listed in ordered_columns)

# Flush memory at the beginning
main_folder_path = os.path.join(output_folder_path, f'ILC2024_EUWS_PLA_WI_EP_{country}_EUR_Losses')
gc.collect()

# Directory to store intermediate results.
# NOTE(review): the name ends in '_G.parquet' although it is a directory; kept unchanged
# so existing paths stay the same.
intermediate_dir = os.path.join(main_folder_path, 'PLT', 'Portfolio', 'GU', f'ILC2024_EUWS_PLA_WI_EP_{country}_EUR_PLT_Portfolio_G.parquet')
os.makedirs(intermediate_dir, exist_ok=True)

group_by_columns = ["PeriodId", "EventId", "EventDate", "LossDate", "Region", "Peril", "Weight"]

# Stage 1: aggregate each record batch on its own and spill the partial sums to disk.
# The paths are collected as they are written so that stage 2 combines exactly the
# files produced by this run; listing the directory instead would also pick up
# leftovers from an earlier interrupted run and count their losses a second time.
intermediate_files = []
for i, file in enumerate(parquet_files):
    parquet_file = pq.ParquetFile(file)
    for j, batch in enumerate(parquet_file.iter_batches()):
        table = pa.Table.from_batches([batch])
        grouped_table = table.group_by(group_by_columns).aggregate([('Loss', 'sum')])
        intermediate_file = os.path.join(intermediate_dir, f"intermediate_{i}_{j}.parquet")
        pq.write_table(grouped_table, intermediate_file)
        intermediate_files.append(intermediate_file)

# Fail with a clear message instead of an opaque error from pa.concat_tables([]).
if not intermediate_files:
    raise ValueError("No record batches were read from parquet_files; nothing to aggregate.")

# Stage 2: read the partial sums back and combine them
intermediate_tables = [pq.read_table(file) for file in intermediate_files]
combined_grouped_table = pa.concat_tables(intermediate_tables)

# Perform the final group by and aggregation (sum of the per-batch 'Loss_sum' values)
final_grouped_table = combined_grouped_table.group_by(group_by_columns).aggregate([('Loss_sum', 'sum')])

# Rename the aggregated column by name rather than by position: the position of the
# aggregate column relative to the key columns is not something to rely on.
final_grouped_table = final_grouped_table.rename_columns(
    ['Loss' if name == 'Loss_sum_sum' else name for name in final_grouped_table.column_names]
)

# Convert the result to a Pandas DataFrame
df_grouped = final_grouped_table.to_pandas()

# Delete intermediate files (best effort: a missing file is reported, not fatal)
for file in intermediate_files:
    try:
        os.remove(file)
    except FileNotFoundError:
        print(f"File not found: {file}")

# Remove the intermediate directory (best effort as well)
try:
    os.rmdir(intermediate_dir)
except FileNotFoundError:
    print(f"Directory not found: {intermediate_dir}")
except OSError:
    print(f"Directory not empty or other error: {intermediate_dir}")

# Define the file path for the Parquet file
parquet_file_path = os.path.join(main_folder_path, 'PLT', 'Portfolio', 'GU', f'ILC2024_EUWS_PLA_WI_EP_{country}_EUR_PLT_Portfolio_GU_0.parquet')

# Reorder the columns in the desired order
ordered_columns = ["PeriodId", "EventId", "EventDate", "LossDate","Loss", "Region", "Peril", "Weight"]

df_grouped = df_grouped[ordered_columns]
df_grouped.to_parquet(parquet_file_path, index=False)
print(f"Parquet file saved successfully at {parquet_file_path}")



Parquet file saved successfully at D:\RISHIN\Rough\ILC2024_EUWS_PLA_WI_EP_BE_EUR_Losses\PLT\Portfolio\GU\ILC2024_EUWS_PLA_WI_EP_BE_EUR_PLT_Portfolio_GU_0.parquet


In [30]:
import pandas as pd
import pyarrow.parquet as pq

# Ask for the Parquet file location and drop any double quotes wrapped around the input.
file_path = input("Enter the file path: ").strip('"')

# Load the whole file as a PyArrow Table, then turn it into a pandas DataFrame.
table = pq.read_table(file_path)
df = table.to_pandas()



In [31]:
# Show df as this cell's output.
df

Unnamed: 0,PeriodId,EventId,EventDate,LossDate,Loss,Region,Peril,Weight,LobId,LobName
0,5,53863678,2020-01-05,2020-01-05,288927.068181,EU,WS,0.000004,1,AGR
1,8,53876196,2020-01-27,2020-01-27,296034.741904,EU,WS,0.000004,1,AGR
2,5,53863678,2020-01-05,2020-01-05,401156.376110,EU,WS,0.000004,2,AUTO
3,8,53876196,2020-01-27,2020-01-27,80131.857004,EU,WS,0.000004,2,AUTO
4,3,53878990,2020-02-25,2020-02-25,2974.012492,EU,WS,0.000004,5,SPER
...,...,...,...,...,...,...,...,...,...,...
7381503,66921,53857722,2020-03-26,2020-03-26,42326.437487,EU,WS,0.000004,1,AGR
7381504,66921,53857722,2020-03-26,2020-03-26,37437.997067,EU,WS,0.000004,3,COM
7381505,66921,53857722,2020-03-26,2020-03-26,307822.045343,EU,WS,0.000004,4,IND
7381506,66909,53855784,2020-08-26,2020-08-26,641923.444573,EU,WS,0.000004,4,IND


In [28]:
import pandas as pd
import pyarrow.parquet as pq

# Load a second Parquet file into its own variables (file_path2 / table2 / df2) so it
# can be inspected next to the first one without overwriting file_path / table / df.

# Prompt the user to enter the path to the Parquet file
file_path2 = input("Enter the file path: ")

# Strip any double quotes wrapped around the entered path
file_path2 = file_path2.strip('"')

# Read the Parquet file into a PyArrow Table
table2 = pq.read_table(file_path2)

# Convert the PyArrow Table to a Pandas DataFrame
df2 = table2.to_pandas()



In [1]:
import pandas as pd
import pyarrow.parquet as pq

# Ask for the Parquet file location and drop any double quotes wrapped around the input.
file_path = input("Enter the file path: ").strip('"')

# Open the file lazily so that only the requested batch is read.
parquet_file = pq.ParquetFile(file_path)

# Number of rows per batch
batch_size = 10000  # Adjust the batch size as needed

# Only the first batch is needed to look at the header, so pull a single batch
# from the iterator instead of looping over the whole file.
batch = next(iter(parquet_file.iter_batches(batch_size)), None)
if batch is not None:
    # Convert the batch to a Pandas DataFrame and print its first rows
    df_batch = batch.to_pandas()
    print(df_batch.head())

   PeriodId   EventId  EventDate   LossDate       Loss Region Peril    Weight  \
0      6257  53858484 2020-01-14 2020-01-14   0.979822     EU    WS  0.000004   
1      6257  53858484 2020-01-14 2020-01-14   0.000871     EU    WS  0.000004   
2      6257  53858484 2020-01-14 2020-01-14  11.855758     EU    WS  0.000004   
3      6257  53858484 2020-01-14 2020-01-14   8.406023     EU    WS  0.000004   
4      6257  53858484 2020-01-14 2020-01-14   0.043087     EU    WS  0.000004   

    Loss_sum  
0   0.979822  
1   0.000871  
2  11.855758  
3   8.406023  
4   0.043087  


In [66]:
def _count_decimal_places(value):
    """Return the number of digits after the decimal point of a float.

    The value is rendered in positional (non-exponent) notation first, so numbers
    that str() would print as e.g. '1.5e-05' are counted correctly (6, not 5).
    NaN and infinities have no decimal point and count as 0.
    """
    text = np.format_float_positional(value, trim='0')
    return len(text.partition('.')[2])


# Display the schema (data types) of the DataFrame
print("Schema (data types):")
print(df.dtypes)

# Display the columns of the DataFrame
print("\nColumns:")
print(df.columns)

# Display the largest number of decimal places found in each float64 column
print("\nDecimal places for floating-point numbers:")
for col in df.select_dtypes(include=['float64']).columns:
    decimal_places = df[col].apply(_count_decimal_places).max()
    print(f"{col}: {decimal_places} decimal places")

# Display the first few rows of the DataFrame
print("\nFirst few rows of the DataFrame:")
print(df.head())

Schema (data types):
EPType           object
Loss            float64
ReturnPeriod    float64
LobId            object
LobName          object
dtype: object

Columns:
Index(['EPType', 'Loss', 'ReturnPeriod', 'LobId', 'LobName'], dtype='object')

Decimal places for floating-point numbers:
Loss: 10 decimal places
ReturnPeriod: 1 decimal places

First few rows of the DataFrame:
  EPType          Loss  ReturnPeriod LobId LobName
0    OEP  8.639885e+08       10000.0     2    AUTO
1    OEP  6.749918e+08        5000.0     2    AUTO
2    OEP  3.778001e+08        1000.0     2    AUTO
3    OEP  2.847455e+08         500.0     2    AUTO
4    OEP  2.064693e+08         250.0     2    AUTO
