In [1]:
import dask.dataframe as dd
from dask.distributed import Client
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import pyarrow.compute as pc


In [2]:
speriod=int(input("Enter the simulation period: "))
samples=int(input("Enter the number of samples: "))

In [3]:
# Define the folder containing the Parquet files
folder_path = r'D:\RISHIN\13_ILC_TASK1\input\PARQUET_FILES'

# List all Parquet files in the folder
parquet_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.parquet')]

In [5]:

# Check if there are any Parquet files in the folder
if parquet_files:
    # Read the first Parquet file in chunks
    parquet_file = pq.ParquetFile(parquet_files[0])
    for batch in parquet_file.iter_batches(batch_size=1000):
        # Convert the first batch to a PyArrow Table
        table = pa.Table.from_batches([batch])
        
        # Convert the PyArrow Table to a Pandas DataFrame
        df = table.to_pandas()
        
        # Extract the first value of LocationName and split it by '_'
        location_name = df['LocationName'].iloc[0]
        country = location_name.split('_')[0]
        
        # Ask user for output folder
        output_folder_path = input("Enter the output folder path: ")
        
        # Define the main folder path
        main_folder_path = os.path.join(output_folder_path, f'ILC2024_EUWS_PLA_WI_EP_{country}_EUR_Losses')
        
        # Define subfolders
        subfolders = ['EP', 'PLT', 'STATS']
        nested_folders = ['Lob', 'Portfolio']
        innermost_folders = ['GR', 'GU']
        
        # Create the main folder and subfolders
        for subfolder in subfolders:
            subfolder_path = os.path.join(main_folder_path, subfolder)
            os.makedirs(subfolder_path, exist_ok=True)
            
            for nested_folder in nested_folders:
                nested_folder_path = os.path.join(subfolder_path, nested_folder)
                os.makedirs(nested_folder_path, exist_ok=True)
                
                for innermost_folder in innermost_folders:
                    innermost_folder_path = os.path.join(nested_folder_path, innermost_folder)
                    os.makedirs(innermost_folder_path, exist_ok=True)
        
        print(f"Folders created successfully at {main_folder_path}")
        break  # Process only the first batch
else:
    print("No Parquet files found in the specified folder.")

Folders created successfully at D:\RISHIN\Rough\ILC2024_EUWS_PLA_WI_EP_BE_EUR_Losses


In [None]:
#lob name 

In [4]:


# Initialize an empty list to store the results
final_grouped_tables = []

# Process each Parquet file individually
for file in parquet_files:
    # Read the Parquet file into a PyArrow Table
    table = pq.read_table(file)
    
    # Perform the aggregation: sum the Loss column grouped by EventId, PeriodId, and LobName
    grouped_table = table.group_by(['EventId', 'PeriodId', 'LobName']).aggregate([('Loss', 'sum')])
    
    # Rename the aggregated column to Sum_Loss
    grouped_table = grouped_table.rename_columns(['EventId', 'PeriodId', 'LobName', 'Sum_Loss'])
    
    # Append the grouped Table to the final_grouped_tables list
    final_grouped_tables.append(grouped_table)

# Concatenate all grouped tables
final_table = pa.concat_tables(final_grouped_tables)

# Perform final grouping and sorting
final_grouped_table = final_table.group_by(['EventId', 'PeriodId', 'LobName']).aggregate([('Sum_Loss', 'sum')])
sorted_final_table = final_grouped_table.sort_by([('Sum_Loss_sum', 'descending')])
# The Table is now ready for the next instructions
dataframe_1 = sorted_final_table

In [5]:
dataframe_1= dataframe_1.to_pandas()


In [11]:
dataframe_1 = dataframe_1[dataframe_1['LobName'] == 'AUTO']


In [12]:
# Initialize dataframe_2 by selecting PeriodId and max(Sum_Loss) grouped by PeriodId
dataframe_2 = dataframe_1.groupby(['PeriodId','LobName'], as_index=False).agg({'Sum_Loss_sum': 'max'})

# Rename the aggregated column to Max_Loss
dataframe_2.rename(columns={'Sum_Loss_sum': 'Max_Loss'}, inplace=True)

# Sort dataframe_2 by Max_Loss in descending order
dataframe_2 = dataframe_2.sort_values(by='Max_Loss', ascending=False)

# Initialize dataframe_2 by selecting PeriodId and Sum(Sum_Loss) grouped by PeriodId
dataframe_3 = dataframe_1.groupby(['PeriodId','LobName'], as_index=False).agg({'Sum_Loss_sum': 'sum'})

# Rename the aggregated column to Sum_Loss
dataframe_3.rename(columns={'Sum_Loss_sum': 'S_Sum_Loss'}, inplace=True)

# Sort dataframe_3 by S_sum_Loss in descending order
dataframe_3 = dataframe_3.sort_values(by='S_Sum_Loss', ascending=False)

#dataframe_2['Max_Loss'] = dataframe_2['Max_Loss'].round(5)

dataframe_2['rate'] = (1 / (speriod * samples))

# Calculate the cumulative rate column and round to 6 decimal places
dataframe_2['cumrate'] = dataframe_2['rate'].cumsum()

# Calculate the RPs column and round to 6 decimal places
dataframe_2['RPs'] = (1 / dataframe_2['cumrate'])

# Calculate the TCE_OEP_1 column and round to 6 decimal places
dataframe_2['TCE_OEP_1'] = ((dataframe_2['Max_Loss'] - dataframe_2['Max_Loss'].shift(-1)) * 
                          (dataframe_2['cumrate'] + dataframe_2['cumrate'].shift(-1)) * 0.5)

# Calculate the TCE_OEP_2 column and round to 6 decimal places
dataframe_2['TCE_OEP_2'] = (dataframe_2['TCE_OEP_1'].shift().cumsum() * dataframe_2['RPs']).round(6)


# Calculate the TCE_OEP_Final column and round to 6 decimal places
dataframe_2['TCE_OEP_Final'] = (dataframe_2['TCE_OEP_2'] + dataframe_2['Max_Loss'])

#dataframe_3['S_Sum_Loss'] = dataframe_3['S_Sum_Loss'].round(5)

# Calculate the rate column and round to 6 decimal places
dataframe_3['rate'] = (1 / (speriod * samples))

# Calculate the cumulative rate column and round to 6 decimal places
dataframe_3['cumrate'] = dataframe_3['rate'].cumsum()

# Calculate the RPs column and round to 6 decimal places
dataframe_3['RPs'] = (1 / dataframe_3['cumrate'])
# Calculate the TCE_AEP_1 column and round to 6 decimal places
dataframe_3['TCE_AEP_1'] = ((dataframe_3['S_Sum_Loss'] - dataframe_3['S_Sum_Loss'].shift(-1)) * 
                          (dataframe_3['cumrate'] + dataframe_3['cumrate'].shift(-1)) * 0.5)

# Calculate the cumulative sum up to the previous row and multiply by the current row's RPs, then round to 6 decimal places
dataframe_3['TCE_AEP_2'] = (dataframe_3['TCE_AEP_1'].shift().cumsum() * dataframe_3['RPs']).round(6)

# Calculate the TCE_AEP_Final column and round to 6 decimal places
dataframe_3['TCE_AEP_Final'] = (dataframe_3['TCE_AEP_2'] + dataframe_3['S_Sum_Loss'])





In [13]:
dataframe_2

Unnamed: 0,PeriodId,LobName,Max_Loss,rate,cumrate,RPs,TCE_OEP_1,TCE_OEP_2,TCE_OEP_Final
147788,149148,AUTO,1.654085e+09,0.000004,0.000004,250000.000000,869.991566,,
197324,199148,AUTO,1.509087e+09,0.000004,0.000008,125000.000000,1496.296836,1.087489e+08,1.617836e+09
205656,207557,AUTO,1.359457e+09,0.000004,0.000012,83333.333333,559.156488,1.971907e+08,1.556648e+09
4264,4312,AUTO,1.319517e+09,0.000004,0.000016,62500.000000,1194.484886,1.828403e+08,1.502357e+09
152903,154312,AUTO,1.253157e+09,0.000004,0.000020,50000.000000,1155.475354,2.059965e+08,1.459153e+09
...,...,...,...,...,...,...,...,...,...
39586,39961,AUTO,9.574042e-01,0.000004,0.990836,1.009249,0.769077,9.833605e+06,9.833606e+06
161461,162941,AUTO,1.812157e-01,0.000004,0.990840,1.009245,0.003417,9.833566e+06,9.833566e+06
239069,241283,AUTO,1.777667e-01,0.000004,0.990844,1.009241,0.004063,9.833526e+06,9.833527e+06
70349,71005,AUTO,1.736658e-01,0.000004,0.990848,1.009237,0.046673,9.833487e+06,9.833487e+06


In [14]:
dataframe_2

Unnamed: 0,PeriodId,LobName,Max_Loss,rate,cumrate,RPs,TCE_OEP_1,TCE_OEP_2,TCE_OEP_Final
147788,149148,AUTO,1.654085e+09,0.000004,0.000004,250000.000000,869.991566,,
197324,199148,AUTO,1.509087e+09,0.000004,0.000008,125000.000000,1496.296836,1.087489e+08,1.617836e+09
205656,207557,AUTO,1.359457e+09,0.000004,0.000012,83333.333333,559.156488,1.971907e+08,1.556648e+09
4264,4312,AUTO,1.319517e+09,0.000004,0.000016,62500.000000,1194.484886,1.828403e+08,1.502357e+09
152903,154312,AUTO,1.253157e+09,0.000004,0.000020,50000.000000,1155.475354,2.059965e+08,1.459153e+09
...,...,...,...,...,...,...,...,...,...
39586,39961,AUTO,9.574042e-01,0.000004,0.990836,1.009249,0.769077,9.833605e+06,9.833606e+06
161461,162941,AUTO,1.812157e-01,0.000004,0.990840,1.009245,0.003417,9.833566e+06,9.833566e+06
239069,241283,AUTO,1.777667e-01,0.000004,0.990844,1.009241,0.004063,9.833526e+06,9.833527e+06
70349,71005,AUTO,1.736658e-01,0.000004,0.990848,1.009237,0.046673,9.833487e+06,9.833487e+06


In [19]:
dataframe_2[dataframe_2['PeriodId'] ==99346]

Unnamed: 0,PeriodId,LobName,Max_Loss,rate,cumrate,RPs,TCE_OEP_1,TCE_OEP_2,TCE_OEP_Final
98434,99346,AUTO,377800100.0,4e-06,0.001,1000.0,453.194274,185139500.0,562939600.0


In [1]:
dataframe_3

NameError: name 'dataframe_3' is not defined

In [5]:
import pandas as pd
import pyarrow.parquet as pq

# Define the path to the Parquet file
file_path = r'D:\RISHIN\13_ILC_TASK1\input\PARQUET_FILES\PLT_0_100.parquet'

# Read the Parquet file into a PyArrow Table
table = pq.read_table(file_path)

# Convert the PyArrow Table to a Pandas DataFrame
df = table.to_pandas()

# Display the first few rows of the DataFrame
print(df.head())

   LocationId LocationName LobName  LobId  PeriodId   EventId    Weight  \
0      883578  BE_AGR_7133     AGR      1         5  53863678  0.000004   
1      883529  BE_AGR_4140     AGR      1         8  53876196  0.000004   
2      883583  BE_AGR_7533     AGR      1         5  53863678  0.000004   
3      911623  BE_AGR_7534     AGR      1         5  53863678  0.000004   
4      943056  BE_AGR_7910     AGR      1         5  53863678  0.000004   

   EventDate   LossDate Region Peril          Loss  
0 2020-01-05 2020-01-05     EU    WS    560.115551  
1 2020-01-27 2020-01-27     EU    WS  18809.439807  
2 2020-01-05 2020-01-05     EU    WS    792.873550  
3 2020-01-05 2020-01-05     EU    WS   7236.296823  
4 2020-01-05 2020-01-05     EU    WS    102.358413  
