In [1]:
import dask.dataframe as dd
from dask.distributed import Client
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [10]:
# Line of business to analyse. Surrounding whitespace is stripped so that a stray
# space in the typed value does not make the exact-match LobName filter select nothing.
LobNamevar=input("Enter the LOBNAME").strip()

# Simulation period and number of samples; their product is the divisor of the
# per-period rate used by the OEP/AEP cells, so both must be strictly positive.
speriod=int(input("Enter the simulation period: "))
samples=int(input("Enter the number of samples: "))
if speriod <= 0 or samples <= 0:
    raise ValueError(
        "simulation period and number of samples must both be positive integers, "
        f"got speriod={speriod}, samples={samples}"
    )

In [28]:


# Folder containing the input Parquet files.
folder_path = r'D:\RISHIN\13_ILC_TASK1\input\PARQUET_FILES'

# Only these columns are used below, so the remaining columns of each file are not loaded.
needed_columns = ['LobName', 'EventId', 'PeriodId', 'Loss']

# All Parquet files in the folder. Sorted because os.listdir returns entries in
# arbitrary order, and a stable order keeps the floating-point sums reproducible.
parquet_files = sorted(
    os.path.join(folder_path, f)
    for f in os.listdir(folder_path)
    if f.endswith('.parquet')
)
if not parquet_files:
    # Fail with a clear message instead of a KeyError from the final groupby.
    raise FileNotFoundError(f"No .parquet files found in {folder_path}")

# Per-file partial aggregates. They are concatenated once after the loop:
# growing a DataFrame with pd.concat on every iteration re-copies all earlier rows.
partial_frames = []

# Process each Parquet file individually so only one file is held in memory at a time.
for file in parquet_files:
    # Read just the needed columns and convert to pandas.
    df = pq.read_table(file, columns=needed_columns).to_pandas()

    # Keep only the rows of the requested line of business.
    filtered_df = df[df['LobName'] == LobNamevar]

    # Sum Loss per (EventId, PeriodId) within this file and expose it as Sum_Loss.
    grouped_df = (
        filtered_df
        .groupby(['EventId', 'PeriodId'], as_index=False)
        .agg({'Loss': 'sum'})
        .rename(columns={'Loss': 'Sum_Loss'})
    )
    partial_frames.append(grouped_df)

final_grouped_df = pd.concat(partial_frames, ignore_index=True)

# The same (EventId, PeriodId) pair can occur in several files, so aggregate the
# partial sums once more, then order by total loss, largest first.
final_grouped_df = final_grouped_df.groupby(['EventId', 'PeriodId'], as_index=False).agg({'Sum_Loss': 'sum'})
final_grouped_df = final_grouped_df.sort_values(by=['Sum_Loss'], ascending=[False])

# Event-level result consumed by the following cells.
dataframe_1 = final_grouped_df

In [29]:
# Build dataframe_2 in one chain: largest Sum_Loss per PeriodId, exposed as
# Max_Loss, with periods ordered from the largest Max_Loss to the smallest.
dataframe_2 = (
    dataframe_1
    .groupby('PeriodId', as_index=False)
    .agg({'Sum_Loss': 'max'})
    .rename(columns={'Sum_Loss': 'Max_Loss'})
    .sort_values(by='Max_Loss', ascending=False)
)



In [30]:
# Build dataframe_3 in one chain: total of Sum_Loss per PeriodId, exposed as
# S_Sum_Loss, with periods ordered from the largest S_Sum_Loss to the smallest.
dataframe_3 = (
    dataframe_1
    .groupby('PeriodId', as_index=False)
    .agg({'Sum_Loss': 'sum'})
    .rename(columns={'Sum_Loss': 'S_Sum_Loss'})
    .sort_values(by='S_Sum_Loss', ascending=False)
)



In [95]:
# Per-row rate: 1 / (speriod * samples), the same value for every period.
# Kept at full precision. Rounding it to 6 decimals before the cumulative sum
# would repeat the rounding error on every row (e.g. 1/300000 -> 3e-06) and
# turn the rate into 0 once speriod * samples exceeds 2,000,000.
dataframe_2['rate'] = 1 / (speriod * samples)

# Running total of the rate down the frame (dataframe_2 is sorted by Max_Loss
# descending when it is built), rounded to 6 decimal places.
# NOTE(review): 6 decimals is the original precision; confirm it is fine enough
# for the speriod * samples values used in practice.
dataframe_2['cumrate'] = dataframe_2['rate'].cumsum().round(6)

# RPs is the reciprocal of the cumulative rate, rounded to 7 decimal places.
dataframe_2['RPs'] = (1 / dataframe_2['cumrate']).round(7)

# TCE_OEP_1: (Max_Loss - next row's Max_Loss) * average of this row's and the
# next row's cumrate, rounded to 6 decimal places.
# shift(-1) has no next row for the last row, so TCE_OEP_1 is NaN there and the
# last row of TCE_OEP_2 / TCE_OEP_Final is NaN as well.
dataframe_2['TCE_OEP_1'] = ((dataframe_2['Max_Loss'] - dataframe_2['Max_Loss'].shift(-1)) * 
                          (dataframe_2['cumrate'] + dataframe_2['cumrate'].shift(-1)) * 0.5).round(6)

# TCE_OEP_2: running total of TCE_OEP_1 multiplied by RPs, rounded to 6 decimal places.
dataframe_2['TCE_OEP_2'] = (dataframe_2['TCE_OEP_1'].cumsum() * dataframe_2['RPs']).round(6)

# TCE_OEP_Final: TCE_OEP_2 plus the row's own Max_Loss, rounded to 6 decimal places.
dataframe_2['TCE_OEP_Final'] = (dataframe_2['TCE_OEP_2'] + dataframe_2['Max_Loss']).round(6)

In [90]:
# Per-row rate: 1 / (speriod * samples), the same value for every period.
# Kept at full precision. Rounding it to 6 decimals before the cumulative sum
# would repeat the rounding error on every row (e.g. 1/300000 -> 3e-06) and
# turn the rate into 0 once speriod * samples exceeds 2,000,000.
dataframe_3['rate'] = 1 / (speriod * samples)

# Running total of the rate down the frame (dataframe_3 is sorted by S_Sum_Loss
# descending when it is built), rounded to 6 decimal places.
# NOTE(review): 6 decimals is the original precision; confirm it is fine enough
# for the speriod * samples values used in practice.
dataframe_3['cumrate'] = dataframe_3['rate'].cumsum().round(6)

# RPs is the reciprocal of the cumulative rate, rounded to 7 decimal places.
dataframe_3['RPs'] = (1 / dataframe_3['cumrate']).round(7)

# TCE_AEP_1: (S_Sum_Loss - next row's S_Sum_Loss) * average of this row's and
# the next row's cumrate, rounded to 2 decimal places.
# shift(-1) has no next row for the last row, so TCE_AEP_1 is NaN there and the
# last row of TCE_AEP_2 / TCE_AEP_Final is NaN as well.
dataframe_3['TCE_AEP_1'] = ((dataframe_3['S_Sum_Loss'] - dataframe_3['S_Sum_Loss'].shift(-1)) * 
                          (dataframe_3['cumrate'] + dataframe_3['cumrate'].shift(-1)) * 0.5).round(2)

# TCE_AEP_2: running total of TCE_AEP_1 multiplied by RPs, rounded to 2 decimal places.
dataframe_3['TCE_AEP_2'] = (dataframe_3['TCE_AEP_1'].cumsum() * dataframe_3['RPs']).round(2)

# TCE_AEP_Final: TCE_AEP_2 plus the row's own S_Sum_Loss, rounded to 2 decimal places.
dataframe_3['TCE_AEP_Final'] = (dataframe_3['TCE_AEP_2'] + dataframe_3['S_Sum_Loss']).round(2)

In [91]:
# Show the first 20 rows of dataframe_3.
dataframe_3.iloc[:20]

Unnamed: 0,PeriodId,S_Sum_Loss,rate,cumrate,RPs,TCE_AEP_1,TCE_AEP_2,TCE_AEP_Final
147788,149148,1660418000.0,4e-06,4e-06,250000.0,848.36,212090000.0,1872508000.0
197324,199148,1519025000.0,4e-06,8e-06,125000.0,1590.6,304870000.0,1823895000.0
205656,207557,1359965000.0,4e-06,1.2e-05,83333.333333,110.57,212460800.0,1572426000.0
4264,4312,1352067000.0,4e-06,1.6e-05,62500.0,1255.64,237823100.0,1589890000.0
152903,154312,1282309000.0,4e-06,2e-05,50000.0,1378.47,259182000.0,1541491000.0
103347,104312,1219651000.0,4e-06,2.4e-05,41666.666667,735.16,246616700.0,1466268000.0
191303,193067,1191376000.0,4e-06,2.8e-05,35714.285714,922.11,244318200.0,1435694000.0
236882,239080,1160639000.0,4e-06,3.2e-05,31250.0,139.7,218144100.0,1378783000.0
48693,49148,1156530000.0,4e-06,3.6e-05,27777.777778,477.2,207161400.0,1363692000.0
98237,99148,1143972000.0,4e-06,4e-05,25000.0,325.17,194574500.0,1338547000.0


In [92]:
# Show the dataframe_3 row(s) whose PeriodId is 76654.
dataframe_3.loc[dataframe_3['PeriodId'] == 76654]

Unnamed: 0,PeriodId,S_Sum_Loss,rate,cumrate,RPs,TCE_AEP_1,TCE_AEP_2,TCE_AEP_Final
75941,76654,286667500.0,4e-06,0.00252,396.825397,158.08,149752900.0,436420500.0


In [94]:
# Show the dataframe_3 row(s) whose PeriodId is 63817.
dataframe_3.loc[dataframe_3['PeriodId'] == 63817]

Unnamed: 0,PeriodId,S_Sum_Loss,rate,cumrate,RPs,TCE_AEP_1,TCE_AEP_2,TCE_AEP_Final
63238,63817,404257100.0,4e-06,0.001,1000.0,548.67,185787700.0,590044800.0


In [None]:
Similarly, I want another script. The first case is: 1) select PeriodId, EventId, sum(Loss) AS Sum_Loss where LobName == LobNamevar, grouped by PeriodId, EventId, ordered by Sum_Loss DESC
2) select PeriodId, max(Sum_Loss) AS Max_Loss from the result of the first case, grouped by PeriodId, ordered by Max_Loss DESC

I will give the third case later