In [13]:
import numpy as np
import pandas as pd
import csv
import glob

import matplotlib.pyplot as plt

from scipy.stats import kstest
from scipy.stats import normaltest
from scipy.stats import norm

from itertools import combinations
import pyvinecopulib as pv

import seaborn as sns
from scipy import stats

In [14]:
# Assuming you have three different CSV files with file paths
file1 = r'CompassPaper/HKNW.csv'
file2 = r'CompassPaper/FACT.csv'
file3 = r'CompassPaper/HRYR.csv'

# Read the three CSV files into separate dataframes
df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)
df3 = pd.read_csv(file3)

# Get the column names
NBO_dat = df1.dropna()
CT_dat = df2.dropna()
KGL_dat = df3.dropna()

NBO_dat = NBO_dat.drop(NBO_dat.columns[0], axis=1)
CT_dat = CT_dat.drop(CT_dat.columns[0], axis=1)
KGL_dat = KGL_dat.drop(KGL_dat.columns[0], axis=1)


In [15]:
KGL_dat_tmx = KGL_dat[['t_max_24']]
NBO_dat_tmx = NBO_dat[['t_max_24']]
CT_dat_tmx = CT_dat[['t_max_24']]

In [16]:
appended_df = pd.concat([KGL_dat_tmx, NBO_dat_tmx, CT_dat_tmx], axis=1)
appended_df

Unnamed: 0,t_max_24,t_max_24.1,t_max_24.2
0,22.0,23.0,22.0
1,25.0,25.0,30.0
2,24.0,26.0,29.0
3,25.0,26.0,30.0
4,27.0,26.0,24.0
...,...,...,...
723,25.0,20.0,25.0
724,26.0,25.0,25.0
725,24.0,24.0,21.0
726,24.0,25.0,28.0


In [17]:
KGL_dat_tmx = KGL_dat_tmx.rename(columns={'t_max_24': 'col1'})
NBO_dat_tmx = NBO_dat_tmx.rename(columns={'t_max_24': 'col2'})
CT_dat_tmx = CT_dat_tmx.rename(columns={'t_max_24': 'col3'})


In [6]:
appended_df = pd.concat([KGL_dat_tmx, NBO_dat_tmx, CT_dat_tmx], axis=1)
appended_df

Unnamed: 0,col1,col2,col3
0,22.0,23.0,22.0
1,25.0,25.0,30.0
2,24.0,26.0,29.0
3,25.0,26.0,30.0
4,27.0,26.0,24.0
...,...,...,...
723,25.0,20.0,25.0
724,26.0,25.0,25.0
725,24.0,24.0,21.0
726,24.0,25.0,28.0


In [7]:
# column_names = KGL_dat.columns.tolist()
# # Transform copula data using the empirical distribution    
# KGL_data = pv.to_pseudo_obs(KGL_dat)
# KGL_data = pd.DataFrame(KGL_data, columns= column_names)

# KGL_data.to_csv("KGL_data_Emp.csv", index=False)


In [8]:
appended_df.corr()

Unnamed: 0,col1,col2,col3
col1,1.0,0.185563,-0.170408
col2,0.185563,1.0,0.376979
col3,-0.170408,0.376979,1.0


In [11]:

def process_dataset(dataset):
    # get the column names
    columns = dataset.columns

    # get the combinations of the columns
    comb = combinations(columns, 2)

    # List of copulas to test
    copulas = [pv.Bicop(pv.BicopFamily.gaussian), pv.Bicop(pv.BicopFamily.student),
               pv.Bicop(pv.BicopFamily.clayton), pv.Bicop(pv.BicopFamily.gumbel),
               pv.Bicop(pv.BicopFamily.frank)]

    # Create an empty dataframe to store the results
    result_df = pd.DataFrame(columns=["col1", "col2", "BIC", "LogLikelihood", "AIC", "Copula", "BicopFamily"])

    for cols in comb:
        col1, col2 = cols
        # select the columns
        X = pd.to_numeric(dataset[col1], errors='coerce')
        Y = pd.to_numeric(dataset[col2], errors='coerce')
        Z = np.array(list(zip(X, Y)))  # Convert to numpy array

        # Transform copula data using the empirical distribution
        u = pv.to_pseudo_obs(Z)

        # initialize the best BIC, loglikelihood, AIC, and copula
        best_bic = float("inf")
        best_loglik = -float("inf")
        best_aic = float("inf")
        best_copula = None

        for copula in copulas:
            copula.fit(data=u)
            loglik = copula.loglik(u)
            aic = copula.aic(u)

            if aic < best_aic:
                best_aic = aic
                best_bic = copula.bic(u)
                best_loglik = loglik
                best_copula = copula

        # print the results of the best copula
        print("Columns: ", cols)
        print("BIC: ", best_bic)
        print("LogLikelihood: ", best_loglik)
        print("AIC: ", best_aic)
        print("Copula: ", best_copula)
        print("BicopFamily: ", best_copula.family)

        # Append the results to the dataframe
        result_df = result_df.append({"col1": col1, "col2": col2, "BIC": best_bic, "LogLikelihood": best_loglik,
                                      "AIC": best_aic, "Copula": str(best_copula), "BicopFamily": best_copula.family},
                                     ignore_index=True)

    return result_df


In [12]:
# datasets = [NBO_dat, CT_dat, KGL_dat]  # Assuming NBO_dat, CT_dat, and KGL_dat are the actual datasets
datasets = appended_df
# Process each dataset and store the results
results = []
for idx, dataset in enumerate(datasets):
    result = process_dataset(dataset)
    results.append(result)

    # Save the results to a CSV file
    result.to_csv(f"result_dataset_{idx+1}.csv", index=False)

    print(f"Results for Dataset {idx+1} saved to result_dataset_{idx+1}.csv.")
    print()

# Print or further process the results as needed
for idx, result in enumerate(results):
    print(f"Results for Dataset {idx+1}:")
    print(result)
   


AttributeError: 'str' object has no attribute 'columns'