In [None]:
import pandas as pd
import os
import time
import numpy as np
import torch

In [None]:
import seaborn as sns
from scipy import stats
from statistics import NormalDist
import matplotlib.pyplot as plt

In [None]:
#export
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
#! pip install dowhy
from dowhy import CausalModel
import dowhy.datasets, dowhy.plotter


In [None]:
import tensorflow as tf
#physical_devices = tf.config.list_physical_devices('CPU')
# Ask TensorFlow for the GPU devices it can see.
# NOTE(review): `physical_devices` is not read again in the cells visible here - confirm it is still needed.
physical_devices = tf.config.list_physical_devices('GPU')

In [None]:
# Pick the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
_device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
device


In [None]:
%matplotlib inline

# Confirmatory Causal Analysis
>
> Confirmatory Experiments of Logits or Next Token Predictions @danaderp
>

### Data Upload

In [None]:
# Load the processed per-sample results; the first CSV column is used as the row index.
# NOTE(review): absolute, machine-specific path - presumably a dev-container layout; confirm before sharing.
pd_combined_models_local = pd.read_csv(
    '/workspaces/CodeSyntaxConcept/data/ds_processed_logits_local/out_astevalverticalfiltered_local.csv',
    index_col=0,
)

In [None]:
pd_combined_models_local.shape

In [None]:
pd_combined_models_local.head(1)

In [None]:
# Column groups used throughout the analysis.

# Columns passed as `common_causes` when every confounder is controlled for.
confounders = ['size', 'ast_levels', 'complexity', 'n_ast_nodes']

# Syntax-concept columns; each one is used in turn as the treatment / correlated variable.
performance = [
    'for_statement', 'while_statement', 'return_statement',
    ']', ')',
    'if_statement', 'comparison_operator', 'boolean_operator',
    'for_in_clause', 'if_clause', 'list_comprehension', 'lambda',
    'identifier', 'string',
]

# Outcome column of the causal model.
outcome = ['loss']

# Columns describing the model that produced each row.
features = ['id', 'type_model', 'size_model']

In [None]:
# Every distinct model identifier found in the `id` column (a set, despite the variable name).
list_model = set(pd_combined_models_local['id'].values)
list_model

In [None]:
# Overrides the set built from the data: only these two model ids are analysed below.
list_model = {'gpt-3 [125M]','mono-lang [2B]'} # Considering only edge cases

## Descriptive Statistics

In [None]:
sns.set_theme(style="darkgrid")

In [None]:
pd_combined_models_local.groupby(['id']).describe() #.reset_index()

## Correlation Analysis

In [None]:
def pearson(np_x, np_y):
    '''
    Pearson correlation coefficient between two samples.

    np_x, np_y: array-likes forwarded unchanged to `scipy.stats.pearsonr`.
    Returns only the correlation coefficient; the p-value that
    `pearsonr` also computes is discarded.
    '''
    result = stats.pearsonr(np_x, np_y)
    return result[0]

In [None]:
# Filter the frame once per model instead of re-running the same boolean mask
# for every (concept, target) pair.
frames_by_model = {
    model: pd_combined_models_local[pd_combined_models_local['id'] == model]
    for model in list_model
}

# Columns each concept is correlated against, in the order they appear in a result row.
correlation_targets = ['loss', 'complexity', 'ast_levels', 'n_ast_nodes', 'size']

# concept column -> one row per model:
#   [model, r(loss), r(complexity), r(ast_levels), r(n_ast_nodes), r(size)]
correlations_performance = {
    col: [
        [model] + [
            pearson(df_model[col].values, df_model[target].values)
            for target in correlation_targets
        ]
        for model, df_model in frames_by_model.items()
    ]
    for col in performance
}

In [None]:
correlations_performance

In [None]:
# Filter the frame once per model rather than twice for every (confounder, model) pair.
frames_by_model = {
    model: pd_combined_models_local[pd_combined_models_local['id'] == model]
    for model in list_model
}

# confounder column -> one row per model: [model, r(confounder, loss)]
correlations_loss = {
    col: [
        [model, pearson(df_model[col].values, df_model['loss'].values)]
        for model, df_model in frames_by_model.items()
    ]
    for col in confounders
}

In [None]:
correlations_loss

## Causation Analysis

In [None]:
def causal_effect(
    df_data = None,
    treatment = ['for_statement'],
    outcome = ['loss'],
    common_causes = ["size"]
):
    '''
    Estimate the effect of `treatment` on `outcome` with DoWhy and run two refuters.

    df_data: DataFrame handed to `CausalModel`. When None (the default), the
        'gpt-3 [125M]' rows of `pd_combined_models_local` are used; the lookup
        happens at call time, so it always reflects the currently loaded data.
    treatment: list of treatment column names.
    outcome: list of outcome column names.
    common_causes: list of column names passed as common causes.

    Returns a 3-tuple:
        (estimated effect,
         new effect reported by the "placebo_treatment_refuter",
         new effect reported by the "random_common_cause" refuter)
    '''
    # Resolve the default lazily: a default expression in the signature would be
    # evaluated once, when this cell runs, and go stale if the data is reloaded.
    if df_data is None:
        df_data = pd_combined_models_local[pd_combined_models_local['id']=='gpt-3 [125M]']

    # I. Create a causal model from the data and given graph.
    causal_model = CausalModel(
        data = df_data,
        treatment = treatment,
        outcome = outcome,
        common_causes = common_causes
        )
    # II. Identify causal effect and return target estimands
    identified_estimand = causal_model.identify_effect(proceed_when_unidentifiable=True)

    # III. Estimate the target estimand using a statistical method (linear regression).
    linear_estimate = causal_model.estimate_effect(identified_estimand,
                                 method_name="backdoor.dowhy.linear_regression")

    # IV. Refute the obtained estimate using multiple robustness checks.
    placebo_refutation = causal_model.refute_estimate(identified_estimand, linear_estimate,
                                       method_name="placebo_treatment_refuter")

    random_cause_refutation = causal_model.refute_estimate(identified_estimand, linear_estimate,
                                       method_name="random_common_cause")

    return linear_estimate.value, placebo_refutation.new_effect, random_cause_refutation.new_effect

In [None]:
#TST
# Smoke test: run the pipeline once with the function's default arguments and
# unpack the three values it returns (estimate, placebo effect, random-common-cause effect).
causal_eff, res_placebo, res_random = causal_effect()
#print(causal_eff, res_placebo, res_random )

In [23]:
# concept column -> one row per model: [model, (estimate, placebo effect, random-common-cause effect)]
# Every call receives a freshly filtered frame for its model.
causation_concepts = {
    col: [
        [
            model,
            causal_effect(
                df_data=pd_combined_models_local[pd_combined_models_local['id'] == model],
                treatment=[col],
                outcome=outcome,
                common_causes=confounders,  # controlling all confounders
            ),
        ]
        for model in list_model
    ]
    for col in performance
}

2023-05-02 02:48:30,889 : INFO : Using a Normal Distribution with Mean:0 and Variance:0
2023-05-02 02:48:30,902 : INFO : b: loss~placebo+n_ast_nodes+size+ast_levels+complexity
2023-05-02 02:48:30,935 : INFO : Using a Normal Distribution with Mean:0 and Variance:0
2023-05-02 02:48:30,963 : INFO : b: loss~placebo+n_ast_nodes+size+ast_levels+complexity
2023-05-02 02:48:31,052 : INFO : Using a Normal Distribution with Mean:0 and Variance:0
2023-05-02 02:48:31,064 : INFO : b: loss~placebo+n_ast_nodes+size+ast_levels+complexity
2023-05-02 02:48:31,152 : INFO : Using a Normal Distribution with Mean:0 and Variance:0
2023-05-02 02:48:31,175 : INFO : b: loss~placebo+n_ast_nodes+size+ast_levels+complexity
2023-05-02 02:48:31,227 : INFO : Using a Normal Distribution with Mean:0 and Variance:0
2023-05-02 02:48:31,240 : INFO : b: loss~placebo+n_ast_nodes+size+ast_levels+complexity
2023-05-02 02:48:31,274 : INFO : Using a Normal Distribution with Mean:0 and Variance:0
2023-05-02 02:48:31,286 : INFO :

KeyboardInterrupt: 

In [None]:
causation_concepts

### Visual Correlations

In [None]:
#Check here: https://seaborn.pydata.org/generated/seaborn.pairplot.html
g = sns.pairplot(data=pd_processed_logits)
g.map_upper(sns.scatterplot,marker=".")
g.map_lower(sns.kdeplot,levels=3,color=".2")
g.map_diag(sns.histplot)

In [None]:
sns.set_style("darkgrid")


def scatter_per_model(x, y, c, models=('125M', '1.3B', '2.7B')):
    '''
    Draw one scatter plot per model from `pd_processed_logits`.

    x, y: column names for the horizontal / vertical axis.
    c: column name whose values colour the points ("magma" colormap).
    models: values of the `model` column to plot, one figure each.

    Returns the list of matplotlib Axes, in the order of `models`.
    NOTE(review): `pd_processed_logits` is not defined in the cells visible
    above this one - confirm where it is loaded.
    '''
    axes = []
    for model in models:
        rows = pd_processed_logits[pd_processed_logits['model'] == model]
        ax = rows.plot.scatter(
            x=x,
            y=y,
            c=c,
            s=1,
            cmap="magma",  # alternatives: "viridis", "plasma"
        )
        # Without a title the per-model figures cannot be told apart.
        ax.set_title(f"model = {model}")
        axes.append(ax)
    return axes


# Concept frequency vs. loss, coloured by `size` (the column treated as common cause).
scatter_per_model(x='function_definition', y='loss', c='size')

# `size` vs. concept frequency, coloured by the outcome (`loss`).
scatter_per_model(x='size', y='function_definition', c='loss')

#Check here for more plots: https://seaborn.pydata.org/tutorial/regression.html
# Same view as the first group for a second concept.
scatter_per_model(x='expression_statement', y='loss', c='size');

## Causal Analysis (implementation)

In [None]:
# Testing Block
def causal_data( ):
    '''
    Build a synthetic dataset with `dowhy.datasets.linear_dataset`.

    Fixed settings: beta=2, 20 common causes, no instruments, 10000 samples,
    a non-binary treatment and a treatment-noise standard deviation of 10.

    Returns whatever `linear_dataset` returns.
    NOTE(review): presumably a dict exposing the frame under "df" - confirm against DoWhy.
    '''
    return dowhy.datasets.linear_dataset(
        beta=2,
        num_common_causes=20,
        num_instruments=0,
        num_samples=10000,
        treatment_is_binary=False,
        stddev_treatment_noise=10,
    )

In [None]:
pd_processed_logits[pd_processed_logits['model']=='125M'].head(1)

In [None]:
def causal_effect(
    df_data = None,
    treatment = ['attribute'],
    outcome = ['loss'],
    common_causes = ["size"]
):
    '''
    Estimate the effect of `treatment` on `outcome` with DoWhy and run two refuters.

    NOTE(review): this redefines the `causal_effect` declared earlier in the
    notebook with different defaults; whichever cell ran last wins.

    df_data: DataFrame handed to `CausalModel`. When None (the default), the
        rows of `pd_processed_logits` whose `model` is '125M' are used; the
        lookup happens at call time rather than when this cell is run.
    treatment: list of treatment column names.
    outcome: list of outcome column names.
    common_causes: list of column names passed as common causes.

    Returns a 3-tuple:
        (estimated effect,
         new effect reported by the "placebo_treatment_refuter",
         new effect reported by the "random_common_cause" refuter)
    '''
    # Resolve the default lazily so that defining the function neither requires
    # `pd_processed_logits` to exist yet nor freezes a stale copy of it.
    if df_data is None:
        df_data = pd_processed_logits[pd_processed_logits['model']=='125M']

    # I. Create a causal model from the data and given graph.
    causal_model = CausalModel(
        data = df_data,
        treatment = treatment,
        outcome = outcome,
        common_causes = common_causes
        )
    # II. Identify causal effect and return target estimands
    identified_estimand = causal_model.identify_effect(proceed_when_unidentifiable=True)

    # III. Estimate the target estimand using a statistical method (linear regression).
    linear_estimate = causal_model.estimate_effect(identified_estimand,
                                 method_name="backdoor.dowhy.linear_regression")

    # IV. Refute the obtained estimate using multiple robustness checks.
    placebo_refutation = causal_model.refute_estimate(identified_estimand, linear_estimate,
                                       method_name="placebo_treatment_refuter")

    random_cause_refutation = causal_model.refute_estimate(identified_estimand, linear_estimate,
                                       method_name="random_common_cause")

    return linear_estimate.value, placebo_refutation.new_effect, random_cause_refutation.new_effect
    

In [None]:
# I. Create a causal model from the data and given graph.
# Rows of the 125M model; 'attribute' is the treatment, 'loss' the outcome, 'size' the common cause.
df_125m = pd_processed_logits[pd_processed_logits['model'] == '125M']
causal_model = CausalModel(
    data=df_125m,
    treatment=['attribute'],
    outcome=['loss'],
    common_causes=["size"],
)

In [None]:
# II. Identify causal effect and return target estimands
# `proceed_when_unidentifiable=True` is passed so identification continues without stopping to ask.
identified_estimand = causal_model.identify_effect(proceed_when_unidentifiable=True)
# Log the estimand through the INFO-level logger instead of printing it.
logging.info(identified_estimand)

In [None]:
# III. Estimate the target estimand using a statistical method.
# NOTE: despite the variable name, the estimator selected here is linear regression,
# not propensity-score stratification.
propensity_strat_estimate = causal_model.estimate_effect(identified_estimand,
                                 method_name="backdoor.dowhy.linear_regression")

print(propensity_strat_estimate)

In [None]:
propensity_strat_estimate.value 

In [None]:
# Plot the estimate against the 125M model's treatment ('attribute') and outcome ('loss') columns.
rows_125m = pd_processed_logits[pd_processed_logits['model'] == '125M']
dowhy.plotter.plot_causal_effect(
    propensity_strat_estimate,
    rows_125m['attribute'],
    rows_125m['loss'],
)

In [None]:
sns.set_style("darkgrid")
# Reset the figure size to matplotlib's default and enlarge the font for this plot
# (rcParams are global, so later figures are affected too).
plt.rcParams["figure.figsize"] = plt.rcParamsDefault["figure.figsize"]
plt.rcParams.update({'font.size': 22})


# Treatment ('attribute') vs. outcome ('loss') for the 125M rows, coloured by the common cause.
pd_processed_logits[pd_processed_logits['model']=='125M'].plot.scatter(
    x = 'attribute',
    y = 'loss',
    c = 'size', #Common Causes
    s = 1,
    cmap="magma" #cmap="viridis" plasma
)

In [None]:
# IV. Refute the obtained estimate using multiple robustness checks.
# Only the placebo-treatment refuter is run in this cell.
refute_results = causal_model.refute_estimate(identified_estimand, propensity_strat_estimate,
                                       method_name="placebo_treatment_refuter")
print(refute_results)

In [None]:
refute_results.new_effect