In [1]:
import os
import gc
import pandas as pd

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [2]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [3]:
###########################################
########### FUNCTIONS #####################
###########################################



def generate_summary(test_samples, model, tokenizer, encoder_max_length, decoder_max_length):

    inputs = tokenizer(
        test_samples,
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
        return_tensors="pt",
    )
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)
    outputs = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=decoder_max_length)
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return outputs, output_str

In [4]:
examples = ["""lapply(libraries, function(x) if (!(x %in% installed.packages())) {
    install.packages(x)
})
lapply(libraries, library, quietly = TRUE, character.only = TRUE)

DAX = read.csv("data_DAX091222-111222.csv")
dax1 = DAX[2:(length(DAX[, 7]) - 1), 7]
dax2 = DAX[3:length(DAX[, 7]), 7]
z = log(dax1) - log(dax2)  # log returns

ks.test(z, "pnorm", list(mean = 0, sd = 1), H = NA, sim = 500, tol = 1e-05, estfun = NA) 
""", 
"""close all
clear all
clc

disp('Please input number of draws n as') ;
disp(' ') ;
n  = input('[n]=');
y  = normrnd(0,1,n,1);      % Generate standard normal random numbers
y  = sort(y);
cdfplot(y)                  % Plot the empirical distribution function
hold on
f  = cdf('Normal',y,0,1);   % Generate normal cumulative distribution function

plot(y,f,'r','LineWidth',2.5)
legend('Empirical','Theoretical','Location','NW')
title('EDF and CFD')
xlabel('X')
ylabel('EDF(X), CDF(X)')
grid off

[g,y] = ecdf(y);
g     = g(2:(n+1));
[C,I] = max(abs(f-g))
hold off
]""", """
df = pd.read_csv('data/EECI_TWh - TWh per Year-2020-12-09.csv', na_values='#DIV/0!')
df.fillna(method='ffill', inplace=True)
df['Date'] = pd.to_datetime(df['Date'])
df.index = df['Date']
df.drop(['Date'], axis=1, inplace=True)
fig, ax1 = plt.subplots(figsize=(20,10))

color = '#bdb2ff'
ax1.plot(df['Estimated TWh per Year'], color=color)
color = '#023047'
ax1.plot(df['Minimum TWh per Year'], color=color)
ax1.tick_params(axis='y', labelcolor=color)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

fig.tight_layout() 
plt.savefig('ethereum_energy_consumption.png', transparent=True)""",

"""
from SFM_Hurst_Exponent.AR import ar
import os
os.chdir('SFM_Hurst_Exponent/')

# Execute Download
# from SFM_Hurst_Exponent import binance_download

# Read and select Close price (Header: Timestamp, OHLCV, ...)
p = pd.read_json('Binance_BTCUSDT_1m_1577836800000-1580342400000.json')
df = p[[4]]
df['changes'] = df.pct_change()
changevec = df['changes'].dropna() + 1
changevec = changevec.dropna()

random_changes = np.array(changevec)
series = np.cumprod(random_changes)

H_l = []
c_l = []
data_l = []
mse_l = []
series_splits = np.array_split(series, 100)

for currseries in series_splits:
    H, c, data = compute_Hc(currseries, kind='price', simplified=True)
    H_l.append(H)
    c_l.append(c)
    data_l.append(data)
    mse_l.append(ar(currseries - 1))  # Get back to simple returns around 0

# Evaluate Hurst equation for complete data set
H, c, data = compute_Hc(series, kind='price', simplified=True)

# Use Autoregressive Model to predict price for periods of differing H.
# Would expect high H periods (e.g. > 0.5) to have a smaller Mean Squared Error than other periods
# Then use some hypothesis test on the mean difference.

# Use t-test for mse_l small when H_l large
cutoff_int = 0.5
idx = np.where(np.array(H_l) > cutoff_int)
mse_supposedly_low = np.array(mse_l)[idx]""",
"""
gen_physical_density = function(tau, interest_rate, initial_price, days_to_maturity, curr_date, simmethod = 'SVCJ', n_simulations = 100, f = 'BTC_USD_QUANDL', cached_svcj_parameters = TRUE){
    
    days_to_maturity = max(days_to_maturity, 1)

    # Read historical Bitcoin Data
    if(f == 'BTCUSDT'){
        # Older, shorter data
        hist_dat = read.csv('data/BTCUSDT.csv', stringsAsFactors = FALSE)
        hist_dat$date = as.Date(hist_dat$Date)
        hist_dat = hist_dat[hist_dat$date <= curr_date,]
        hist_dat$Date = NULL
        tdat = ts(hist_dat$Adj.Close)
        rets = diff(log(tdat))
    }else if(f == 'BTC_USD_QUANDL'){
        
        hist_dat = read.csv('data/BTC_USD_Quandl.csv', stringsAsFactors = FALSE)
        hist_dat$date = as.Date(hist_dat$Date)
        hist_dat = hist_dat[hist_dat$date <= curr_date & hist_dat$date >= '2017-01-01',]
        hist_dat$Date = NULL
        hist_dat = hist_dat[rev(seq_len(nrow(hist_dat))), , drop = FALSE]
        tdat = ts(hist_dat$Adj.Close)
        rets = diff(log(tdat))
    }else{
        stop('no time series')
    }


    if(simmethod != 'SVCJ'){

        if(simmethod == 'garch'){
            print('using garch, delete this later!')

            # Estimate sigma with GARCH model
            # Parameters for calculation/simulation
            numbapprox  	= 2000			# fineness of the grid
            N		= n_simulations	# only run once, because the loop is happening in physicaldensity.R
            # Check return series for ARMA effects, e.g. with the following function
            # auto.arima(dax.retts, max.p=10, max.q=10, max.P=5, max.Q=5, 
            # start.p=1, start.q=1,start.P=1, start.Q=1, stationary=T, seasonal=F)
            p		= 0
            q		= 0
            arma= c(p,q)
            # specify garch order (need to be checked)
            m		= 1
            s		= 1
            garch		= c(m,s)
            garchmodel	= "eGARCH"
            submodel	= "GARCH"
            # underlying distribution (default: "sstd" - skewed stundent t's)
            # (alternatives: "norm" - normal, "ghyp"- generalized hyperbolic)
            udist		= "sstd"
            # set archm=T for ARCH in mean model (archpow specifies the power)
            archm		= F
            archpow		= 1
            # set include.mean = F if you don't want to include a mean in the mean model
            include.mean 	= T  
            spec			= ugarchspec(variance.model = list(model = garchmodel, 
                                garchOrder = garch, submodel = submodel), mean.model = 
                                list(armaOrder = arma, archm=archm,archpow=archpow,
                                include.mean=include.mean), distribution.model = udist)
            #garchfit = rugarch::ugarchfit(data = rets, spec = spec, solver = "hybrid")
            garchsim = ugarchsim(garchfit, n.sim = days_to_maturity, 
                                n.start = 0, m.sim=N, startMethod=("sample"), 
                                mexsimdata=TRUE)
            est_sigma = garchsim@simulation$sigmaSim""", 
            
            """rm(list=ls(all=TRUE))
graphics.off()
  
# install and load packages
libraries = c("kernlab","tseries","quadprog","zoo")
lapply(libraries, function(x) if (!(x %in% installed.packages())) {
    install.packages(x)})
lapply(libraries, library, quietly = TRUE, character.only = TRUE)  
  
  # generation of ideal data set: Xp with covariance (4,0,0,4) and Xn with covariance (0.25,0,0,0.25) for groups x(1) and x(-1)
p       = 4   # number of plots
n       = 200 # number of observations
set.seed(2)
# generating 2-variate data, member of group x(1)
  
sigma.p = matrix(c(4,0,0,4),2,2)
  
Mp      = t(chol(sigma.p))  				# Cholesky square root
Zp      = matrix(rnorm(n),2,100)  			# 2 row, 50 columns
Xp      = t(Mp %*% Zp)
Xp1     = Xp[,1]
Xp2     = Xp[,2]
  
  # generating 2-variate data, member of group x(-1)
  
sigma.n = matrix(c(0.25,0,0,0.25),2,2)
  
Mn      = t(chol(sigma.n))  				# Cholesky square root
Zn      = matrix(rnorm(n),2,100)  			# 2 row, 50 columns
Xn      = t(Mn %*% Zn)
Xn1     = Xn[,1]
Xn2     = Xn[,2]
  
  # Aggregating data
  
  X1      = c(Xp1,Xn1)
  X2      = c(Xp2,Xn2)
  
  # generating indicator variable
  
yp      = rep(1,n/2)
yn      = rep(-1,n/2)
  
Y       = c(yp,yn)
OP      = cbind(X2, X1)
  
## Main program of SVM classification plot
  
sgm = c(0.2,5,0.2,5)	# parameter r in anisotropic gaussian kernel
C = c(0.1,0.1,8,8)

for (i in 1:p){ 
  OrangePeelModel = ksvm(OP, Y, type="C-svc", kernel="rbfdot", kpar=list(sigma=sgm[i]), C=C[i], prob.model=TRUE, cross=4)
  str = paste("s=",sprintf("%0.1f",sgm[i]),", c=",sprintf("%0.1f",C[i]),"                          ",sep="")
  plot(OrangePeelModel, data=OP,cex=0.7,cex.main=1.5)
  title(sub=str)
  print(OrangePeelModel)
  }""", """% SMSsvmorange MATLAB Code
%
% translated from R by: Bey, Patrik (beypatri@gmail.com)
%

clear all

cd('./')                              %adjust working directory

%%
% generation of ideal data set: Xp with covariance (4,0,0,4) and Xn with covariance (0.25,0,0,0.25) for groups x(1) and x(-1)
%%

n = 200;                              % number of observations

%%
% generating 2-variate data, member of group x(1)
%%  
sigma_p = [4 0;0 4];
  
Mp      = chol(sigma_p);               % Cholesky square root
Zp      = randn(2,n/2);                % 2 row, 100 columns /in orig R-code 50 columns
Xp      = (Mp*Zp)';
Xp1     = Xp(:,1);
Xp2     = Xp(:,2);

%%
% generating 2-variate data, member of group x(-1)
%%
  
sigma_n = [0.25 0; 0 0.25];
  
Mn      = chol(sigma_n);               % Cholesky square root
Zn      = randn(2,n/2);                % 2 row, 100 columns /in orig R-code 50 columns
Xn      = (Mn*Zn)';
Xn1     = Xn(:,1);
Xn2     = Xn(:,2);

%%  
% Aggregating data
%%  

X1      = [Xp1;Xn1];
X2      = [Xp2;Xn2];
  
%%
%generating indicator variable
%%

yp      = ones(n/2,1);
yn      = -ones(n/2,1);
  
Y       = [yp;yn];                     % group labels used in SVM
OP      = [X2, X1]                     % training data used in SVM

%%%%%%
%% Main program of SVM classification plot
%%%%%%

%%
%define kernel parameter for SVM
%%
sgm = [0.2,5,0.2,5]'; %radial basis function kernel parameter sigma
C = [0.1,0.1,8,8]; %SVM cost function parameter C
  
for i = 1:length(sgm)
    figure(i)
    svmStruct = svmtrain(OP,Y,'ShowPlot',true,'kernel_function','rbf','rbf_sigma',sgm(i),'boxconstraint',C(i));
    title( ['SVM with sigma ' num2str(sgm(i)) ' and C ' num2str(C(i))])
end""", 
"""import pandas as pd
import requests
import re
import numpy as np
import ast
import time
from datetime import date
today = date.today()
We need to get the list of all hashes first
with open('address.txt', 'r') as file:
    address = file.read()
    
address_to_get_all = address + '?platform=ethereum&limit=100&page='

data = pd.DataFrame(columns=['category', 
                             'categories', 
                             'created', 
                             'name', 
                             'rank', 
                             'slug', 
                             'teaser'])
# we have the maximum of 20 pages, if the limit set to 100 apps per page
for page in range(1, 21):
    test = requests.get(f'{address_to_get_all}{page}')
    for i, item in enumerate(test.json()['items']):
        category = test.json()['items'][i]['categories']
        if len(category)>1:
            categories = ' '.join(category)
            category = category[0]
        else:
            categories = 0
            category = category[0]
        created = test.json()['items'][i]['created']
        name = test.json()['items'][i]['name']
        rank = test.json()['items'][i]['rank']
        slug = test.json()['items'][i]['slug']
        teaser = test.json()['items'][i]['teaser']
        df_temp = pd.DataFrame(data={'category': category, 
                                    'categories': categories,
                                    'created': created,
                                    'name': name,
                                    'rank': rank,
                                    'slug': slug,
                                    'teaser': teaser}, index=[0])
        data = pd.concat([data, df_temp])
    time.sleep(0.25)
data = data.drop('categories', axis=1)
data.to_csv(f'data/dapps_names_{today}.csv', index=False)
For each dapp, here or each slug, retrieve a list of hashes
data = pd.read_csv(f'data/dapps_names_{today}.csv')
address_for_dapp = f'{address}/'
data.reset_index(drop=True, inplace=True)
hashes = pd.DataFrame(columns=['slug', 'hash', 'description', 'license'])""", 
,
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kurtosis
import os

os.getcwd()
os.chdir("/Users/jane_hsieh/Library/CloudStorage/OneDrive-國立陽明交通大學/Data Science Analysis Templates/Machine Learning/Part 9 - Dimensionality Reduction/SectionExtra 3 - Independent Component Analysis (ICA)/FRM Data Analysis_Porfolio Construction_with ICA-PCA")  


data_dir = './Data'
output_dir = './Output'

# ====================================  0. Input data: FRM prices / returns ====================================================
# returns
df_return = pd.read_csv(data_dir+'/FRM_CHHKTW_Time_Series_Returns_20201030.csv', parse_dates=['Date'], index_col = 'Date')



col_chosen = [28, 159, 172, 182, 153, 108, 41, 154, 107, 173, 27, 122, 64, 163, 105] #[28, 159, 172, 182, 153] #
col_chosen = [i-1 for i in col_chosen]


stocks = df_return.columns[col_chosen];print(stocks)

#df_return = df_return.iloc[:,col_chosen]
#df_return.columns



df = df_return.iloc[:,col_chosen]
del df_return


df.columns = [i.replace('.EQUITY', '') for i in df.columns]
df.columns

# prices
df_price = pd.read_csv(data_dir+'/FRM_SHSZ300HSITWSE_Stock_Prices_update_20201030.csv', parse_dates=['Date'], index_col = 'Date')

stocks2 = df_price.columns[col_chosen];print(stocks2) #check if stocks2== stocks

df_price = df_price.iloc[:,col_chosen]
df_price.columns = df.columns


## 0.1 Missing data imputation ---------------------------------------------------------------
print("Numnber of missing data for returns:: \n", df.isnull().sum())
print("Numnber of missing data for prices: \n", df_price.isnull().sum())
'''
#Only sporatic missing points, hence we simply perform linear interpolation method to those missing values

df_return.fillna(0) #since supposed stock price has no change for missing value; i.e., df_price.fillna(method='ffill', inplace=True)
'''
df_price.fillna(method='ffill', inplace=True)




## 0.2 Visualization: Multidimensional Time Series Data Plot (FRM) ---------------------------------------------------------------
start = '2019-01-02'
end =  '2020-10-30'

df[start: end].plot(figsize=(15,6))
#plt.legend(fancybox=True, framealpha=0.0, loc = 'upper right', prop={'size': 8})
plt.legend(bbox_to_anchor=(1.02, 1),loc='upper left', borderaxespad=0., 
           fancybox=True, framealpha=0.0, prop={'size': 8})
plt.title(f'Daily returns of each stock (FRM@Asia) from {start} to {end}')

plt.show()
plt.savefig(data_dir+f'/Multidimensional Daily Returns (FRM) from {start} to {end} ({len(col_chosen)}D).png', transparent = True)
plt.close()"""]


example_URI = ["https://github.com/QuantLet/MSE/tree/master/MSEKolmogorov-Smirnovtest",
                "https://github.com/QuantLet/MSE/tree/master/MSEGCthmnorm",
                "https://github.com/QuantLet/USC/tree/master/SC-energy-consumption", 
                "https://github.com/QuantLet/SFM_Class_2019WS/tree/master/SFM_Hurst_Exponent",
                "https://github.com/QuantLet/BitcoinPricingKernels/tree/master/BitcoinPricingKernels", 
                "https://github.com/QuantLet/SMS2/tree/master/SMSsvmorange",
                "https://github.com/QuantLet/SMS2/tree/master/SMSsvmorange", 
                "https://github.com/QuantLet/USC/tree/master/SC-Dapp-scraping", 
                "https://github.com/QuantLet/DigitalEconomyDecisionAnalytics/tree/master/DEDA%20-%20Independent%20Component%20Analysis%20(ICA)/FRM%20Data%20Analysis_Porfolio%20Construction_with%20ICA-PCA"]
 
example_language = ["R", "M", "Py", "Py", "R", "R", "M", "Py", "Py"]

In [5]:
model_name_path = {#"FlanT5 ZS" : "google/flan-t5-base",
"FlanT5 FT": "/home/RDC/zinovyee.hub/H:/zinovyee.hub/IRTG/MLSC/Encode-the-Qode/Quantlet/4-qode2desc/reports/analysis_report_google/flan-t5-base_no_context_val_20231119_normal/results/checkpoint-10260",
"CodeT5 ZS" : "Salesforce/codet5-base-multi-sum",
"CodeT5 FS" : "/home/RDC/zinovyee.hub/H:/zinovyee.hub/IRTG/MLSC/Encode-the-Qode/Quantlet/4-qode2desc/reports/analysis_report_few_shot_CodeT5_no_context_test_20231104/results/checkpoint-1500",
"CodeT5 FT" : "/home/RDC/zinovyee.hub/H:/zinovyee.hub/IRTG/MLSC/Encode-the-Qode/Quantlet/4-qode2desc/reports/analysis_report_CodeT5_no_context_test_20231104/results/checkpoint-11000",
"CodeTrans ZS": "SEBIS/code_trans_t5_base_source_code_summarization_python_multitask",
"CodeTrans FS": "/home/RDC/zinovyee.hub/H:/zinovyee.hub/IRTG/MLSC/Encode-the-Qode/Quantlet/4-qode2desc/reports/analysis_report_few_shot_CodeTrans_no_context_test_20231104/results/checkpoint-1500",
"CodeTrans FT": "/home/RDC/zinovyee.hub/H:/zinovyee.hub/IRTG/MLSC/Encode-the-Qode/Quantlet/4-qode2desc/reports/analysis_report_CodeTrans_no_context_val_20231104/results/checkpoint-1000",}

In [6]:
results = {}
for model_name, model_path in model_name_path.items():
    print("Loading ", model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    print("Tokenizing with ", model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_path, skip_special_tokens=False)
    
    print("Generating with ", model_name)

    if model_name.startswith("Flan"):
        examples_inst = [f"Summarize the code: {example}" for example in examples]
        summaries = generate_summary(examples_inst, model, tokenizer, 512, 300)
    else: 
        summaries = generate_summary(examples, model, tokenizer, 512, 300)

    results[model_name] = summaries[1]
    print("Summmary :", summaries[1])
    
    print("Release cache")
    gc.collect()
    torch.cuda.empty_cache()

Loading  FlanT5 FT
Tokenizing with  FlanT5 FT
Generating with  FlanT5 FT
Summmary : ['Performs a Kolmogorov-Smirnov test on the DAX log return data from 2009-12-21 to 2011-12-22', 'Draws n observations from a standard normal distribution and plots its empirical distribution function vs. the normal cumulative distribution function. Number of draws can be entered interactively.', 'Estimates the optimal portfolio weights with a minimum spanning tree, and plots the optimal portfolio weights with a minimum spanning tree.', 'Use Autoregressive Model to predict price for periods of differing H.', "Reads historical Bitcoin Data, if (f == 'BTCUSDT') # Older, shorter data, hist_dat = read.csv", "'Computes the 2D map of 2 groups of the SVM classification using anipotropic Gaussian kernel for two different groups of the dataset.'", 'Generates a two-dimensional scatterplot of two-dimensional observations and two-dimensional scatterplots for the two-dimensional SVM data.', 'This Quantlet generates t

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Tokenizing with  CodeTrans ZS
Generating with  CodeTrans ZS
Summmary : ['# install.packages(library) return"', "disp(' '); = n; % Quit loop }; %", '# plt.show() # plt.show() # plt.show()', '# import pandas as pd import numpy as np # Execute Download # # # # # # Compute the mean difference # # Compute the mean difference # # Compute the', ',, max.Q=10, max.R=5) # return', ', #tseries_) # # # #) # # generating a model for each plot) # generating a model for each plot #,) # Generating a model for each plot) #', "%% % %% %','rbf_sigma','rbf_sigma'); %%", 'import os import json import json import json import json import json import json import json', 'coding: utf', '# # # # missing data for prices:: # missing data for prices:: #, df.isnull().sum())']
Release cache
Loading  CodeTrans FS
Tokenizing with  CodeTrans FS
Generating with  CodeTrans FS
Summmary : ['Computes the p-norm of the DAX in DAX order for the DAX of DAX DAX data from DAX091222-091222.', 'Draws n observations from standard no

In [19]:
FlanT5_ZS = ["# Close windows and clear variables rm(list = ls(all = TR", "[g,y] = ecdf(y); g = g", "import numpy as np import pandas as pd import matplot", "import pd, scipy, scipy, np import", "# # # # # # #", """# install and load packages libraries = c("kernlab","tseries""",
             "MATLAB Code % % translated from R by", "# import re import numpy as np import ast import time from date", 
             "#plt.legend(fancybox=True, framealpha"]

In [16]:
results_df = pd.DataFrame.from_dict(results)
results_df["FlanT5_ZS"] = FlanT5_ZS


In [20]:
results_df.to_csv("results_20240526.csv", index=False)

In [21]:
!pwd

/usr/net/zinovyee.hub/IRTG/MLSC/Encode-the-Qode/Quantlet/4-qode2desc
