In [None]:
# Disabled pin of protobuf, kept for reference.
#%pip install protobuf==3.20.1

# NOTE(review): both installs below are unpinned (and `-U` upgrades to the newest release),
# so a fresh run may resolve different versions than the ones that produced the saved outputs.
%pip install -U sentence-transformers
%pip install umap-learn


In [3]:
# Folder of this notebook, appended to the project location below.
QPATH = "Quantlet/Corpus_token_identification"
# Absolute working directory used when running on Colab (the setup cell chdir's into it).
# NOTE(review): hard-coded to one user's Drive layout; adjust when running elsewhere.
PATH = f'/content/drive/MyDrive/ColabNotebooks/IRTG/Encode_the_Qode/Encode-the-Qode/{QPATH}'

In [4]:
import os
import sys

# On Colab the kernel does not start inside the project, so move into the notebook folder
# first; the relative paths used below are resolved from there.
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
  os.chdir(PATH)

# Make the modules under ../src importable from this notebook.
sys.path.append('../src')

In [5]:
import pickle
import json
import re
import sys
from IPython.display import display

from tqdm import tqdm
tqdm.pandas()
import pandas as pd
import numpy as np
import umap

import matplotlib.pyplot as plt
import seaborn as sns

import torch

import nltk
nltk.download('punkt')

import importlib
import preprocessing_utils
importlib.reload(preprocessing_utils)

from sentence_transformers import SentenceTransformer

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.metrics.pairwise import cosine_similarity
from sklearn import preprocessing

from scipy import stats
import random

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    AutoConfig
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [6]:
# Load the parsed Quantlets.
# NOTE: pickle.load can execute arbitrary code, so only open files from a trusted source.
with open('../../data/preprocessed/Quantlet/Parsed_Qs_with_code_25062023.pkl', 'rb') as pickled_frame:
  df = pickle.load(pickled_frame)

CLEAN_UP = True

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
print(device)

cpu


In [7]:
# Configuration constants; none of them is used in the visible part of the notebook.
# NOTE(review): presumably RS is a random seed and N_NEIGHB / MIN_DIST are the UMAP
# `n_neighbors` / `min_dist` settings - confirm against the cells that consume them.
RS = 1
N_NEIGHB = 10
MIN_DIST = 0.25

In [8]:
# Keep only the rows whose metainfo is not the 'empty' placeholder.
has_metainfo = df['metainfo_file'] != 'empty'
df = df[has_metainfo]
print(df.shape)

(4856, 6)


In [9]:
def parse_meta(row):
    """Split one row's metainfo mapping into its main fields.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) whose ``'metainfo_file'`` entry is
            either the placeholder string ``'empty'`` or a mapping with string keys.

    Returns:
        A list ``[name, description, keywords, authors, other]``. Each of the first four
        items is the value stored under the first key that contains ``'name'``,
        ``'desc'``, ``'keyw'`` or ``'auth'`` respectively (case-insensitive), or ``''``
        when no key matches. ``other`` is a dict with every entry whose key was not
        consumed by one of those four fields. For the ``'empty'`` placeholder the result
        is ``['', '', '', '', {}]``, so the list always has five items.
    """
    meta = row['metainfo_file']
    if meta == 'empty':
        # Same length as the regular result so the caller can always expand to five columns.
        return ['', '', '', '', {}]

    keys = list(meta.keys())

    def first_key_containing(fragment):
        # First key (in the mapping's own order) whose lower-cased form contains `fragment`.
        return next((k for k in keys if fragment in k.lower()), None)

    fields = []
    used_keys = set()
    for fragment in ('name', 'desc', 'keyw', 'auth'):
        key = first_key_containing(fragment)
        if key is None:
            fields.append('')
        else:
            fields.append(meta[key])
            # Remember the KEY (not the value) so it can be left out of `other`.
            used_keys.add(key)

    other = {k: meta[k] for k in keys if k not in used_keys}
    return [*fields, other]

In [10]:
# Parse metainfo file
# Guarded by the 'Keywords' check so that re-running the cell does not expand twice.
if 'Keywords' not in df.columns:
  # One row of [name, description, keywords, authors, other] per Quantlet.
  parsed_fields = df.apply(parse_meta, axis='columns', result_type='expand')

  meta_info = pd.DataFrame(columns=['Quantlet', 'Description', 'Keywords', 'Other'])
  meta_info[['Quantlet', 'Description', 'Keywords', 'Authors', 'Other']] = parsed_fields
  # Store every parsed field as text.
  meta_info = meta_info.astype(str)

  df = pd.concat([df, meta_info], axis=1)

  # Drop the raw metainfo and the columns that are not needed further on.
  for obsolete_col in ('metainfo_file', 'Other', 'script_name', 'script_name_no_ext'):
    del df[obsolete_col]

In [11]:
# Number of distinct Quantlet names.
df['Quantlet'].nunique()

2402

In [12]:
# A row holds several scripts when at least one element of `code_script` is itself a list.
df['multiple_scripts'] = df['code_script'].apply(
    lambda scripts: any(isinstance(part, list) for part in scripts)
)
has_many = df['multiple_scripts'] == True

# `code_script_joined` always ends up as a list of strings: one string per script.
df['code_script_joined'] = ''
df.loc[has_many, 'code_script_joined'] = df.loc[has_many, 'code_script'].apply(
    lambda scripts: [''.join(lines) for lines in scripts]
)
df.loc[~has_many, 'code_script_joined'] = df.loc[~has_many, 'code_script'].apply(
    lambda lines: [''.join(lines)]
)

# Number of scripts per row.
df['scr_n'] = df['code_script_joined'].apply(len)

In [13]:
# Create one dataset, each script separate, with the same description and keywords
# NOTE(review): this `df_long` is overwritten by `df_long = explode_code_and_lang(df)` in the
# next cell before it is used, so the explode below has no lasting effect.
df_long = df.explode('code_script_joined')
# Create one dataset, one metainfo file, all scripts together
# TODO: no code follows this comment in this cell.


In [14]:
def explode_code_and_lang(df):
    """Return a new frame with one row per script.

    Rows whose ``multiple_scripts`` flag is true are repeated once per entry of
    ``code_script``: entry ``i`` is stored in ``main_script`` and ``type_script[i]`` in
    ``main_type_script``. Every other row is kept once, with ``main_script`` and
    ``main_type_script`` taken from its ``code_script`` and ``type_script``. All other
    columns are carried over unchanged and the result has a fresh 0..n-1 index.

    The rows are collected in a list and converted to a DataFrame in one step, because
    ``DataFrame.append`` was removed in pandas 2.0 and appending row by row copies the
    whole frame on every iteration.

    Args:
        df: Frame with at least the columns ``multiple_scripts``, ``code_script`` and
            ``type_script``.

    Returns:
        The exploded DataFrame (empty, but with all expected columns, for empty input).
    """
    print(f'Shape before exploding scripts: {df.shape}')

    records = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        base = row.to_dict()
        if row['multiple_scripts'] == True:  # explicit comparison: a NaN flag counts as False
            for i, script in enumerate(row['code_script']):
                records.append({
                    **base,
                    'main_script': script,
                    'main_type_script': row['type_script'][i],
                })
        else:
            records.append({
                **base,
                'main_script': row['code_script'],
                'main_type_script': row['type_script'],
            })

    # Fix the column layout up front so that an empty input still yields the new columns.
    added = [c for c in ('main_script', 'main_type_script') if c not in df.columns]
    new_df = pd.DataFrame(records, columns=[*df.columns, *added])

    print(f'Shape after exploding scripts: {new_df.shape}')
    return new_df

df_long = explode_code_and_lang(df)

# Collapse each script into a single string; scripts without content become NaN.
df_long['code_script'] = df_long['main_script'].progress_apply(
    lambda lines: np.nan if len(lines) == 0 else ''.join(lines)
)
df_long['type_script'] = df_long['main_type_script']

# Remove the helper columns that are no longer needed.
for helper_col in ('main_type_script', 'main_script', 'code_script_joined'):
  del df_long[helper_col]

Shape before exploding scripts: (4856, 10)


4856it [00:20, 240.47it/s]


Shape after exploding scripts: (6743, 12)


100%|██████████| 6743/6743 [00:00<00:00, 73490.21it/s]


In [15]:
# Load the pretrained sentence-transformers model with this name.
# NOTE(review): `model_name` is reassigned further down for the tokenizer, so after that
# cell it no longer names the model held in `model`.
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [16]:
# Keep one row per (folder, script text) pair and renumber the rows.
df_deduplicated = (
    df_long
    .drop_duplicates(subset=['folder_name', 'code_script'])
    .reset_index(drop=True)
)

# Reduce `folder_name` to the path component that follows data/QuantLet/.
# The first pattern handles folders nested below that component, the second handles
# paths that end right at it.
nested_repo_path = re.compile(r'.+(data\/QuantLet\/)([^\/]+)(\/).+')
bare_repo_path = re.compile(r'.+(data\/QuantLet\/)([^\/]+)')
df_deduplicated['repo'] = df_deduplicated['folder_name'].apply(
    lambda folder: bare_repo_path.sub(r'\2', nested_repo_path.sub(r'\2', folder))
)
print(df_deduplicated.shape)


(4857, 10)


In [17]:
# Subset of scripts whose type is 'py'.
is_python = df_deduplicated['type_script'] == 'py'
df_python = df_deduplicated[is_python]
print(df_python.shape)

(825, 10)


In [19]:
# Number of distinct Quantlet names left after deduplication.
df_deduplicated['Quantlet'].nunique()

2402

In [None]:
# Load the tokenizer that belongs to this checkpoint.
# NOTE(review): this overwrites the `model_name` set for the SentenceTransformer above;
# `model` itself is not reloaded here.
model_name = "sshleifer/distilbart-xsum-12-3"

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Token pattern: maximal runs of ASCII letters, digits and the lower-case umlauts ä, ü, ö.
# Every other character (including '_') separates tokens, so snake_case names are split.
pattern = re.compile(
    r'[a-zA-Z0-9äüö]+',
    flags=re.MULTILINE | re.DOTALL,
)

In [None]:
import builtins
import keyword

# Names that belong to Python itself: built-in names plus reserved keywords.
# The `builtins` module is used instead of `__builtins__`, which is an implementation
# detail: it is the module in `__main__` but a plain dict in imported modules, where
# dir() would list dict methods instead of the built-in names.
python_names = set(dir(builtins)) | set(keyword.kwlist)

# NOTE(review): only the script stored under index label 1 is inspected here.
tokens = re.findall(pattern, df_deduplicated.loc[1, 'code_script'])

# Tokens of that script that are Python built-ins or keywords, in order of appearance
# (repeated tokens are kept).
build_in = [token for token in tokens if token in python_names]

In [None]:
import inspect
import pandas

library_module = pandas  # Replace with the actual library module

# Tokens that name a Python-level function exposed directly on `library_module`, in order
# of appearance (repeated tokens are kept). Names that are missing on the module resolve
# to None and are skipped; classes and other non-function attributes are skipped as well.
ds = [
    token for token in tokens
    if inspect.isfunction(getattr(library_module, token, None))
]

In [None]:
# Show the collected function-name tokens (one entry per occurrence in the script).
ds

['merge', 'merge', 'test', 'test', 'test', 'test', 'test', 'merge']