In [12]:
import os
import math
import json
import base64
import subprocess
import pandas as pd
from dotenv import load_dotenv
from flask import Flask, request, jsonify
from flask_cors import CORS
from interpreter import interpreter
from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer
from langchain_openai import OpenAIEmbeddings

# Load variables from a local .env file into the process environment first,
# so the API key lookup below can see them.
load_dotenv()

# OpenAI-backed embedding client. The key comes from the environment and is
# None when OPENAI_API_KEY is not set.
embeddings = OpenAIEmbeddings(
    openai_api_key=os.environ.get('OPENAI_API_KEY'),
)

# Sentence-transformer checkpoint "allenai-specter", explicitly placed on the CPU.
model = SentenceTransformer(
    'sentence-transformers/allenai-specter',
    device='cpu',
)



In [2]:
from build_embeddings import build_embeddings, get_file_embeddings
from search import get_embeddings_for_text, query_top_files_specter, get_total_files, query_top_files, get_common_files_with_avg_score, get_unique_files

In [3]:
def decode_json_object(array):
    """Turn a base64-encoded repository listing into plain source text.

    Parameters
    ----------
    array : dict
        Maps each file name to a metadata dict whose ``'content'`` entry
        holds the file body as a base64 string.

    Returns
    -------
    dict
        ``{file name: decoded text}`` for every entry whose name ends with
        one of the suffixes listed below (the match is case-sensitive).
        Entries with any other name are ignored without being inspected.

    Invalid base64, bytes that are not valid UTF-8, or a selected entry
    without ``'content'`` are not handled here and propagate as exceptions.
    """
    wanted_suffixes = ('.py', '.sh', '.java', '.php', '.js', '.htm', '.html', '.vue')
    return {
        name: base64.b64decode(meta['content']).decode("utf-8")
        for name, meta in array.items()
        if name.endswith(wanted_suffixes)
    }

In [4]:
# Identifier of the sample repository used in this notebook. It is passed to
# background_code_matching(), which converts it to a string and hands it to
# the embedding build/query helpers.
repo_id = 323
encoded_repo = {
  "index.php": {
    "type": "blob",
    "encode": "base64",
    "content": "PD9waHAKcmVxdWlyZSBfX0RJUl9fIC4gJy8uLi92ZW5kb3IvYXV0b2xvYWQucGhwJzsKCnVzZSBEb3RlbnZcRG90ZW52OwoKLy8gTG9hZCBlbnZpcm9ubWVudCB2YXJpYWJsZXMKJGRvdGVudiA9IERvdGVudjo6Y3JlYXRlSW1tdXRhYmxlKF9fRElSX18gLiAnLy4uLycpOwokZG90ZW52LT5sb2FkKCk7CgovLyBJbmNsdWRlIGFwcGxpY2F0aW9uIGxvZ2ljCnJlcXVpcmUgX19ESVJfXyAuICcvLi4vc3JjL0FwcC5waHAnOwoKJGFwcCA9IG5ldyBBcHAoKTsKJGFwcC0+cnVuKCk7"
  },
  ".env.example": {
    "type": "blob",
    "encode": "base64",
    "content": "QVBQX0VOVj1kZXZlbG9wbWVudApEQl9IT1NUPWxvY2FsaG9zdApEQl9VU0VSPXJvb3QKREJfUEFTUz0KREJfTkFNRT0="
  },
  "app.php": {
    "type": "blob",
    "encode": "base64",
    "content": "PD9waHAKCmNsYXNzIEFwcAp7CiAgICBwcml2YXRlICRkYjsKCiAgICAvLyBDb25maWd1cmF0aW9uIHNldHRpbmdzCiAgICBwcml2YXRlICRjb25maWcgPSBbCiAgICAgICAgJ2VuY3J5cHRpb24nID0+IFsKICAgICAgICAgICAgJ2FsZ29yaXRobScgPT4gJ0FFUy0yNTYtQ0JDJywKICAgICAgICAgICAgJ2tleScgPT4gJ3lvdXItZW5jcnlwdGlvbi1rZXknLCAvLyBDaGFuZ2UgdG8geW91ciBhY3R1YWwgZW5jcnlwdGlvbiBrZXkgKDMyIGJ5dGVzIGZvciBBRVMtMjU2KQogICAgICAgICAgICAnaXZfbGVuZ3RoJyA9PiAxNgogICAgICAgIF0sCiAgICAgICAgJ2hhc2hpbmcnID0+IFsKICAgICAgICAgICAgJ2FsZ29yaXRobScgPT4gUEFTU1dPUkRfREVGQVVMVCwKICAgICAgICAgICAgJ29wdGlvbnMnID0+IFsnY29zdCcgPT4gMTJdCiAgICAgICAgXSwKICAgICAgICAncmFuZG9tJyA9PiBbCiAgICAgICAgICAgICdndWlkX2xlbmd0aCcgPT4gMTYsCiAgICAgICAgICAgICdmaWxlX25hbWVfbGVuZ3RoJyA9PiAxNiwKICAgICAgICAgICAgJ3N0cmluZ19sZW5ndGgnID0+IDE2CiAgICAgICAgXQogICAgXTsKCiAgICBwdWJsaWMgZnVuY3Rpb24gX19jb25zdHJ1Y3QoKQogICAgewogICAgICAgICR0aGlzLT5jb25uZWN0VG9EYXRhYmFzZSgpOwogICAgfQoKICAgIHByaXZhdGUgZnVuY3Rpb24gY29ubmVjdFRvRGF0YWJhc2UoKQogICAgewogICAgICAgICRob3N0ID0gZ2V0ZW52KCdEQl9IT1NUJyk7CiAgICAgICAgJGRiICAgPSBnZXRlbnYoJ0RCX05BTUUnKTsKICAgICAgICAkdXNlciA9IGdldGVudignREJfVVNFUicpOwogICAgICAgICRwYXNzID0gZ2V0ZW52KCdEQl9QQVNTJyk7CiAgICAgICAgJGNoYXJzZXQgPSAndXRmOG1iNCc7CgogICAgICAgICRkc24gPSAibXlzcWw6aG9zdD0kaG9zdDtkYm5hbWU9JGRiO2NoYXJzZXQ9JGNoYXJzZXQiOwogICAgICAgICRvcHRpb25zID0gWwogICAgICAgICAgICBQRE86OkFUVFJfRVJSTU9ERSAgICAgICAgICAgID0+IFBETzo6RVJSTU9ERV9FWENFUFRJT04sCiAgICAgICAgICAgIFBETzo6QVRUUl9ERUZBVUxUX0ZFVENIX01PREUgPT4gUERPOjpGRVRDSF9BU1NPQywKICAgICAgICAgICAgUERPOjpBVFRSX0VNVUxBVEVfUFJFUEFSRVMgICA9PiBmYWxzZSwKICAgICAgICBdOwoKICAgICAgICB0cnkgewogICAgICAgICAgICAkdGhpcy0+ZGIgPSBuZXcgUERPKCRkc24sICR1c2VyLCAkcGFzcywgJG9wdGlvbnMpOwogICAgICAgIH0gY2F0Y2ggKFBET0V4Y2VwdGlvbiAkZSkgewogICAgICAgICAgICBlcnJvcl9sb2coJGUtPmdldE1lc3NhZ2UoKSk7CiAgICAgICAgICAgIGlmIChnZXRlbnYoJ0FQUF9FTlYnKSA9PT0gJ2RldmVsb3BtZW50JykgewogICAgICAgICAgICAgICAgdGhyb3cgbmV3IFBET0V4Y2VwdGlvbigkZS0+Z2V0TWVzc2FnZSgpLCAoaW50KSRlLT5nZXRDb2RlKCkpOwogICAgICAgICAgICB9IGVsc2UgewogICAgICAgICAgICAgICAgdGhyb3cgbmV3IEV4Y2Vw
dGlvbignRGF0YWJhc2UgY29ubmVjdGlvbiBmYWlsZWQuJyk7CiAgICAgICAgICAgIH0KICAgICAgICB9CiAgICB9CgogICAgcHVibGljIGZ1bmN0aW9uIHJ1bigpCiAgICB7CiAgICAgICAgLy8gQXBwbGljYXRpb24gbG9naWMgaGVyZQogICAgICAgIGVjaG8gIkFwcGxpY2F0aW9uIGlzIHJ1bm5pbmcuIjsKCiAgICAgICAgJHVzZXIgPSAkdGhpcy0+Z2V0VXNlckJ5SWQoMSk7CiAgICAgICAgZWNobyBodG1sc3BlY2lhbGNoYXJzKGpzb25fZW5jb2RlKCR1c2VyKSwgRU5UX1FVT1RFUywgJ1VURi04Jyk7CiAgICB9CgogICAgcHVibGljIGZ1bmN0aW9uIGdldFVzZXJCeUlkKCRpZCkKICAgIHsKICAgICAgICBpZiAoIWlzX251bWVyaWMoJGlkKSkgewogICAgICAgICAgICB0aHJvdyBuZXcgSW52YWxpZEFyZ3VtZW50RXhjZXB0aW9uKCdJbnZhbGlkIHVzZXIgSUQuJyk7CiAgICAgICAgfQoKICAgICAgICAkc3RtdCA9ICR0aGlzLT5kYi0+cHJlcGFyZSgnU0VMRUNUICogRlJPTSB1c2VycyBXSEVSRSBpZCA9IDppZCcpOwogICAgICAgICRzdG10LT5leGVjdXRlKFsnaWQnID0+ICRpZF0pOwogICAgICAgIHJldHVybiAkc3RtdC0+ZmV0Y2goKTsKICAgIH0KCi8vIEZ1bmN0aW9uIHRvIHZhbGlkYXRlIGFuZCBzYW5pdGl6ZSBpbnB1dAogICAgZnVuY3Rpb24gc2FuaXRpemVJbnB1dCgkZGF0YSkgewogICAgICAgIHJldHVybiBodG1sc3BlY2lhbGNoYXJzKHN0cmlwc2xhc2hlcyh0cmltKCRkYXRhKSkpOwogICAgfQoKICAgIC8vIEZ1bmN0aW9uIHRvIHZhbGlkYXRlIHBhc3N3b3JkCiAgICBmdW5jdGlvbiB2YWxpZGF0ZVBhc3N3b3JkKCRwYXNzd29yZCkgewogICAgICAgIC8vIFJlbW92ZSBtdWx0aXBsZSBzcGFjZXMKICAgICAgICAkcGFzc3dvcmQgPSBwcmVnX3JlcGxhY2UoJy9ccysvJywgJyAnLCAkcGFzc3dvcmQpOwogICAgICAgIC8vIENoZWNrIGlmIHBhc3N3b3JkIGxlbmd0aCBpcyBhdCBsZWFzdCAxMiBjaGFyYWN0ZXJzCiAgICAgICAgcmV0dXJuIHN0cmxlbigkcGFzc3dvcmQpID49IDEyOwogICAgfQoKLy8gRnVuY3Rpb24gdG8gZ2VuZXJhdGUgYSBzZWN1cmUgcmFuZG9tIEdVSUQKICAgIGZ1bmN0aW9uIGdlbmVyYXRlR3VpZCgkbGVuZ3RoKSB7CiAgICAgICAgJGRhdGEgPSByYW5kb21fYnl0ZXMoJGxlbmd0aCk7CiAgICAgICAgYXNzZXJ0KHN0cmxlbigkZGF0YSkgPT0gJGxlbmd0aCk7CgogICAgICAgIC8vIFNldCB2ZXJzaW9uIHRvIDAxMDAKICAgICAgICAkZGF0YVs2XSA9IGNocihvcmQoJGRhdGFbNl0pICYgMHgwZiB8IDB4NDApOwogICAgICAgIC8vIFNldCBiaXRzIDYtNyB0byAxMAogICAgICAgICRkYXRhWzhdID0gY2hyKG9yZCgkZGF0YVs4XSkgJiAweDNmIHwgMHg4MCk7CgogICAgICAgIHJldHVybiB2c3ByaW50ZignJXMlcy0lcy0lcy0lcy0lcyVzJXMnLCBzdHJfc3BsaXQoYmluMmhleCgkZGF0YSksIDQpKTsKICAgIH0KCi8vIEZ1bmN0aW9uIHRvIGdlbmVyYXRlIGEgc2VjdXJlIHJhbmRvbSBmaWxlIG5hbWUKICAgIGZ1
bmN0aW9uIGdlbmVyYXRlUmFuZG9tRmlsZU5hbWUoJGxlbmd0aCkgewogICAgICAgIHJldHVybiBiaW4yaGV4KHJhbmRvbV9ieXRlcygkbGVuZ3RoIC8gMikpOwogICAgfQoKLy8gRnVuY3Rpb24gdG8gZ2VuZXJhdGUgYSBzZWN1cmUgcmFuZG9tIHN0cmluZwogICAgZnVuY3Rpb24gZ2VuZXJhdGVSYW5kb21TdHJpbmcoJGxlbmd0aCkgewogICAgICAgIHJldHVybiBiaW4yaGV4KHJhbmRvbV9ieXRlcygkbGVuZ3RoIC8gMikpOwogICAgfQoKLy8gRnVuY3Rpb24gdG8gZW5jcnlwdCBkYXRhCiAgICBmdW5jdGlvbiBlbmNyeXB0RGF0YSgkZGF0YSwgJGNvbmZpZykgewogICAgICAgICRpdiA9IHJhbmRvbV9ieXRlcygkY29uZmlnWydlbmNyeXB0aW9uJ11bJ2l2X2xlbmd0aCddKTsgLy8gR2VuZXJhdGUgYSBzZWN1cmUgcmFuZG9tIElWCiAgICAgICAgJGVuY3J5cHRlZERhdGEgPSBvcGVuc3NsX2VuY3J5cHQoJGRhdGEsICRjb25maWdbJ2VuY3J5cHRpb24nXVsnYWxnb3JpdGhtJ10sICRjb25maWdbJ2VuY3J5cHRpb24nXVsna2V5J10sIDAsICRpdik7CiAgICAgICAgcmV0dXJuIGJhc2U2NF9lbmNvZGUoJGVuY3J5cHRlZERhdGEgLiAnOjonIC4gJGl2KTsKICAgIH0KCi8vIEZ1bmN0aW9uIHRvIGRlY3J5cHQgZGF0YQogICAgZnVuY3Rpb24gZGVjcnlwdERhdGEoJGRhdGEsICRjb25maWcpIHsKICAgICAgICBsaXN0KCRlbmNyeXB0ZWREYXRhLCAkaXYpID0gZXhwbG9kZSgnOjonLCBiYXNlNjRfZGVjb2RlKCRkYXRhKSwgMik7CiAgICAgICAgcmV0dXJuIG9wZW5zc2xfZGVjcnlwdCgkZW5jcnlwdGVkRGF0YSwgJGNvbmZpZ1snZW5jcnlwdGlvbiddWydhbGdvcml0aG0nXSwgJGNvbmZpZ1snZW5jcnlwdGlvbiddWydrZXknXSwgMCwgJGl2KTsKICAgIH0KCi8vIEZ1bmN0aW9uIHRvIHNlbmQgZW1haWwgbm90aWZpY2F0aW9uCiAgICBmdW5jdGlvbiBzZW5kRW1haWxOb3RpZmljYXRpb24oJHRvLCAkc3ViamVjdCwgJGJvZHkpIHsKICAgICAgICBnbG9iYWwgJGVtYWlsSG9zdCwgJGVtYWlsVXNlcm5hbWUsICRlbWFpbFBhc3N3b3JkLCAkZW1haWxGcm9tLCAkZW1haWxGcm9tTmFtZTsKCiAgICAgICAgJG1haWwgPSBuZXcgUEhQTWFpbGVyKHRydWUpOwogICAgICAgIHRyeSB7CiAgICAgICAgICAgIC8vIFNlcnZlciBzZXR0aW5ncwogICAgICAgICAgICAkbWFpbC0+aXNTTVRQKCk7CiAgICAgICAgICAgICRtYWlsLT5Ib3N0ID0gJGVtYWlsSG9zdDsKICAgICAgICAgICAgJG1haWwtPlNNVFBBdXRoID0gdHJ1ZTsKICAgICAgICAgICAgJG1haWwtPlVzZXJuYW1lID0gJGVtYWlsVXNlcm5hbWU7CiAgICAgICAgICAgICRtYWlsLT5QYXNzd29yZCA9ICRlbWFpbFBhc3N3b3JkOwogICAgICAgICAgICAkbWFpbC0+U01UUFNlY3VyZSA9IFBIUE1haWxlcjo6RU5DUllQVElPTl9TVEFSVFRMUzsKICAgICAgICAgICAgJG1haWwtPlBvcnQgPSA1ODc7CgogICAgICAgICAgICAvLyBSZWNpcGllbnRzCiAgICAgICAgICAgICRtYWlsLT5zZXRGcm9tKCRlbWFpbEZy
b20sICRlbWFpbEZyb21OYW1lKTsKICAgICAgICAgICAgJG1haWwtPmFkZEFkZHJlc3MoJHRvKTsKCiAgICAgICAgICAgIC8vIENvbnRlbnQKICAgICAgICAgICAgJG1haWwtPmlzSFRNTCh0cnVlKTsKICAgICAgICAgICAgJG1haWwtPlN1YmplY3QgPSAkc3ViamVjdDsKICAgICAgICAgICAgJG1haWwtPkJvZHkgICAgPSAkYm9keTsKCiAgICAgICAgICAgICRtYWlsLT5zZW5kKCk7CiAgICAgICAgICAgIHJldHVybiB0cnVlOwogICAgICAgIH0gY2F0Y2ggKEV4Y2VwdGlvbiAkZSkgewogICAgICAgICAgICByZXR1cm4gZmFsc2U7CiAgICAgICAgfQogICAgfQoKLy8gRnVuY3Rpb24gdG8gaGFuZGxlIHVzZXIgc2lnbnVwCiAgICBmdW5jdGlvbiBzaWdudXAoJHVzZXJuYW1lLCAkcGFzc3dvcmQsICRlbWFpbCwgJGNvbm4sICRjb25maWcpIHsKICAgICAgICAvLyBTYW5pdGl6ZSBpbnB1dAogICAgICAgICR1c2VybmFtZSA9IHNhbml0aXplSW5wdXQoJHVzZXJuYW1lKTsKICAgICAgICAkcGFzc3dvcmQgPSBzYW5pdGl6ZUlucHV0KCRwYXNzd29yZCk7CiAgICAgICAgJGVtYWlsID0gc2FuaXRpemVJbnB1dCgkZW1haWwpOwoKICAgICAgICAvLyBWYWxpZGF0ZSBwYXNzd29yZAogICAgICAgIGlmICghdmFsaWRhdGVQYXNzd29yZCgkcGFzc3dvcmQpKSB7CiAgICAgICAgICAgIHJldHVybiAiUGFzc3dvcmQgbXVzdCBiZSBhdCBsZWFzdCAxMiBjaGFyYWN0ZXJzIGxvbmcgYWZ0ZXIgY29tYmluaW5nIG11bHRpcGxlIHNwYWNlcy4iOwogICAgICAgIH0KCiAgICAgICAgLy8gSGFzaCB0aGUgcGFzc3dvcmQgd2l0aCBhIHJhbmRvbWx5IGdlbmVyYXRlZCBzYWx0CiAgICAgICAgJGhhc2hlZFBhc3N3b3JkID0gcGFzc3dvcmRfaGFzaCgkcGFzc3dvcmQsICRjb25maWdbJ2hhc2hpbmcnXVsnYWxnb3JpdGhtJ10sICRjb25maWdbJ2hhc2hpbmcnXVsnb3B0aW9ucyddKTsKCiAgICAgICAgLy8gRW5jcnlwdCBlbWFpbAogICAgICAgICRlbmNyeXB0ZWRFbWFpbCA9IGVuY3J5cHREYXRhKCRlbWFpbCwgJGNvbmZpZyk7CgogICAgICAgIC8vIEdlbmVyYXRlIFRPVFAgc2VjcmV0CiAgICAgICAgJHRvdHAgPSBUT1RQOjpjcmVhdGUoKTsKICAgICAgICAkc2VjcmV0ID0gJHRvdHAtPmdldFNlY3JldCgpOwoKICAgICAgICAvLyBQcmVwYXJlIGFuZCBiaW5kCiAgICAgICAgJHN0bXQgPSAkY29ubi0+cHJlcGFyZSgiSU5TRVJUIElOVE8gdXNlcnMgKHVzZXJuYW1lLCBwYXNzd29yZCwgZW1haWwsIHRvdHBfc2VjcmV0KSBWQUxVRVMgKD8sID8sID8sID8pIik7CiAgICAgICAgJHN0bXQtPmJpbmRfcGFyYW0oInNzc3MiLCAkdXNlcm5hbWUsICRoYXNoZWRQYXNzd29yZCwgJGVuY3J5cHRlZEVtYWlsLCAkc2VjcmV0KTsKCiAgICAgICAgLy8gRXhlY3V0ZSB0aGUgc3RhdGVtZW50CiAgICAgICAgaWYgKCRzdG10LT5leGVjdXRlKCkpIHsKICAgICAgICAgICAgLy8gU2VuZCBub3RpZmljYXRpb24gZW1haWwKICAgICAgICAgICAgJHN1YmplY3QgPSAiU2lnbnVwIFN1Y2Nlc3Nm
dWwiOwogICAgICAgICAgICAkYm9keSA9ICJEZWFyICR1c2VybmFtZSw8YnI+PGJyPllvdXIgYWNjb3VudCBoYXMgYmVlbiBzdWNjZXNzZnVsbHkgY3JlYXRlZC48YnI+PGJyPlJlZ2FyZHMsPGJyPllvdXIgQXBwIE5hbWUiOwogICAgICAgICAgICBzZW5kRW1haWxOb3RpZmljYXRpb24oJGVtYWlsLCAkc3ViamVjdCwgJGJvZHkpOwoKICAgICAgICAgICAgLy8gRGlzcGxheSBRUiBjb2RlIGZvciBUT1RQCiAgICAgICAgICAgICRxckNvZGVVcmwgPSAkdG90cC0+Z2V0UHJvdmlzaW9uaW5nVXJpKCk7CiAgICAgICAgICAgIGVjaG8gIjxwPlNjYW4gdGhpcyBRUiBjb2RlIHdpdGggeW91ciBhdXRoZW50aWNhdG9yIGFwcDo8L3A+IjsKICAgICAgICAgICAgZWNobyAiPGltZyBzcmM9J2h0dHBzOi8vYXBpLnFyc2VydmVyLmNvbS92MS9jcmVhdGUtcXItY29kZS8\/ZGF0YT0iIC4gdXJsZW5jb2RlKCRxckNvZGVVcmwpIC4gIic+IjsKCiAgICAgICAgICAgIHJldHVybiAiU2lnbnVwIHN1Y2Nlc3NmdWwhIjsKICAgICAgICB9IGVsc2UgewogICAgICAgICAgICByZXR1cm4gIkVycm9yOiAiIC4gJHN0bXQtPmVycm9yOwogICAgICAgIH0KCiAgICAgICAgLy8gQ2xvc2UgdGhlIHN0YXRlbWVudAogICAgICAgICRzdG10LT5jbG9zZSgpOwogICAgfQoKLy8gRnVuY3Rpb24gdG8gaGFuZGxlIHBhc3N3b3JkIGNoYW5nZQogICAgZnVuY3Rpb24gY2hhbmdlUGFzc3dvcmQoJHVzZXJuYW1lLCAkbmV3UGFzc3dvcmQsICRjb25uLCAkY29uZmlnKSB7CiAgICAgICAgLy8gU2FuaXRpemUgaW5wdXQKICAgICAgICAkdXNlcm5hbWUgPSBzYW5pdGl6ZUlucHV0KCR1c2VybmFtZSk7CiAgICAgICAgJG5ld1Bhc3N3b3JkID0gc2FuaXRpemVJbnB1dCgkbmV3UGFzc3dvcmQpOwoKICAgICAgICAvLyBWYWxpZGF0ZSBuZXcgcGFzc3dvcmQKICAgICAgICBpZiAoIXZhbGlkYXRlUGFzc3dvcmQoJG5ld1Bhc3N3b3JkKSkgewogICAgICAgICAgICByZXR1cm4gIk5ldyBwYXNzd29yZCBtdXN0IGJlIGF0IGxlYXN0IDEyIGNoYXJhY3RlcnMgbG9uZyBhZnRlciBjb21iaW5pbmcgbXVsdGlwbGUgc3BhY2VzLiI7CiAgICAgICAgfQoKICAgICAgICAvLyBIYXNoIHRoZSBuZXcgcGFzc3dvcmQgd2l0aCBhIHJhbmRvbWx5IGdlbmVyYXRlZCBzYWx0CiAgICAgICAgJGhhc2hlZE5ld1Bhc3N3b3JkID0gcGFzc3dvcmRfaGFzaCgkbmV3UGFzc3dvcmQsICRjb25maWdbJ2hhc2hpbmcnXVsnYWxnb3JpdGhtJ10sICRjb25maWdbJ2hhc2hpbmcnXVsnb3B0aW9ucyddKTsKCiAgICAgICAgLy8gUHJlcGFyZSBhbmQgYmluZAogICAgICAgICRzdG10ID0gJGNvbm4tPnByZXBhcmUoIlVQREFURSB1c2VycyBTRVQgcGFzc3dvcmQgPSA\/IFdIRVJFIHVzZXJuYW1lID0gPyIpOwogICAgICAgICRzdG10LT5iaW5kX3BhcmFtKCJzcyIsICRoYXNoZWROZXdQYXNzd29yZCwgJHVzZXJuYW1lKTsKCiAgICAgICAgLy8gRXhlY3V0ZSB0aGUgc3RhdGVtZW50CiAgICAgICAgaWYgKCRzdG10LT5leGVjdXRlKC
kpIHsKICAgICAgICAgICAgLy8gR2V0IHVzZXIncyBlbmNyeXB0ZWQgZW1haWwKICAgICAgICAgICAgJHN0bXQgPSAkY29ubi0+cHJlcGFyZSgiU0VMRUNUIGVtYWlsIEZST00gdXNlcnMgV0hFUkUgdXNlcm5hbWUgPSA\/Iik7CiAgICAgICAgICAgICRzdG10LT5iaW5kX3BhcmFtKCJzIiwgJHVzZXJuYW1lKTsKICAgICAgICAgICAgJHN0bXQtPmV4ZWN1dGUoKTsKICAgICAgICAgICAgJHN0bXQtPmJpbmRfcmVzdWx0KCRlbmNyeXB0ZWRFbWFpbCk7CiAgICAgICAgICAgICRzdG10LT5mZXRjaCgpOwogICAgICAgICAgICAkc3RtdC0+Y2xvc2UoKTsKCiAgICAgICAgICAgIC8vIERlY3J5cHQgZW1haWwKICAgICAgICAgICAgJGVtYWlsID0gZGVjcnlwdERhdGEoJGVuY3J5cHRlZEVtYWlsLCAkY29uZmlnKTsKCiAgICAgICAgICAgIC8vIFNlbmQgbm90aWZpY2F0aW9uIGVtYWlsCiAgICAgICAgICAgICRzdWJqZWN0ID0gIlBhc3N3b3JkIENoYW5nZWQgU3VjY2Vzc2Z1bGx5IjsKICAgICAgICAgICAgJGJvZHkgPSAiRGVhciAkdXNlcm5hbWUsPGJyPjxicj5Zb3VyIHBhc3N3b3JkIGhhcyBiZWVuIHN1Y2Nlc3NmdWxseSBjaGFuZ2VkLjxicj48YnI+UmVnYXJkcyw8YnI+WW91ciBBcHAgTmFtZSI7CiAgICAgICAgICAgIHNlbmRFbWFpbE5vdGlmaWNhdGlvbigkZW1haWwsICRzdWJqZWN0LCAkYm9keSk7CgogICAgICAgICAgICByZXR1cm4gIlBhc3N3b3JkIGNoYW5nZWQgc3VjY2Vzc2Z1bGx5ISI7CiAgICAgICAgfSBlbHNlIHsKICAgICAgICAgICAgcmV0dXJuICJFcnJvcjogIiAuICRzdG10LT5lcnJvcjsKICAgICAgICB9CgogICAgICAgIC8vIENsb3NlIHRoZSBzdGF0ZW1lbnQKICAgICAgICAkc3RtdC0+Y2xvc2UoKTsKICAgIH0KCi8vIEZ1bmN0aW9uIHRvIGhhbmRsZSB1bmtub3duIGxvZ2luIG5vdGlmaWNhdGlvbgogICAgZnVuY3Rpb24gbm90aWZ5VW5rbm93bkxvZ2luKCR1c2VybmFtZSwgJGNvbm4sICRjb25maWcpIHsKICAgICAgICAvLyBTYW5pdGl6ZSBpbnB1dAogICAgICAgICR1c2VybmFtZSA9IHNhbml0aXplSW5wdXQoJHVzZXJuYW1lKTsKCiAgICAgICAgLy8gR2V0IHVzZXIncyBlbmNyeXB0ZWQgZW1haWwKICAgICAgICAkc3RtdCA9ICRjb25uLT5wcmVwYXJlKCJTRUxFQ1QgZW1haWwgRlJPTSB1c2VycyBXSEVSRSB1c2VybmFtZSA9ID8iKTsKICAgICAgICAkc3RtdC0+YmluZF9wYXJhbSgicyIsICR1c2VybmFtZSk7CiAgICAgICAgJHN0bXQtPmV4ZWN1dGUoKTsKICAgICAgICAkc3RtdC0+YmluZF9yZXN1bHQoJGVuY3J5cHRlZEVtYWlsKTsKICAgICAgICAkc3RtdC0+ZmV0Y2goKTsKICAgICAgICAkc3RtdC0+Y2xvc2UoKTsKCiAgICAgICAgLy8gRGVjcnlwdCBlbWFpbAogICAgICAgICRlbWFpbCA9IGRlY3J5cHREYXRhKCRlbmNyeXB0ZWRFbWFpbCwgJGNvbmZpZyk7CgogICAgICAgIC8vIFNlbmQgbm90aWZpY2F0aW9uIGVtYWlsCiAgICAgICAgJHN1YmplY3QgPSAiVW5rbm93biBMb2dpbiBBdHRlbXB0IjsKICAgICAgI
CAkYm9keSA9ICJEZWFyICR1c2VybmFtZSw8YnI+PGJyPldlIGRldGVjdGVkIGEgbG9naW4gYXR0ZW1wdCBmcm9tIGFuIHVua25vd24gbG9jYXRpb24uPGJyPjxicj5SZWdhcmRzLDxicj5Zb3VyIEFwcCBOYW1lIjsKICAgICAgICBzZW5kRW1haWxOb3RpZmljYXRpb24oJGVtYWlsLCAkc3ViamVjdCwgJGJvZHkpOwoKICAgICAgICByZXR1cm4gIlVua25vd24gbG9naW4gbm90aWZpY2F0aW9uIHNlbnQhIjsKICAgIH0KCi8vIEZ1bmN0aW9uIHRvIHZlcmlmeSBUT1RQIGNvZGUKICAgIGZ1bmN0aW9uIHZlcmlmeVRvdHAoJHVzZXJuYW1lLCAkdG90cENvZGUsICRjb25uKSB7CiAgICAgICAgLy8gR2V0IHVzZXIncyBUT1RQIHNlY3JldAogICAgICAgICRzdG10ID0gJGNvbm4tPnByZXBhcmUoIlNFTEVDVCB0b3RwX3NlY3JldCBGUk9NIHVzZXJzIFdIRVJFIHVzZXJuYW1lID0gPyIpOwogICAgICAgICRzdG10LT5iaW5kX3BhcmFtKCJzIiwgJHVzZXJuYW1lKTsKICAgICAgICAkc3RtdC0+ZXhlY3V0ZSgpOwogICAgICAgICRzdG10LT5iaW5kX3Jlc3VsdCgkc2VjcmV0KTsKICAgICAgICAkc3RtdC0+ZmV0Y2goKTsKICAgICAgICAkc3RtdC0+Y2xvc2UoKTsKCiAgICAgICAgLy8gVmVyaWZ5IHRoZSBUT1RQIGNvZGUKICAgICAgICAkdG90cCA9IFRPVFA6OmNyZWF0ZSgkc2VjcmV0KTsKICAgICAgICByZXR1cm4gJHRvdHAtPnZlcmlmeSgkdG90cENvZGUpOwogICAgfQoKLy8gRnVuY3Rpb24gdG8gaGFuZGxlIHVzZXIgbG9naW4KICAgIGZ1bmN0aW9uIGxvZ2luKCR1c2VybmFtZSwgJHBhc3N3b3JkLCAkdG90cENvZGUsICRjb25uLCAkY29uZmlnKSB7CiAgICAgICAgLy8gU2FuaXRpemUgaW5wdXQKICAgICAgICAkdXNlcm5hbWUgPSBzYW5pdGl6ZUlucHV0KCR1c2VybmFtZSk7CiAgICAgICAgJHBhc3N3b3JkID0gc2FuaXRpemVJbnB1dCgkcGFzc3dvcmQpOwoKICAgICAgICAvLyBQcmVwYXJlIGFuZCBiaW5kCiAgICAgICAgJHN0bXQgPSAkY29ubi0+cHJlcGFyZSgiU0VMRUNUIHBhc3N3b3JkLCB0b3RwX3NlY3JldCBGUk9NIHVzZXJzIFdIRVJFIHVzZXJuYW1lID0gPyIpOwogICAgICAgICRzdG10LT5iaW5kX3BhcmFtKCJzIiwgJHVzZXJuYW1lKTsKICAgICAgICAkc3RtdC0+ZXhlY3V0ZSgpOwogICAgICAgICRzdG10LT5iaW5kX3Jlc3VsdCgkaGFzaGVkUGFzc3dvcmQsICRzZWNyZXQpOwogICAgICAgICRzdG10LT5mZXRjaCgpOwogICAgICAgICRzdG10LT5jbG9zZSgpOwoKICAgICAgICAvLyBWZXJpZnkgcGFzc3dvcmQKICAgICAgICBpZiAocGFzc3dvcmRfdmVyaWZ5KCRwYXNzd29yZCwgJGhhc2hlZFBhc3N3b3JkKSkgewogICAgICAgICAgICAvLyBWZXJpZnkgVE9UUCBjb2RlCiAgICAgICAgICAgICR0b3RwID0gVE9UUDo6Y3JlYXRlKCRzZWNyZXQpOwogICAgICAgICAgICBpZiAoJHRvdHAtPnZlcmlmeSgkdG90cENvZGUpKSB7CiAgICAgICAgICAgICAgICByZXR1cm4gIkxvZ2luIHN1Y2Nlc3NmdWwhIjsKICAgICAgICAgICAgfSBlbHNlI
HsKICAgICAgICAgICAgICAgIG5vdGlmeVVua25vd25Mb2dpbigkdXNlcm5hbWUsICRjb25uLCAkY29uZmlnKTsKICAgICAgICAgICAgICAgIHJldHVybiAiSW52YWxpZCBUT1RQIGNvZGUuIjsKICAgICAgICAgICAgfQogICAgICAgIH0gZWxzZSB7CiAgICAgICAgICAgIHJldHVybiAiSW52YWxpZCB1c2VybmFtZSBvciBwYXNzd29yZC4iOwogICAgICAgIH0KICAgIH0KfQ=="
  }
}
# Decode the base64 payload into {file name: source text}. Entries whose name
# does not end with one of decode_json_object's extensions (here
# ".env.example") are left out of the result.
repo_files = decode_json_object(encoded_repo)

In [5]:
# Echo the decoded mapping to check which files were kept and that their
# contents decoded to readable text.
repo_files

{'index.php': "<?php\nrequire __DIR__ . '/../vendor/autoload.php';\n\nuse Dotenv\\Dotenv;\n\n// Load environment variables\n$dotenv = Dotenv::createImmutable(__DIR__ . '/../');\n$dotenv->load();\n\n// Include application logic\nrequire __DIR__ . '/../src/App.php';\n\n$app = new App();\n$app->run();",
 'app.php': '<?php\n\nclass App\n{\n    private $db;\n\n    // Configuration settings\n    private $config = [\n        \'encryption\' => [\n            \'algorithm\' => \'AES-256-CBC\',\n            \'key\' => \'your-encryption-key\', // Change to your actual encryption key (32 bytes for AES-256)\n            \'iv_length\' => 16\n        ],\n        \'hashing\' => [\n            \'algorithm\' => PASSWORD_DEFAULT,\n            \'options\' => [\'cost\' => 12]\n        ],\n        \'random\' => [\n            \'guid_length\' => 16,\n            \'file_name_length\' => 16,\n            \'string_length\' => 16\n        ]\n    ];\n\n    public function __construct()\n    {\n        $this->conne

In [10]:
# Module-level results dict: background_code_matching() publishes the
# per-section results of its most recent run here.
section_result = {}

# Load the OWASP application-security controls and normalise the requirement
# text in one chain. Each step returns a new frame, so no column is ever
# assigned on a filtered slice (the pattern behind SettingWithCopyWarning).
owasp_df = (
    pd.read_csv('OWASP Controls - Application Security.csv')
    # A row without a description has no text to query with, and its NaN
    # would break the string mask below, so drop it up front.
    .dropna(subset=['req_description'])
    # Discard requirements whose description contains "[DELETED". The pattern
    # is a raw string so that `\[` is not parsed as an (invalid) Python escape.
    .loc[lambda df: ~df['req_description'].str.contains(r'\[DELETED')]
    # Strip parenthesised spans, together with any whitespace in front of
    # them. Only round brackets are targeted; square-bracketed text outside
    # parentheses is left in place.
    .assign(
        req_description=lambda df: df['req_description'].str.replace(
            r'\s*\([^)]*\)', '', regex=True
        )
    )
)
def background_code_matching(repo_files, repo_id):
    """Match a repository's files against every OWASP control section.

    Embeddings are built once for the repository. Then, for each distinct
    ``section_name`` in ``owasp_df``, the section's requirement descriptions
    are joined into one query string and run through both retrieval helpers
    (``query_top_files`` and ``query_top_files_specter``; presumably an
    OpenAI-embedding search and a SPECTER search, judging by the names).

    Args:
        repo_files: Repository contents, forwarded unchanged to
            ``build_embeddings``. In this notebook it is the
            ``{file name: source text}`` dict from ``decode_json_object``.
        repo_id: Repository identifier; converted to ``str`` before use.

    Returns:
        A new dict keyed by section name. Each value is a dict with
        ``'common_files'`` (output of ``get_common_files_with_avg_score``)
        and ``'only_one_model'`` (output of ``get_unique_files``).

    Side effects:
        The module-level ``section_result`` dict is emptied and refilled with
        the same entries, so cells that read it see the latest run. The
        returned dict is a separate object, so a result kept from an earlier
        call is no longer overwritten by a later one.

    Note:
        Job-id handling and an ``initiate_job`` hand-off existed here only as
        commented-out code; this function runs synchronously.
    """
    repo_id = str(repo_id)
    build_embeddings(repo_files, repo_id)

    # The index is built once above and repo_id does not change, so the file
    # count is looked up once instead of once per section.
    depth = get_total_files(repo_id)

    results = {}
    for section in owasp_df['section_name'].unique():
        in_section = owasp_df['section_name'] == section
        # One query per section: all of its requirement texts, space-joined.
        query = " ".join(owasp_df.loc[in_section, 'req_description'])

        results_ada = query_top_files(query, depth, repo_id)
        results_specter = query_top_files_specter(query, depth, repo_id)

        results[section] = {
            'common_files': get_common_files_with_avg_score(results_ada, results_specter),
            'only_one_model': get_unique_files(results_ada, results_specter),
        }

    # Keep the notebook-level dict in sync with this run, in place, so existing
    # references to it stay valid.
    section_result.clear()
    section_result.update(results)
    return results

In [11]:
# Run the matching for the sample repository. The per-section result dict
# returned by the call is shown as this cell's output.
background_code_matching(repo_files, repo_id)

{'Secure Software Development Lifecycle': {'common_files': [('app.php',
    0.6994010155041531),
   ('index.php', 0.7275098580825595)],
  'only_one_model': []},
 'Authentication Architecture': {'common_files': [('app.php',
    0.7562822886757133),
   ('index.php', 0.7647723871922311)],
  'only_one_model': []},
 'Access Control Architecture': {'common_files': [('index.php',
    0.7478174860591642),
   ('app.php', 0.7351924206020044)],
  'only_one_model': []},
 'Input and Output Architecture': {'common_files': [('index.php',
    0.757057858936053),
   ('app.php', 0.7263264164592933)],
  'only_one_model': []},
 'Cryptographic Architecture': {'common_files': [('app.php',
    0.742070070309879),
   ('index.php', 0.7435591548150995)],
  'only_one_model': []},
 'Errors, Logging and Auditing Architecture': {'common_files': [('index.php',
    0.7374242250223157),
   ('app.php', 0.7109302292420963)],
  'only_one_model': []},
 'Data Protection and Privacy Architecture': {'common_files': [('app.ph