In [None]:
!pip install -q -U google-generativeai pydriller

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.4/137.4 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.0/66.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import requests
from pydriller import Repository
import pathlib
import textwrap
import datetime
import google.generativeai as genai
from IPython.display import display
from IPython.display import Markdown

In [None]:
import requests
import time

class GitHubHandler:
    def __init__(self, api_keys):
        self.api_keys = api_keys
        self.current_key_index = 0
        self.rate_limit_wait_time = 60
        self.max_retries = 3
        self.retryable_exceptions = (
            requests.exceptions.RequestException,
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
            requests.exceptions.HTTPError,
        )
        self.reset_times = {}

    def _key_handler(self):
        if len(self.api_keys) == 0:
            print("No API keys available")
            return None

        key = self.api_keys[self.current_key_index]
        self.current_key_index = (self.current_key_index + 1) % len(self.api_keys)
        return key

    def get_rate_limit_info(response):
        rate_limit_info = {
            "limit": int(response.headers["X-RateLimit-Limit"]),
            "remaining": int(response.headers["X-RateLimit-Remaining"]),
            "reset": int(response.headers["X-RateLimit-Reset"]),
        }
        return rate_limit_info

    def get(self, url, headers=None, params=None):
        if headers is None:
            headers = {}

        for attempt in range(1, self.max_retries + 1):
            try:
                headers['Authorization'] = f"token {self._key_handler()}"
                response = requests.get(url, headers=headers, params=params)
                response.raise_for_status()

                rate_limit_info = self.get_rate_limit_info(response)
                print(f"Rate limit info: {rate_limit_info}")
                if rate_limit_info['remaining'] == 0:
                    print(f"Rate limit reached. Waiting for {self.rate_limit_wait_time} seconds...")
                    time.sleep(self.rate_limit_wait_time)
                    continue

                if "GitHub-Authentication-Token-Expiration" in response.headers:
                    expiration_time = int(response.headers["GitHub-Authentication-Token-Expiration"])
                    self.reset_times[self.api_keys[self.current_key_index - 1]] = expiration_time
                    print(f"Key expires at {expiration_time}")

                current_key = self.api_keys[self.current_key_index - 1]
                if current_key in self.reset_times and time.time() > self.reset_times[current_key]:
                    del self.api_keys[self.current_key_index - 1]
                    del self.reset_times[current_key]
                    print(f"Key {current_key} expired. Removing.")
                    continue

                return response
            except self.retryable_exceptions as e:
                if attempt == self.max_retries:
                    print(f"Attempt {attempt}/{self.max_retries}: {e.__class__.__name__}")
                    if response:
                        print(f"Error details: {response.text}")
                    raise
                else:
                    print(f"Attempt {attempt}/{self.max_retries}: {e.__class__.__name__} occurred. Retrying...")

        return None

# Replace with your actual URL and API keys
url = "https://api.github.com/some/endpoint"
api_keys = [""]

data = GitHubHandler(api_keys).get(url)
print(data)


Attempt 1/3: HTTPError occurred. Retrying...
Attempt 2/3: HTTPError occurred. Retrying...
Attempt 3/3: HTTPError


HTTPError: 404 Client Error: Not Found for url: https://api.github.com/some/endpoint

In [None]:
import requests
from pydriller import Repository

class GithubIssueFetcher:
    def __init__(self, api_key):
        self.api_key = api_key
        self.base_url = "https://api.github.com"

    def fetch_issues(self, owner, repo):
        url = f"{self.base_url}/repos/{owner}/{repo}/issues"
        headers = {"Authorization": f"token {self.api_key}"}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
        else:
            print(f"Failed to fetch issues: {response.status_code}")
            return None

    def fetch_commits(self, owner, repo):
        commits = []
        for commit in Repository(f"https://github.com/{owner}/{repo}").traverse_commits():
            commits.append(commit)
        return commits

# Example usage:
if __name__ == "__main__":
    api_key = ""
    owner = "lcompilers"
    repo = "lpython"

    fetcher = GithubIssueFetcher(api_key)
    issues = fetcher.fetch_issues(owner, repo)
    if issues:
        print("Issues:")
        for issue in issues:
            print(f"Issue #{issue['number']}: {issue['title']}")

    commits = fetcher.fetch_commits(owner, repo)
    print("\nCommits:")
    for commit in commits:
        print(f"Commit: {commit.hash} by {commit.author.name} - {commit.msg}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Commit: 2b3358c3c6814b5a51a69cf81a7cdb0599c43a81 by Thirumalai-Shaktivel - Recognize type_ignore within the dictionaries
Commit: 4526b5d76ca87b0c7a322daf0825851fdf2fbf94 by Thirumalai-Shaktivel - Add tests and update the refs.
Commit: 11531bfe91df4614d594d6cd440886f959d2f7a7 by Thirumalai Shaktivel - Merge pull request #915 from Thirumalai-Shaktivel/fix_dict
Commit: 46a4d007255f6f9824a1cfa05bce5f2a5ab4316c by Naman Gera - Merge pull request #912 from namannimmo10/ifexp

Implement `IfExp` visitor in the LLVM backend
Commit: f68c56c292f7b554712015edf7a1f58e63c37410 by Thirumalai Shaktivel - Merge pull request #913 from Thirumalai-Shaktivel/run_tests
Commit: 527ba7eb4799c706b4c5630075575fa579266d9e by Ondřej Čertík - Merge pull request #914 from Smit-create/i-883

Implement for loop with slice/subscript objects
Commit: c4b1224d150a910c3d7aa1f34556f220e7fef17a by Ondřej Čertík - Port ASR from LFortran
Commit: 7472e7c26741ad17

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
print(type(issues))

<class 'NoneType'>


In [None]:
# utils funcs
def json_dump(data, filename):
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)

## AIModel

In [None]:
import requests
import time
import json

class GitHubHandler:
    def __init__(self, api_keys):
        self.api_keys = api_keys
        self.current_key_index = 0
        self.rate_limit_wait_time = 60
        self.max_retries = 3
        self.retryable_exceptions = (
            requests.exceptions.RequestException,
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
            requests.exceptions.HTTPError,
        )
        self.reset_times = {}

    def _key_handler(self):
        if len(self.api_keys) == 0:
            print("No API keys available")
            return None

        key = self.api_keys[self.current_key_index]
        self.current_key_index = (self.current_key_index + 1) % len(self.api_keys)
        return key

    def get_rate_limit_info(self, response):
        rate_limit_info = {
            "limit": int(response.headers["X-RateLimit-Limit"]),
            "remaining": int(response.headers["X-RateLimit-Remaining"]),
            "reset": int(response.headers["X-RateLimit-Reset"]),
        }
        return rate_limit_info

    def get(self, url, headers=None, params=None):
        if headers is None:
            headers = {}

        for attempt in range(1, self.max_retries + 1):
            try:
                headers["Authorization"] = f"token {self._key_handler()}"
                response = requests.get(url, headers=headers, params=params)
                response.raise_for_status()

                rate_limit_info = self.get_rate_limit_info(response)
                print(f"Rate limit info: {rate_limit_info},key used: {headers['Authorization']}")
                if rate_limit_info["remaining"] == 0:
                    print(
                        f"Rate limit reached. Waiting for {self.rate_limit_wait_time} seconds..."
                    )
                    time.sleep(self.rate_limit_wait_time)
                    continue

                if "GitHub-Authentication-Token-Expiration" in response.headers:
                    expiration_time = int(
                        response.headers["GitHub-Authentication-Token-Expiration"]
                    )
                    self.reset_times[
                        self.api_keys[
                            (self.current_key_index - 1 + len(self.api_keys))
                            % len(self.api_keys)
                        ]
                    ] = expiration_time
                    print(f"Key expires at {expiration_time}")

                current_key = self.api_keys[self.current_key_index - 1]
                if (
                    current_key in self.reset_times
                    and time.time() > self.reset_times[current_key]
                ):
                    del self.api_keys[self.current_key_index - 1]
                    del self.reset_times[current_key]
                    print(f"Key {current_key} expired. Removing.")
                    continue

                print("headers",headers)
                # json.dump(response.json(), open("data1.json", "w"))
                return response.json()

            except self.retryable_exceptions as e:
                if attempt == self.max_retries:
                    print(
                        f"Attempt {attempt}/{self.max_retries}: {e.__class__.__name__}"
                    )
                    if response:
                        print(f"Error details: {response.text}")
                    # raise
                else:
                    print(
                        f"Attempt {attempt}/{self.max_retries}: {e.__class__.__name__} occurred. Retrying..."
                    )

        return None


# Replace with your actual URL and API keys
url = "https://api.github.com/repos/lcompilers/lpython/issues"
api_keys = []
gitobj = GitHubHandler(api_keys)
while True:
    data = gitobj.get(url)
    print(data)

Rate limit info: {'limit': 60, 'remaining': 12, 'reset': 1710436269},key used: token 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
headers {'Authorization': 'token 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'}
Rate limit info: {'limit': 60, 'remaining': 12, 'reset': 1710436270},key used: token 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
headers {'Authorization': 'token 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'}
Rate limit info: {'limit': 60, 'remaining': 12, 'reset': 1710436270},key used: token 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
headers {'Authorization': 'token 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'}
Rate limit info: {'limit': 60, 'remaining': 11, 'reset': 1710436269},key used: token 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
headers {'Authorization': 'token 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'}
Rate limit info: {'limit': 60, 'remaining': 11, 'reset': 1710436270},key used: token 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
headers {'Authorization': 'token ghp_13tq99SDiVSHslYfC9PhvXovUr9RG

KeyboardInterrupt: 

#Final Github Handler

In [None]:
import requests
from pydriller import Repository
import os

class GithubFetcher:
    def __init__(self, owner, repo, api_keys):
        self.api_keys = api_keys
        self.current_key_index = 0
        self.rate_limit_wait_time = 60
        self.max_retries = 3
        self.retryable_exceptions = (
            requests.exceptions.RequestException,
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
            requests.exceptions.HTTPError,
        )
        self.reset_times = {}
        self.base_url = "https://api.github.com"
        self.owner = owner
        self.repo = repo
        self.issues = []
        self.commits = []

    def _key_handler(self):
        if len(self.api_keys) == 0:
            print("No API keys available")
            return None

        key = self.api_keys[self.current_key_index]
        self.current_key_index = (self.current_key_index + 1) % len(self.api_keys)
        return key

    def get_rate_limit_info(self, response):
        rate_limit_info = {
            "limit": int(response.headers["X-RateLimit-Limit"]),
            "remaining": int(response.headers["X-RateLimit-Remaining"]),
            "reset": int(response.headers["X-RateLimit-Reset"]),
        }
        return rate_limit_info

    def get(self, url, headers=None, params=None):
        if headers is None:
            headers = {}

        for attempt in range(1, self.max_retries + 1):
            try:
                headers["Authorization"] = f"token {self._key_handler()}"
                response = requests.get(url, headers=headers, params=params)
                response.raise_for_status()

                rate_limit_info = self.get_rate_limit_info(response)
                print(f"Rate limit info: {rate_limit_info},key used: {headers['Authorization']}")
                if rate_limit_info["remaining"] == 0:
                    print(
                        f"Rate limit reached. Waiting for {self.rate_limit_wait_time} seconds..."
                    )
                    time.sleep(self.rate_limit_wait_time)
                    continue

                if "GitHub-Authentication-Token-Expiration" in response.headers:
                    expiration_time = int(
                        response.headers["GitHub-Authentication-Token-Expiration"]
                    )
                    self.reset_times[
                        self.api_keys[
                            (self.current_key_index - 1 + len(self.api_keys))
                            % len(self.api_keys)
                        ]
                    ] = expiration_time
                    print(f"Key expires at {expiration_time}")

                current_key = self.api_keys[self.current_key_index - 1]
                if (
                    current_key in self.reset_times
                    and time.time() > self.reset_times[current_key]
                ):
                    del self.api_keys[self.current_key_index - 1]
                    del self.reset_times[current_key]
                    print(f"Key {current_key} expired. Removing.")
                    continue

                print("headers",headers)
                # json.dump(response.json(), open("data1.json", "w"))
                return response.json()

            except self.retryable_exceptions as e:
                if attempt == self.max_retries:
                    print(
                        f"Attempt {attempt}/{self.max_retries}: {e.__class__.__name__}"
                    )
                    if response:
                        print(f"Error details: {response.text}")
                    # raise
                else:
                    print(
                        f"Attempt {attempt}/{self.max_retries}: {e.__class__.__name__} occurred. Retrying..."
                    )

        return None

    def get_issues(self,params):
        url = f"{self.base_url}/repos/{self.owner}/{self.repo}/issues"
        headers = {"Authorization": f"token {self._key_handler()}"}
        response = requests.get(url, headers=headers,params = params)
        if response.status_code == 200:
            self.issues = response.json()
            self.save_issues(self.issues)
            return self.issues
        else:
            print(f"Failed to fetch issues: {response.status_code}")
            return []

    def get_commits(self, from_date=None, to_date=None):
        ret = []
        if from_date is None:
            self.commits = list(Repository(f"https://github.com/{self.owner}/{self.repo}").traverse_commits())
            ret = self.commits
            self.save_commits(self.commits)
        else:
            ret = list(Repository(f"https://github.com/{self.owner}/{self.repo}", since=from_date, to=to_date).traverse_commits())
        return ret

    def save_issues(self, issues):
        folder_path = f"{self.owner}/{self.repo}/issues"
        os.makedirs(folder_path, exist_ok=True)
        with open(f"{folder_path}/issues.txt", "w") as f:
            for issue in issues:
                f.write(f"Issue #{issue['number']}: {issue['title']}\n")

    def save_commits(self, commits):
        folder_path = f"{self.owner}/{self.repo}/commits"
        os.makedirs(folder_path, exist_ok=True)
        with open(f"{folder_path}/commits.txt", "w") as f:
            for commit in commits:
                f.write(f"Commit: {commit.hash} by {commit.author.name} - {commit.msg}\n")



# Example usage:
if __name__ == "__main__":
    api_keys = [""]
    owner = "lcompilers"
    repo = "lpython"

    fetcher = GithubFetcher(owner, repo, api_keys)
    issues = fetcher.get_issues(None)
    if issues:
        print("Issues:")
        for issue in issues:
            print(f"Issue #{issue['number']}: {issue['title']}")

    commits = fetcher.get_commits()


Failed to fetch issues: 401


#Abstract AI model

In [None]:
from abc import ABC, abstractmethod
from IPython.display import display
from IPython.display import Markdown
import os
import google.generativeai as genai
def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))


class AIModel:
  @abstractmethod
  def train(self):
    pass

  @abstractmethod
  def prompt(self,text):
    pass



def save_text_to_file(file_path, text):
    """Saves text to a file, creating the directory if necessary."""

    os.makedirs(os.path.dirname(file_path), exist_ok=True)  # Create directory if needed

    with open(file_path, "w") as file:
        file.write(text)

# # Example usage:
# file_path = "path/to/your/file.txt"  # Replace with the desired file path
# text = "This is the text to be saved."

# save_text_to_file(file_path, text)

# print("Text saved successfully!")





class OpenAIModel(AIModel):
    def __init__(self,keys,model_type):
      self.keys = keys
      self.model_type = model_type
    def train(self):
      print('Nothing')
    def prompt(self,text):
      OPENAI_API_KEY0 = self.keys[0]
      client = OpenAI(api_key=OPENAI_API_KEY0)
      completion = client.chat.completions.create(
      model=self.model_type,
      messages=[
          {"role": "system", "content": text['prompt1']},
          {"role": "user", "content": text['prompt2']}
      ]
      )
      answer = completion.choices[0].message.content
      # save_text_to_file('/prompts.txt',text[prompt2].encode())
      # save_text_to_file('/outputs.txt',answer)
      return answer


class GeminiAIModel(AIModel):
  def __init__(self,keys,model_type,retention=True):
    self.keys = keys
    self.model_type = model_type
    self.retention = retention
    self.models=[]
    self.iter = 0
    for i in self.keys:
      genai.configure(api_key=i)
      model = genai.GenerativeModel(self.model_type)
      chat = model.start_chat(history=[]) if self.retention  else  model.generate_content()
      self.models.append(chat)

  def train(self,one_shot_prompt):
    index = 0
    for i in self.models:
      i.send_message(one_shot_prompt)
      print(index)
      index += 1

  def prompt(self,text):
    response = self.models[self.iter % len(self.models)].send_message(text)
    self.iter += 1
    # save_text_to_file('/prompts.txt',text.encode())
    # save_text_to_file('/outputs.txt',response.text)
    return response.text

#One PipeLine

In [None]:
from datetime import datetime
import requests
import json

In [None]:
GitHubHandler = GithubFetcher("shosetsuorg","shosetsu", [""])
issues = GitHubHandler.get_issues({"state":"closed", "per_page":"100"})
print(issues)



In [None]:
import pydriller
from pydriller import *

In [None]:
def issue_date_generator(issue, type):
  issue_managed_at = issue[type]
  issue_managed_at = issue_managed_at.split('-')
  issue_managed_at[2] = issue_managed_at[2].split('T')
  return [int(issue_managed_at[0]), int(issue_managed_at[1]), int(issue_managed_at[2][0])]


In [None]:
# hyper parameters = i,j,num
# required variables = issue_commit_map, CommitDetails
# ["AIzaSyDWrUrF3so_PFsu6TAJnuzJq3wdaXrNCDo","AIzaSyByDnbCoLo-gRZhvOBLhgubdmOd6Ydmw_I","AIzaSyDfpsjvYJNsIdfQcscvTQGZiyDlXCRoO68","AIzaSyABHudVPS2ahdXtrkDPXfYpq9KtXwzGzlQ", "AIzaSyBPes3Qot8ZqLb7hm1je4kcbSdK6B1SXCU"]
GeminiModel = GeminiAIModel([""], 'gemini-pro')
GeminiModel.train("You will receive two objects. One is the lines of code of a method before changes due to a commit and second one is also some lines of code of a method after changes, your job is to give me a one line summarization of what is altered and the meaning of it based on the before and after the method changes, only one line summarization")


0
1


In [None]:
def compare_strings(str1, str2):
  """
  Compares two strings of lines and returns a list of tuples containing information about changed lines in both strings.

  Args:
      str1: The first string to compare.
      str2: The second string to compare.

  Returns:
      A list of tuples containing information about changed lines in both strings.
  """
  # Split the strings into lists of lines
  lines1=""
  lines2=""

  if(str1!=None): lines1 = str1.splitlines()
  if(str2!=None): lines2 = str2.splitlines()

  # Find the longest list of lines
  max_len = max(len(lines1), len(lines2))

  # Initialize lists to store information about changed lines
  changed_lines_in_str1 = []
  changed_lines_in_str2 = []

  # Iterate through the lines
  for i in range(max_len):
    # Check if the lines are equal
    if i < len(lines1) and i < len(lines2) and lines1[i] == lines2[i]:
      continue
    # If lines are not equal, add info about changed lines in str1 and str2
    line_num = i + 1
    if i < len(lines1):
      line1_content = lines1[i]
    else:
      line1_content = ""
    if i < len(lines2):
      line2_content = lines2[i]
    else:
      line2_content = ""
    if line1_content != "":
      # changed_lines_in_str1.append(f"Line {line_num} in str1: {line1_content}")
      changed_lines_in_str1.append((line_num, line1_content))
    if line2_content != "":
      # changed_lines_in_str2.append(f"Line {line_num} in str2: {line2_content}")
      changed_lines_in_str2.append((line_num, line2_content))

  return changed_lines_in_str1, changed_lines_in_str2

# # Example usage
# str1 = """Line 1
# Line 2
# Same line
# Line 4"""

# str2 = """Line 1
# Changed line
# Same line
# New line"""

# lines_changed_in_str1, lines_changed_in_str2 = compare_strings(str1, str2)

# if lines_changed_in_str1:
#   print("Lines changed in str1:")
#   for line in lines_changed_in_str1:
#     print(line)

# if lines_changed_in_str2:
#   print("Lines changed in str2:")
#   for line in lines_changed_in_str2:
#     print(line)

# if not lines_changed_in_str1 and not lines_changed_in_str2:
#   print("No lines changed.")

# def compare_strings(string1, string2):
#     lines_string1=""
#     lines_string2=""
#     if(string1!=None): lines_string1 = string1.split('\n')
#     if(string2!=None): lines_string2 = string2.split('\n')

#     changes_before = []
#     changes_after = []

#     for line_num, (line1, line2) in enumerate(zip(lines_string1, lines_string2), start=1):
#         if line1.strip() != line2.strip():
#             changes_before.append((line_num, line1.strip()))
#             changes_after.append((line_num, line2.strip()))

#     print("COMAPRED STRINGS",changes_after)

#     return changes_before, changes_after

# if _name_ == "_main_":
#     string1 = """Your first long string here"""
#     string2 = """Your second long string here"""

#     changes_before, changes_after = compare_strings(string1, string2)

#     print("Lines before the change:")
#     for line_num, line_content in changes_before:
#         print(f"Line {line_num}: {line_content}")

#     print("\nLines after the change:")
#     for line_num, line_content in changes_after:
#         print(f"Line {line_num}: {line_content}")

def _getMethodBody(method, source_code, file):
    """
    Given a method, it returns the body of the method.
    :param method: the method
    :param source_code: the source code
    :param file: the file
    :return: the body of the method
    """
    if method and source_code:
        lines = source_code.split("\n")
        start = method.start_line
        end = method.end_line
        method_body = "\n".join(lines[start - 1 : end])
        return method_body
    return None

In [None]:
Commits = pydriller.Repository('https://github.com/shosetsuorg/shosetsu').traverse_commits()

In [None]:
from pydriller import Repository

# Define the path to the Git repository
repo_path = 'https://github.com/shosetsuorg/shosetsu'

# Create an empty hashmap to store commit information
commit_hashmap = {}



# Iterate over all commits in the repository
for commit in Repository(repo_path).traverse_commits():
    # Retrieve commit information
  modified_files=[]
  _changes_before = ""
  _changes_after = ""
  if(len(commit_hashmap)>1): break
  for m in commit.modified_files:
      modified_files.append(m)

  for file in modified_files:

    # checking...
    # num = num + 1
    # print(num)

    # iterating over all modified methods in those modified files
    for method in file.changed_methods:

        # getting whole method before and after
        method_before = next((x for x in file.methods_before if x == method), None)
        # print(method_before)
        method_after = next((x for x in file.methods if x == method), None)
        # print(method_after)


        #  getting method body before and after
        body_before = _getMethodBody(method_before, file.source_code_before, file)
        body_after = _getMethodBody(method_after, file.source_code, file)

        changes_before = ""
        changes_after = ""

        # print("DODIES BEFORE\n", body_before,"DODIES AFTER\n", body_after)

        # getting only the changes of method body if they are changed only
        # if body_before == None or body_after == None:
        #   pass
        # else:
        changes_before, changes_after = compare_strings(body_before, body_after)
        # changes_before, changes_after = compare_strings(body_before, body_after)

        # getting changes before and after in a single string format
        print("CAHNGES", changes_before,"AFTERSDF", changes_after)

        for line_num, line_content in changes_before:
            _changes_before = _changes_before + "Line" + str(line_num) + ": " + str(line_content) + "\n"
        for line_num, line_content in changes_after:
            _changes_after = _changes_after + "Line" + str(line_num) + ": " + str(line_content) + "\n"

    commit_hash = commit.hash
    commit_data = {
        'author_name': commit.author.name,
        'author_email': commit.author.email,
        'committer_name': commit.committer.name,
        'committer_email': commit.committer.email,
        'message': commit.msg,
        # 'files_modified': [modification.filename for modification in commit.modifications],
        'changes_before': _changes_before,
        'changes_after': _changes_after,
        # You can add more information as needed
    }
    # Add commit information to the hashmap
    commit_hashmap[commit_hash] = commit_data

# Print the commit hashmap (you can perform further operations with it)
for commit_hash, commit_data in commit_hashmap.items():
    print(f'Commit Hash: {commit_hash}')
    print(f'Commit Data: {commit_data}')
    print('-' * 50)


CAHNGES [] AFTERSDF [(1, '    public void useAppContext() {'), (2, '        // Context of the app under test.'), (3, '        Context appContext = InstrumentationRegistry.getTargetContext();'), (5, '        assertEquals("com.github.Doomsdayrs.apps", appContext.getPackageName());'), (6, '    }')]
CAHNGES [] AFTERSDF [(1, '    public boolean onOptionsItemSelected(MenuItem item) {'), (2, '        // Handle action bar item clicks here. The action bar will'), (3, '        // automatically handle clicks on the Home/Up button, so long'), (4, '        // as you specify a parent activity in AndroidManifest.xml.'), (5, '        int id = item.getItemId();'), (7, '        //noinspection SimplifiableIfStatement'), (8, '        if (id == R.id.action_search) {'), (9, '            return true;'), (10, '        }'), (12, '        return super.onOptionsItemSelected(item);'), (13, '    }')]
CAHNGES [] AFTERSDF [(1, '    public boolean onCreateOptionsMenu(Menu menu) {'), (2, '        // Inflate the menu; 

# Pipeline 2 using Open AI

In [None]:
issue_commit_map = []
j = 0
for issue in issues:
    # checking...
    j = j + 1
    if len(issue_commit_map) > 1: break
    if j <= 5: continue
    # print("j: ", j)
    # if j==2: break
    # date generation
    # print(issue["created_at"])
    # print(issue["closed_at"])
    issue_created_at = issue_date_generator(issue, "created_at")
    issue_closed_at = issue_date_generator(issue, "closed_at")
    dt1 = datetime(issue_created_at[0], issue_created_at[1], issue_created_at[2], 0, 0, 0)
    dt2 = datetime(issue_closed_at[0], issue_closed_at[1], issue_closed_at[2], 23, 59, 0)
    # print(dt1,dt2)
    # fetching commits
    Commits = pydriller.Repository('https://github.com/shosetsuorg/shosetsu', since=dt1, to=dt2).traverse_commits()
    i = 0
    num = 0
    CommitDetails = []
    print(Commits)
    # iterating over all commits
    for commit in Commits:

        print(commit.msg)
        modified_files = []

        # checking...
        i = i + 1
        if(i == 4): break
        num = 0
        method_info = ""
        # get all modified files in that particular commit
        for m in commit.modified_files:
            modified_files.append(m)
        method_changes = []

        # iterating over all  modified files
        print(modified_files)
        for file in modified_files:

            # checking...
            num = num + 1
            print("num: ",num)
            iter3 = 0
            # iterating over all modified methods in those modified files
            for method in file.changed_methods:
                iter3 += 1
                # getting whole method before and after
                method_before = next((x for x in file.methods_before if x == method), None)
                print(method_before)
                method_after = next((x for x in file.methods if x == method), None)
                print(method_after)


                #  getting method body before and after
                body_before = _getMethodBody(method_before, file.source_code_before, file)
                body_after = _getMethodBody(method_after, file.source_code, file)

                changes_before = ""
                changes_after = ""

                # getting only the changes of method body if they are changed only
                if body_before == None or body_after == None: pass
                else: changes_before, changes_after =compare_strings(body_before, body_after)


                # getting changes before and after in a single string format
                _changes_before = ""
                _changes_after = ""
                for line_num, line_content in changes_before:
                    _changes_before = _changes_before + "Line" + str(line_num) + ": " + str(line_content) + "\n"
                for line_num, line_content in changes_after:
                    _changes_after = _changes_after + "Line" + str(line_num) + ": " + str(line_content) + "\n"



                print("changes_before: ", _changes_before)
                print("changes_after: ", _changes_after)

                method_changes.append([_changes_before, _changes_after])


                # getting summarization from gemini AI
                prompt = "before changes: " + _changes_before + "after changes: " + _changes_after
                response =  GeminiModel.prompt(prompt)
                method_info = method_info + response

                print("iter: ", iter3)
                if iter3 == 2: break
            print("num: ", num)
            if num == 2: break

        CommitDetails.append({ "index": i, "url" : "", "method_summarization": method_info})
    issue_commit_map.append({"issue_index" : j,"issue_title" : issue["title"], "issue_body" : issue["body"], "CommitDetails" : CommitDetails })

print(issue_commit_map)



<generator object Repository.traverse_commits at 0x7801bd5589e0>
Prevent downloading all when there are no chapters
[<pydriller.domain.commit.ModifiedFile object at 0x7801bf04fa60>]
num:  1
<pydriller.domain.commit.Method object at 0x7801bf5892d0>
<pydriller.domain.commit.Method object at 0x7801bd5141c0>
changes_before:  Line2: 		downloadChapterPassageUseCase(chapters)
Line4: 		if (startManager)
Line5: 			startDownloadWorkerUseCase()
Line6: 	}

changes_after:  Line2: 		if (chapters.isEmpty()) return
Line3: 		downloadChapterPassageUseCase(chapters)
Line5: 		if (startManager)
Line6: 			startDownloadWorkerUseCase()
Line7: 	}

iter:  1
num:  1
Start working on improving UI performance, add @Immutable to UI models
[<pydriller.domain.commit.ModifiedFile object at 0x7801bf04ffd0>, <pydriller.domain.commit.ModifiedFile object at 0x7801bf04fca0>, <pydriller.domain.commit.ModifiedFile object at 0x7801bd5162f0>, <pydriller.domain.commit.ModifiedFile object at 0x7801bd514a00>, <pydriller.domain.co

# Pipeline 3

In [None]:
def get_text_from_issue(issue):
  issue_url = issue['url']
  comments_url = issue['comments_url']
  issue_title = issue['title']
  issue_number = issue['number']
  issue_body = issue['body']
  formatted_issue_info = f"Issue Title: {issue_title}\n" \
                        f"Issue Number: {issue_number}\n" \
                        f"Issue Body: {issue_body}\n" \


  return formatted_issue_info


api_keys = [""]
owner = "shosetsuorg"
repo = "shosetsu"

fetcher = GithubFetcher(owner, repo, api_keys)
commits = fetcher.get_commits()
issues = fetcher.get_issues(None)


In [None]:
issue_data = {}
print("len of issues : ", len(issues))
#getting issue data for each issue as text
for issue in issues:
    print( issue['number'])
    try:
      issue_data[issue['number']] = get_text_from_issue(issue)
    except Exception as e:
      print(f"Error processing issue {issue['number']}: {e}")
      issue_data[issue['number']] = "ignore this"

#now we'll give this issue data of each issue to LLM and ask it to find the methods
prompt = "You are given a Github issue's details, find what methods are changed by this issue, and give the methods names, here is the issue text: "


In [None]:

issue_methods_data_ss = {}# maps issue numbers with methods changed
GeminiModel = GeminiAIModel("", 'gemini-pro')
GeminiModel.train(prompt)
print(GeminiModel.prompt(prompt))
loop_count = 1


**Issue Title:** Update `calculate_cost` method to support new pricing model

**Issue Description:**

The `calculate_cost` method needs to be updated to support the new pricing model. The new model introduces a new pricing tier and changes the pricing for existing tiers.

**Impacted Methods:**

* `calculate_cost`

**Additional Information:**

* The `calculate_cost` method is responsible for calculating the cost of using a service.
* The new pricing model will take effect on March 1, 2023.

**Methods Changed:**

* `calculate_cost`


In [None]:

for issue in issues:
  _issue_content_ = (GeminiModel.prompt("Here is the next issue details: " + issue_data[issue['number']]))
  issue_methods_data_ss[issue['number']] = _issue_content_
  # print(_issue_content_)
  print(issue['number'])
  # break
  loop_count += 1
  if loop_count % 250 == 0:
      print("Waiting for 10 seconds...")
      time.sleep(10)

239
238
233
229
228
227
226
225


ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 1149.57ms


InternalServerError: 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting

In [None]:
issue_time_data={}
for issue in issues:
  issue_time_data[issue['number']]=[issue['created_at'],issue['closed_at']]



issue_commitList = {}

for issue in issues:
  # Format the date according to pydriller
    issue_started_at =issue_date_generator(issue, "created_at")
    issue_start_date = datetime(issue_started_at[0], issue_started_at[1], issue_started_at[2], 23, 59, 0)
    if issue_time_data[issue['number']][1] is None:
        issue_end_date = datetime(3000,1,1,23,59,0)  # Set end date as year 3000
    else:
        issue_closed_at =issue_date_generator(issue, "closed_at")
        issue_end_date = datetime(issue_closed_at[0], issue_closed_at[1], issue_closed_at[2], 23, 59, 0)

    commits_in_range = []
    api_key = ''
    cnt = 0;
    newcommits = fetcher.get_commits(issue_start_date, issue_end_date)
    print(len(newcommits), " Len of newcommtis")
    for commit in newcommits:
        commits_in_range.append(commit)
        cnt += 1;
        if cnt > 200: # Get the first 200 commits in the range
          print("cnt > 200")
          break

    issue_commitList[issue['number']] = commits_in_range

11  Len of newcommtis
30  Len of newcommtis
64  Len of newcommtis
102  Len of newcommtis
102  Len of newcommtis
102  Len of newcommtis
102  Len of newcommtis
102  Len of newcommtis
128  Len of newcommtis
151  Len of newcommtis
163  Len of newcommtis
163  Len of newcommtis
203  Len of newcommtis
cnt > 200
203  Len of newcommtis
cnt > 200
264  Len of newcommtis
cnt > 200
332  Len of newcommtis
cnt > 200
332  Len of newcommtis
cnt > 200
332  Len of newcommtis
cnt > 200
332  Len of newcommtis
cnt > 200
332  Len of newcommtis
cnt > 200
332  Len of newcommtis
cnt > 200
332  Len of newcommtis
cnt > 200
332  Len of newcommtis
cnt > 200
332  Len of newcommtis
cnt > 200
332  Len of newcommtis
cnt > 200
467  Len of newcommtis
cnt > 200
482  Len of newcommtis
cnt > 200
535  Len of newcommtis
cnt > 200
541  Len of newcommtis
cnt > 200
616  Len of newcommtis
cnt > 200


In [None]:
# now we have issue number matched with the commits in the timeframe,
# we'll match the methods in the issues (which we got from gemini) to methods in commits
# issue_methods_data_ss has the methods which are changed in the issue
# issue_commitList has the commits in the timeline of the issue
# for each commit in the timeline, get all methods changed and find if any methods matches if yes the add the commit' sha to the issue
prompt = "I'll give you two arrays, one is methods names changed in issue, which i got from the issue body, and the other is methods names changed in a commit, check if any methods changed in the commit matches with the methods changed in the issue, if yes give the count of how many methods match, else give 0"
issue_commit_matched = {}
GeminiModel = GeminiAIModel("", 'gemini-pro')
GeminiModel.train(prompt)
for issue in issues:
  issue_no = issue['number']
  print(issue_no, "Issue no")
  print("LEN: ", len(issue_commitList[issue_no]))
  for commit in issue_commitList[issue_no]:
    commit_methods = []
    print("hie")
    for modified_file in commit.modified_files:
      # ignore if this file doesn't change any methods
      if len(modified_file.changed_methods) == 0:
                continue
      for method in modified_file.changed_methods:
        commit_methods.append(method.name)

    # now we have the methods changed in this commit (in commit_methods[]) and issue methods in issue_methods[]
    commits_methods_string = json.dumps(commit_methods)
    print(methods_string)
    # break
    issue_methods = issue_methods_data_ss[issue_no]
    issue_methods_string = json.dumps(issue_methods)

    # get LLM response, if response == 0, this ignore, else add this commit' sha to the issue (as this commit is related to the issue)
    out = GeminiModel.prompt("ISSUE METHOD NAMES : " + issue_methods_string + "\nCOMMIT METHOD NAMES: " + commits_methods_string)
    if out != "0" and out != None:
      issue_commit_matched[issue_no].append(commit.sha)
  # break

239 Issue no
LEN:  11
hie


GitCommandError: Cmd('git') failed due to: exit code(128)
  cmdline: git diff-tree e59602ae0e8e795012cc16b0c34952e1b71f302f 7efb7f3894b60e99d948f0a627e6bdbd5e0759a7 -r --abbrev=40 --full-index -M -p --no-color

# Issues with Gemini:
## 1. Limited queries (after 4-5 queries its dying)
-------------------------------------------------------------------------------------------------------------------

ex: ERROR:tornado.access:500 POST /v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 886.96ms


InternalServerError                       Traceback (most recent call last)
<ipython-input-8-14c90875c593> in <cell line: 4>()
      2 GeminiModel = GeminiAIModel("AIzaSyCx9HXvDXEAAnZQ0Rfefi4hhteNcff-GLY", 'gemini-pro')
      3 GeminiModel.train(prompt)
----> 4 print(GeminiModel.prompt(prompt))
      5 loop_count = 1

9 frames
/usr/local/lib/python3.10/dist-packages/google/ai/generativelanguage_v1beta/services/generative_service/transports/rest.py in __call__(self, request, retry, timeout, metadata)
    854             # subclass.
    855             if response.status_code >= 400:
--> 856                 raise core_exceptions.from_http_response(response)
    857
    858             # Return the response

InternalServerError: 500 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting

--------------------------------------------------------------------------------------------------------------------

## 2. Server issue

ex: TimeoutError: timed out

The above exception was the direct cause of the following exception:

ReadTimeoutError                          Traceback (most recent call last)
ReadTimeoutError: HTTPConnectionPool(host='localhost', port=44741): Read timed out. (read timeout=60.0)

During handling of the above exception, another exception occurred:

ReadTimeout                               Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    530                 raise SSLError(e, request=request)
    531             elif isinstance(e, ReadTimeoutError):
--> 532                 raise ReadTimeout(e, request=request)
    533             elif isinstance(e, _InvalidHeader):
    534                 raise InvalidHeader(e, request=request)

ReadTimeout: HTTPConnectionPool(host='localhost', port=44741): Read timed out. (read timeout=60.0)


## Git Error:

GitCommandError                           Traceback (most recent call last)
<ipython-input-42-e5a81febdece> in <cell line: 10>()
     15     commit_methods = []
     16     print("hie")
---> 17     for modified_file in commit.modified_files:
     18       # ignore if this file doesn't change any methods
     19       if len(modified_file.changed_methods) == 0:

5 frames
/usr/local/lib/python3.10/dist-packages/git/cmd.py in wait(self, stderr)
    655                 errstr = read_all_from_possibly_closed_stream(p_stderr)
    656                 _logger.debug("AutoInterrupt wait stderr: %r" % (errstr,))
--> 657                 raise GitCommandError(remove_password_if_present(self.args), status, errstr)
    658             return status
    659

GitCommandError: Cmd('git') failed due to: exit code(128)
  cmdline: git diff-tree e59602ae0e8e795012cc16b0c34952e1b71f302f 7efb7f3894b60e99d948f0a627e6bdbd5e0759a7 -r --abbrev=40 --full-index -M -p --no-colo

Issue Issue Mapping using semantic matching of Issue titles

In [None]:
!pip install gensim





In [None]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import nltk
nltk.download('punkt')

# Sample sentences for training the Word2Vec model (you can replace this with your own data)
sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "A fast fox jumps over a lazy dog.",
    "The lazy cat sits on the mat.",
    "Dogs and cats are both pets.",
    "The sun rises in the east."
]

# Tokenize the sentences
tokenized_sentences = [nltk.word_tokenize(sentence.lower()) for sentence in sentences]

# Train the Word2Vec model
model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

def preprocess_text(text):
    """
    Tokenizes and preprocesses the input text.
    """
    return nltk.word_tokenize(text.lower())

def semantic_similarity(str1, str2):
    """
    Calculate semantic similarity between two strings using Word2Vec embeddings.

    Args:
    - str1 (str): First string.
    - str2 (str): Second string.

    Returns:
    - float: Similarity score between 0 and 1.
             1 means the strings are semantically identical,
             0 means they have no semantic similarity.
    """
    # Preprocess the strings
    tokens1 = preprocess_text(str1)
    tokens2 = preprocess_text(str2)

    # Get word vectors
    word_vectors = model.wv

    # Compute cosine similarity between average word vectors of each sentence
    similarity_score = word_vectors.n_similarity(tokens1, tokens2)

    return similarity_score

# Example usage:
string1 = "The quick brown fox jumps over the lazy dog."
string2 = "A fast fox jumps over a lazy dog."
print(string2)
similarity = semantic_similarity(string1, string2)
print(f"Semantic similarity score: {similarity}")


A fast fox jumps over a lazy dog.
Semantic similarity score: 0.5413475036621094


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import nltk
nltk.download('punkt')

# Sample sentences for training the Word2Vec model (you can replace this with your own data)
sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "A fast fox jumps over a lazy dog.",
    "The lazy cat sits on the mat.",
    "Dogs and cats are both pets.",
    "The sun rises in the east."
]

# Tokenize the sentences
tokenized_sentences = [nltk.word_tokenize(sentence.lower()) for sentence in sentences]

# Train the Word2Vec model
model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

def preprocess_text(text):
    """
    Tokenizes and preprocesses the input text.
    """
    return nltk.word_tokenize(text.lower())

def semantic_similarity(str1, str2):
    """
    Calculate semantic similarity between two strings using Word2Vec embeddings.

    Args:
    - str1 (str): First string.
    - str2 (str): Second string.

    Returns:
    - float: Similarity score between 0 and 1.
             1 means the strings are semantically identical,
             0 means they have no semantic similarity.
    """
    # Preprocess the strings
    tokens1 = preprocess_text(str1)
    tokens2 = preprocess_text(str2)

    # Get word vectors
    word_vectors = model.wv

    # Compute cosine similarity between average word vectors of each sentence
    similarity_score = word_vectors.n_similarity(tokens1, tokens2)

    return similarity_score

# Example usage:
string1 = "The quick brown fox jumps over the lazy dog."
string2 = "A fast fox jumps over a lazy dog."
similarity = semantic_similarity(string1, string2)
print(f"Semantic similarity score: {similarity}")


Semantic similarity score: 0.5413475036621094


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import math

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

def preprocess(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    return lemmatized_tokens

def get_synsets(tokens):
    synsets = []
    for token in tokens:
        synsets.extend(wn.synsets(token))
    return synsets

def calculate_similarity(synsets1, synsets2):
    max_similarity = 0
    for synset1 in synsets1:
        for synset2 in synsets2:
            similarity = synset1.path_similarity(synset2)
            if similarity:
                max_similarity = max(max_similarity, similarity)
    return max_similarity

def semantic_similarity(sentence1, sentence2):
    tokens1 = preprocess(sentence1)
    tokens2 = preprocess(sentence2)
    synsets1 = get_synsets(tokens1)
    synsets2 = get_synsets(tokens2)
    similarity = calculate_similarity(synsets1, synsets2)
    return similarity

# Example usage
sentence1 = "cannot install numpy on python 3.13"
sentence2 = "a meson bug for win11 "
similarity_score = semantic_similarity(sentence1, sentence2)
print("Semantic similarity score:", similarity_score)


Semantic similarity score: 0.16666666666666666


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def compare_strings(str1, str2):
  """
  Compares two strings of lines and returns a list of lines that are different.

  Args:
      str1: The first string to compare.
      str2: The second string to compare.

  Returns:
      A list of lines that are different between the two strings.
  """
  # Split the strings into lists of lines
  lines1 = str1.splitlines()
  lines2 = str2.splitlines()

  # Find the longest list of lines
  max_len = max(len(lines1), len(lines2))

  # Initialize a list to store the changed lines
  changed_lines = []

  # Iterate through the lines
  for i in range(max_len):
    # Check if the lines are equal
    if i < len(lines1) and i < len(lines2) and lines1[i] == lines2[i]:
      continue
    # If the lines are not equal, add the line number and content to the changed lines list
    line_num = i + 1
    if i < len(lines1):
      line1_content = lines1[i]
    else:
      line1_content = ""
    if i < len(lines2):
      line2_content = lines2[i]
    else:
      line2_content = ""
    changed_lines.append(f"Line {line_num}: {line1_content} -> {line2_content}")

  return changed_lines

# Example usage
str1 = """"""

str2 = """Line 1
Changed line
Same line
New line"""

changed_lines = compare_strings(str1, str2)

if changed_lines:
  print("Changed lines:")
  for line in changed_lines:
    print(line)
else:
  print("No lines changed.")


Changed lines:
Line 1:  -> Line 1
Line 2:  -> Changed line
Line 3:  -> Same line
Line 4:  -> New line


In [None]:
def compare_strings(str1, str2):
  """
  Compares two strings of lines and returns a list of tuples containing information about changed lines in both strings.

  Args:
      str1: The first string to compare.
      str2: The second string to compare.

  Returns:
      A list of tuples containing information about changed lines in both strings.
  """
  # Split the strings into lists of lines
  lines1 = str1.splitlines()
  lines2 = str2.splitlines()

  # Find the longest list of lines
  max_len = max(len(lines1), len(lines2))

  # Initialize lists to store information about changed lines
  changed_lines_in_str1 = []
  changed_lines_in_str2 = []

  # Iterate through the lines
  for i in range(max_len):
    # Check if the lines are equal
    if i < len(lines1) and i < len(lines2) and lines1[i] == lines2[i]:
      continue
    # If lines are not equal, add info about changed lines in str1 and str2
    line_num = i + 1
    if i < len(lines1):
      line1_content = lines1[i]
    else:
      line1_content = ""
    if i < len(lines2):
      line2_content = lines2[i]
    else:
      line2_content = ""
    if line1_content != "":
      changed_lines_in_str1.append(f"Line {line_num} in str1: {line1_content}")
    if line2_content != "":
      changed_lines_in_str2.append(f"Line {line_num} in str2: {line2_content}")

  return changed_lines_in_str1, changed_lines_in_str2

# Example usage
str1 = """Line 1
Line 2
Same line
Line 4"""

str2 = """Line 1
Changed line
Same line
New line"""

lines_changed_in_str1, lines_changed_in_str2 = compare_strings(str1, str2)

if lines_changed_in_str1:
  print("Lines changed in str1:")
  for line in lines_changed_in_str1:
    print(line)

if lines_changed_in_str2:
  print("Lines changed in str2:")
  for line in lines_changed_in_str2:
    print(line)

if not lines_changed_in_str1 and not lines_changed_in_str2:
  print("No lines changed.")


Lines changed in str1:
Line 2 in str1: Line 2
Line 4 in str1: Line 4
Lines changed in str2:
Line 2 in str2: Changed line
Line 4 in str2: New line


In [None]:
import requests
from pydriller import Repository
import os

class GithubFetcher:
    def __init__(self, owner, repo, api_keys):
        self.api_keys = api_keys
        self.current_key_index = 0
        self.rate_limit_wait_time = 60
        self.max_retries = 3
        self.retryable_exceptions = (
            requests.exceptions.RequestException,
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
            requests.exceptions.HTTPError,
        )
        self.reset_times = {}
        self.base_url = "https://api.github.com"
        self.owner = owner
        self.repo = repo
        self.issues = []
        self.commits = []

    def _key_handler(self):
        if len(self.api_keys) == 0:
            print("No API keys available")
            return None

        key = self.api_keys[self.current_key_index]
        self.current_key_index = (self.current_key_index + 1) % len(self.api_keys)
        return key

    def get_rate_limit_info(self, response):
        rate_limit_info = {
            "limit": int(response.headers["X-RateLimit-Limit"]),
            "remaining": int(response.headers["X-RateLimit-Remaining"]),
            "reset": int(response.headers["X-RateLimit-Reset"]),
        }
        return rate_limit_info

    def get(self, url, headers=None, params=None):
        if headers is None:
            headers = {}

        for attempt in range(1, self.max_retries + 1):
            try:
                headers["Authorization"] = f"token {self._key_handler()}"
                response = requests.get(url, headers=headers, params=params)
                response.raise_for_status()

                rate_limit_info = self.get_rate_limit_info(response)
                print(f"Rate limit info: {rate_limit_info},key used: {headers['Authorization']}")
                if rate_limit_info["remaining"] == 0:
                    print(
                        f"Rate limit reached. Waiting for {self.rate_limit_wait_time} seconds..."
                    )
                    time.sleep(self.rate_limit_wait_time)
                    continue

                if "GitHub-Authentication-Token-Expiration" in response.headers:
                    expiration_time = int(
                        response.headers["GitHub-Authentication-Token-Expiration"]
                    )
                    self.reset_times[
                        self.api_keys[
                            (self.current_key_index - 1 + len(self.api_keys))
                            % len(self.api_keys)
                        ]
                    ] = expiration_time
                    print(f"Key expires at {expiration_time}")

                current_key = self.api_keys[self.current_key_index - 1]
                if (
                    current_key in self.reset_times
                    and time.time() > self.reset_times[current_key]
                ):
                    del self.api_keys[self.current_key_index - 1]
                    del self.reset_times[current_key]
                    print(f"Key {current_key} expired. Removing.")
                    continue

                print("headers",headers)
                # json.dump(response.json(), open("data1.json", "w"))
                return response.json()

            except self.retryable_exceptions as e:
                if attempt == self.max_retries:
                    print(
                        f"Attempt {attempt}/{self.max_retries}: {e.__class__.__name__}"
                    )
                    if response:
                        print(f"Error details: {response.text}")
                    # raise
                else:
                    print(
                        f"Attempt {attempt}/{self.max_retries}: {e.__class__.__name__} occurred. Retrying..."
                    )

        return None

    def get_issues(self,params):
        url = f"{self.base_url}/repos/{self.owner}/{self.repo}/issues"
        headers = {"Authorization": f"token {self._key_handler()}"}
        response = requests.get(url, headers=headers,params = params)
        if response.status_code == 200:
            self.issues = response.json()
            self.save_issues(self.issues)
            return self.issues
        else:
            print(f"Failed to fetch issues: {response.status_code}")
            return []

    def get_commits(self, from_date=None, to_date=None):
        ret = []
        if from_date is None:
            self.commits = list(Repository(f"https://github.com/{self.owner}/{self.repo}").traverse_commits())
            ret = self.commits
            self.save_commits(self.commits)
        else:
            ret = list(Repository(f"https://github.com/{self.owner}/{self.repo}", since=from_date, to=to_date).traverse_commits())
        return ret

    def save_issues(self, issues):
        folder_path = f"{self.owner}/{self.repo}/issues"
        os.makedirs(folder_path, exist_ok=True)
        with open(f"{folder_path}/issues.txt", "w") as f:
            for issue in issues:
                f.write(f"Issue #{issue['number']}: {issue['title']}\n")

    def save_commits(self, commits):
        folder_path = f"{self.owner}/{self.repo}/commits"
        os.makedirs(folder_path, exist_ok=True)
        with open(f"{folder_path}/commits.txt", "w") as f:
            for commit in commits:
                f.write(f"Commit: {commit.hash} by {commit.author.name} - {commit.msg}\n")



# Example usage:
# if __name__ == "__main__":
#     api_keys = ["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"]
#     owner = "lcompilers"
#     repo = "lpython"

#     fetcher = GithubFetcher(owner, repo, api_keys)
#     issues = fetcher.get_issues(None)
#     if issues:
#         print("Issues:")
#         for issue in issues:
#             print(f"Issue #{issue['number']}: {issue['title']}")

#     commits = fetcher.get_commits()


from datetime import datetime
import requests
import json

GitHubHandler = GithubFetcher("shosetsuorg","shosetsu", [""])
issues = GitHubHandler.get_issues({"state":"closed", "per_page":"100"})
# print(issues)

import pydriller
from pydriller import *



def compare_strings(str1, str2):
  """
  Compares two strings of lines and returns a list of tuples containing information about changed lines in both strings.

  Args:
      str1: The first string to compare.
      str2: The second string to compare.

  Returns:
      A list of tuples containing information about changed lines in both strings.
  """
  # Split the strings into lists of lines
  lines1=""
  lines2=""

  if(str1!=None): lines1 = str1.splitlines()
  if(str2!=None): lines2 = str2.splitlines()

  # Find the longest list of lines
  max_len = max(len(lines1), len(lines2))

  # Initialize lists to store information about changed lines
  changed_lines_in_str1 = []
  changed_lines_in_str2 = []

  # Iterate through the lines
  for i in range(max_len):
    # Check if the lines are equal
    if i < len(lines1) and i < len(lines2) and lines1[i] == lines2[i]:
      continue
    # If lines are not equal, add info about changed lines in str1 and str2
    line_num = i + 1
    if i < len(lines1):
      line1_content = lines1[i]
    else:
      line1_content = ""
    if i < len(lines2):
      line2_content = lines2[i]
    else:
      line2_content = ""
    if line1_content != "":
      # changed_lines_in_str1.append(f"Line {line_num} in str1: {line1_content}")
      changed_lines_in_str1.append((line_num, line1_content))
    if line2_content != "":
      # changed_lines_in_str2.append(f"Line {line_num} in str2: {line2_content}")
      changed_lines_in_str2.append((line_num, line2_content))

  return changed_lines_in_str1, changed_lines_in_str2

# # Example usage
# str1 = """Line 1
# Line 2
# Same line
# Line 4"""

# str2 = """Line 1
# Changed line
# Same line
# New line"""

# lines_changed_in_str1, lines_changed_in_str2 = compare_strings(str1, str2)

# if lines_changed_in_str1:
#   print("Lines changed in str1:")
#   for line in lines_changed_in_str1:
#     print(line)

# if lines_changed_in_str2:
#   print("Lines changed in str2:")
#   for line in lines_changed_in_str2:
#     print(line)

# if not lines_changed_in_str1 and not lines_changed_in_str2:
#   print("No lines changed.")

# def compare_strings(string1, string2):
#     lines_string1=""
#     lines_string2=""
#     if(string1!=None): lines_string1 = string1.split('\n')
#     if(string2!=None): lines_string2 = string2.split('\n')

#     changes_before = []
#     changes_after = []

#     for line_num, (line1, line2) in enumerate(zip(lines_string1, lines_string2), start=1):
#         if line1.strip() != line2.strip():
#             changes_before.append((line_num, line1.strip()))
#             changes_after.append((line_num, line2.strip()))

#     print("COMAPRED STRINGS",changes_after)

#     return changes_before, changes_after

# if _name_ == "_main_":
#     string1 = """Your first long string here"""
#     string2 = """Your second long string here"""

#     changes_before, changes_after = compare_strings(string1, string2)

#     print("Lines before the change:")
#     for line_num, line_content in changes_before:
#         print(f"Line {line_num}: {line_content}")

#     print("\nLines after the change:")
#     for line_num, line_content in changes_after:
#         print(f"Line {line_num}: {line_content}")

def _getMethodBody(method, source_code, file):
    """
    Given a method, it returns the body of the method.
    :param method: the method
    :param source_code: the source code
    :param file: the file
    :return: the body of the method
    """
    if method and source_code:
        lines = source_code.split("\n")
        start = method.start_line
        end = method.end_line
        method_body = "\n".join(lines[start - 1 : end])
        return method_body
    return None

def issue_date_generator(issue, type):
  issue_managed_at = issue[type]
  issue_managed_at = issue_managed_at.split('-')
  issue_managed_at[2] = issue_managed_at[2].split('T')
  return [int(issue_managed_at[0]), int(issue_managed_at[1]), int(issue_managed_at[2][0])]

import ollama

# response = ollama.chat(model='llama2', messages=[{
#    'role': 'user',
#    'content': ''' You will receive two objects. One is the lines of code of a method before changes due to a commit and second one is also some lines of code of a method after changes, your job is to give me a one line summarization of what is altered and the meaning of it based on the before and after the method changes, only one line summarization''',
#  },{
#    'role': 'user',
#    'content': ''' What is the value of a+b ?''',
#  }])
#print(response['message']['content'],'\n','\n','\n','\n','\n','\n','\n','\n','\n','\n','\n','\n','\n')

from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cpu" # the device to load the model onto

model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

issue_commit_map = []
j = 0
for issue in issues:
    # checking...
    # j = j + 1
    if len(issue_commit_map) > 1: break
    # if j <= 5: continue
    # print("j: ", j)
    # if j==2: break
    # date generation
    # print(issue["created_at"])
    # print(issue["closed_at"])
    issue_created_at = issue_date_generator(issue, "created_at")
    issue_closed_at = issue_date_generator(issue, "closed_at")
    dt1 = datetime(issue_created_at[0], issue_created_at[1], issue_created_at[2], 0, 0, 0)
    dt2 = datetime(issue_closed_at[0], issue_closed_at[1], issue_closed_at[2], 23, 59, 0)
    # print(dt1,dt2)
    # fetching commits
    Commits = pydriller.Repository('../Downloads/shostesu/shosetsu', since=dt1, to=dt2).traverse_commits()
    i = 0
    num = 0
    CommitDetails = []
    print(Commits)
    # iterating over all commits
    for commit in Commits:

        print(commit.msg)
        modified_files = []

        # checking...
        # i = i + 1
        # if(i == 4): break
        num = 0
        method_info = ""
        # get all modified files in that particular commit
        for m in commit.modified_files:
            modified_files.append(m)
        method_changes = []

        # iterating over all  modified files
        print(modified_files)
        for file in modified_files:
            if len(file.changed_methods) == 0:
                continue
            # checking...
            num = num + 1
            print("num: ",num)
            iter3 = 0
            # iterating over all modified methods in those modified files
            for method in file.changed_methods:
                iter3 += 1
                # getting whole method before and after
                method_before = next((x for x in file.methods_before if x == method), None)
                print(method_before)
                method_after = next((x for x in file.methods if x == method), None)
                print(method_after)


                #  getting method body before and after
                body_before = _getMethodBody(method_before, file.source_code_before, file)
                body_after = _getMethodBody(method_after, file.source_code, file)

                changes_before = ""
                changes_after = ""

                # getting only the changes of method body if they are changed only
                if body_before == None or body_after == None: pass
                else: changes_before, changes_after =compare_strings(body_before, body_after)


                # getting changes before and after in a single string format
                _changes_before = ""
                _changes_after = ""
                for line_num, line_content in changes_before:
                    _changes_before = _changes_before + "Line" + str(line_num) + ": " + str(line_content) + "\n"
                for line_num, line_content in changes_after:
                    _changes_after = _changes_after + "Line" + str(line_num) + ": " + str(line_content) + "\n"



                print("changes_before: ", _changes_before)
                print("changes_after: ", _changes_after)

                method_changes.append([_changes_before, _changes_after])


                # getting summarization from gemini AI
                prompt = "before changes: " + _changes_before + "after changes: " + _changes_after
                # # response =  GeminiModel.prompt(prompt)

                messages=[{
                    'role': 'user',
                    'content': ''' You will receive two objects. One is the lines of code of a method before changes due to a commit and second one is also some lines of code of a method after changes, your job is to give me a one line summarization of what is altered and the meaning of it based on the before and after the method changes, only one line summarization
                    ''',
                },{
                    'role': 'assistant',
                    'content': 'ok',
                },
                {
                     'role': 'user',
                    'content': prompt,
                }
                ]
                encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

                model_inputs = encodeds.to(device)
                model.to(device)

                generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
                decoded = tokenizer.batch_decode(generated_ids)
                print("summirization: ", decoded[0])
                response = decoded[0]
                temp = response.split("[/INST]")
                response = temp[len(temp)- 1]
                print("summarization: ", temp[len(temp)- 1])
                method_info = method_info + response


                break


                # response = ollama.chat(model='llama2', messages=[{
                #     'role': 'user',
                #     'content': ''' You will receive two objects. One is the lines of code of a method before changes due to a commit and second one is also some lines of code of a method after changes, your job is to give me a one line summarization of what is altered and the meaning of it based on the before and after the method changes, only one line summarization''',
                # },{
                #     'role': 'user',
                #     'content': prompt,
                # }])
                # print("summarization: ", response['message']['content'])
                # method_info = method_info + response['message']['content']


                # method_info = method_info + response

                # print("iter: ", iter3)
                # if iter3 == 2: break
            print("num: ", num)
            # if num == 2: break
            break
        CommitDetails.append({ "index": i, "url" : "", "method_summarization": method_info})
        break
    issue_commit_map.append({"issue_index" : j,"issue_title" : issue["title"], "issue_body" : issue["body"], "issue_html_link": issue["html_url"], "issue_url": issue["url"],"CommitDetails" : CommitDetails })

# print(issue_commit_map)
import json
with open("issues_commit_summarization.json",'w') as f:
    json.dump(issue_commit_map,f)