In [1]:
import logging
import sys
from pprint import pprint

from datasets import load_dataset

# Add the parent directory to the import path so the project-local packages
# imported below can be resolved (presumably the repository root; confirm).
sys.path.append('..')
from get_repo_structure.get_repo_structure import get_project_structure_from_scratch
from agentless.util.preprocess_data import filter_none_python, filter_out_test_files
from agentless.fl.FL import LLMFL

In [2]:
# Load the `test` split of the SWE-bench Lite dataset.
swe_bench_data = load_dataset("princeton-nlp/SWE-bench_Lite", split="test")

In [3]:
# Use the first benchmark instance as the running example for the rest of
# the notebook; the bare expression below displays its fields.
bench_data = next(iter(swe_bench_data))
bench_data

{'repo': 'astropy/astropy',
 'instance_id': 'astropy__astropy-12907',
 'base_commit': 'd16bfe05a744909de4b27f5875fe0d4ed41ce607',
 'patch': "diff --git a/astropy/modeling/separable.py b/astropy/modeling/separable.py\n--- a/astropy/modeling/separable.py\n+++ b/astropy/modeling/separable.py\n@@ -242,7 +242,7 @@ def _cstack(left, right):\n         cright = _coord_matrix(right, 'right', noutp)\n     else:\n         cright = np.zeros((noutp, right.shape[1]))\n-        cright[-right.shape[0]:, -right.shape[1]:] = 1\n+        cright[-right.shape[0]:, -right.shape[1]:] = right\n \n     return np.hstack([cleft, cright])\n \n",
 'test_patch': "diff --git a/astropy/modeling/tests/test_separable.py b/astropy/modeling/tests/test_separable.py\n--- a/astropy/modeling/tests/test_separable.py\n+++ b/astropy/modeling/tests/test_separable.py\n@@ -28,6 +28,13 @@\n p1 = models.Polynomial1D(1, name='p1')\n \n \n+cm_4d_expected = (np.array([False, False, True, True]),\n+                  np.array([[True,  Tr

In [4]:
# Clone the instance's repository into the `playground` directory, check out
# its base commit, and build a dictionary describing the project; the
# 'structure' and 'instance_id' entries are read in later cells.
d = get_project_structure_from_scratch(
    repo_name=bench_data["repo"],
    commit_id=bench_data["base_commit"],
    instance_id=bench_data["instance_id"],
    repo_playground="playground",
)

Cloning repository from https://github.com/astropy/astropy.git to playground/31f58a3f-332d-44cf-946d-645790494f0f/astropy...


Cloning into 'playground/31f58a3f-332d-44cf-946d-645790494f0f/astropy'...


Repository cloned successfully.
Checking out commit d16bfe05a744909de4b27f5875fe0d4ed41ce607 in repository at playground/31f58a3f-332d-44cf-946d-645790494f0f/astropy...


Note: switching to 'd16bfe05a744909de4b27f5875fe0d4ed41ce607'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at d16bfe05a7 Merge pull request #12900 from Cadair/custom_compound_model


Commit checked out successfully.


In [5]:
# The issue card: the text of the GitHub issue that the pipeline will localize.
problem_statement = bench_data["problem_statement"]

# Show the type first, then the issue text itself.
print(f"Problem statement is a: {type(problem_statement)}\n")
print(problem_statement)

Problem statement is a: <class 'str'>

Modeling's `separability_matrix` does not compute separability correctly for nested CompoundModels
Consider the following model:

```python
from astropy.modeling import models as m
from astropy.modeling.separable import separability_matrix

cm = m.Linear1D(10) & m.Linear1D(5)
```

It's separability matrix as you might expect is a diagonal:

```python
>>> separability_matrix(cm)
array([[ True, False],
       [False,  True]])
```

If I make the model more complex:
```python
>>> separability_matrix(m.Pix2Sky_TAN() & m.Linear1D(10) & m.Linear1D(5))
array([[ True,  True, False, False],
       [ True,  True, False, False],
       [False, False,  True, False],
       [False, False, False,  True]])
```

The output matrix is again, as expected, the outputs and inputs to the linear models are separable and independent of each other.

If however, I nest these compound models:
```python
>>> separability_matrix(m.Pix2Sky_TAN() & cm)
array([[ True,  True, False

In [6]:
# The files in the repo, in dict format.
structure = d["structure"]

# Filter out non-python files and empty files. The return value is unused,
# so this is expected to modify `structure` in place.
filter_none_python(structure)

# Filter out test files unless we are debugging pytest itself.
is_pytest_instance = d["instance_id"].startswith("pytest")
if not is_pytest_instance:
    filter_out_test_files(structure)

structure.keys()

dict_keys(['astropy', '.pyinstaller', 'docs', 'examples'])

In [7]:
# Simple logger to pass to LLMFL: the root logger, with no extra configuration.
# `logging` is imported in the notebook's first cell together with the other
# imports, so this cell no longer carries its own import statement.
logger = logging.getLogger()

# LLMFL stands for Large Language Model Fault Localization.
# It is given the instance id, the filtered repository structure and the issue
# text; the same object is reused for every localization step below.
fl = LLMFL(
    d["instance_id"],
    structure,
    problem_statement,
    model_name="gpt-4o-2024-05-13",
    backend="openai",
    match_partial_paths=False,
    logger=logger,
)

## Localize to suspicious files

In [8]:
# Step 1: ask the model which files would need to be edited to fix the issue.
# Returns the parsed file list, the raw model output, and the prompt/response
# trajectory (each is displayed in the following cells).
found_files, additional_loc_file, file_traj = fl.localize()

prompting with message:
Please look through the following GitHub problem description and Repository structure and provide a list of files that one would need to edit to fix the problem.

### GitHub Problem Description ###
Modeling's `separability_matrix` does not compute separability correctly for nested CompoundModels
Consider the following model:

```python
from astropy.modeling import models as m
from astropy.modeling.separable import separability_matrix

cm = m.Linear1D(10) & m.Linear1D(5)
```

It's separability matrix as you might expect is a diagonal:

```python
>>> separability_matrix(cm)
array([[ True, False],
       [False,  True]])
```

If I make the model more complex:
```python
>>> separability_matrix(m.Pix2Sky_TAN() & m.Linear1D(10) & m.Linear1D(5))
array([[ True,  True, False, False],
       [ True,  True, False, False],
       [False, False,  True, False],
       [False, False, False,  True]])
```

The output matrix is again, as expected, the outputs and inputs to the li

In [9]:
pprint(file_traj) # The trajectory of the LLM (dictionary of prompt, response, and token usage)

{'prompt': 'Please look through the following GitHub problem description and '
           'Repository structure and provide a list of files that one would '
           'need to edit to fix the problem.\n'
           '\n'
           '### GitHub Problem Description ###\n'
           "Modeling's `separability_matrix` does not compute separability "
           'correctly for nested CompoundModels\n'
           'Consider the following model:\r\n'
           '\r\n'
           '```python\r\n'
           'from astropy.modeling import models as m\r\n'
           'from astropy.modeling.separable import separability_matrix\r\n'
           '\r\n'
           'cm = m.Linear1D(10) & m.Linear1D(5)\r\n'
           '```\r\n'
           '\r\n'
           "It's separability matrix as you might expect is a diagonal:\r\n"
           '\r\n'
           '```python\r\n'
           '>>> separability_matrix(cm)\r\n'
           'array([[ True, False],\r\n'
           '       [False,  True]])\r\n'
           '```\r

In [10]:
pprint(additional_loc_file) # Raw, unparsed model output for the file-level step (a fenced list of paths)

{'raw_output_files': '```\n'
                     'astropy/modeling/separable.py\n'
                     'astropy/modeling/core.py\n'
                     'astropy/modeling/models.py\n'
                     'astropy/modeling/utils.py\n'
                     'astropy/modeling/tests/test_separable.py\n'
                     '```'}


In [11]:
pprint(found_files) # Parsed list of suspicious files; the test file named in the raw output is not included

['astropy/modeling/separable.py',
 'astropy/modeling/core.py',
 'astropy/modeling/models.py',
 'astropy/modeling/utils.py']


## Localize to related elements

In [12]:
# Keep only the first `top_n` suspicious files for the next localization step.
top_n = 3
pred_files = found_files[:top_n]
pprint(pred_files)

['astropy/modeling/separable.py',
 'astropy/modeling/core.py',
 'astropy/modeling/models.py']


In [13]:
# Step 2: within the selected files, ask the model for the classes, functions
# and methods related to the issue.
(
    found_related_locs,
    additional_artifact_loc_related,
    related_loc_traj,
) = fl.localize_function_from_compressed_files(pred_files)

Initializing a decoder model: gpt-4o-2024-05-13 ...
```
astropy/modeling/separable.py
function: separability_matrix
function: _coord_matrix
function: _cstack

astropy/modeling/core.py
class: CompoundModel
function: CompoundModel._make_leaflist
function: CompoundModel.traverse_postorder
```


In [14]:
pprint(related_loc_traj) # The trajectory of the LLM (dictionary of prompt, response, and token usage)

{'prompt': '\n'
           'Please look through the following GitHub Problem Description and '
           'the Skeleton of Relevant Files.\n'
           'Identify all locations that need inspection or editing to fix the '
           'problem, including directly related areas as well as any '
           'potentially related global variables, functions, and classes.\n'
           'For each location you provide, either give the name of the class, '
           'the name of a method in a class, the name of a function, or the '
           'name of a global variable.\n'
           '\n'
           '### GitHub Problem Description ###\n'
           "Modeling's `separability_matrix` does not compute separability "
           'correctly for nested CompoundModels\n'
           'Consider the following model:\r\n'
           '\r\n'
           '```python\r\n'
           'from astropy.modeling import models as m\r\n'
           'from astropy.modeling.separable import separability_matrix\r\n'
          

In [15]:
pprint(additional_artifact_loc_related) # Raw, unparsed model output for the element-level step

{'raw_output_loc': '```\n'
                   'astropy/modeling/separable.py\n'
                   'function: separability_matrix\n'
                   'function: _coord_matrix\n'
                   'function: _cstack\n'
                   '\n'
                   'astropy/modeling/core.py\n'
                   'class: CompoundModel\n'
                   'function: CompoundModel._make_leaflist\n'
                   'function: CompoundModel.traverse_postorder\n'
                   '```'}


In [16]:
pprint(found_related_locs) # Suspicious elements, one entry per file in `pred_files` (same order); [''] means none were reported

[['function: separability_matrix\nfunction: _coord_matrix\nfunction: _cstack'],
 ['class: CompoundModel\n'
  'function: CompoundModel._make_leaflist\n'
  'function: CompoundModel.traverse_postorder'],
 ['']]


## Localize to edit locations

In [17]:
# Map each predicted file to the related locations found for it. The two lists
# are aligned by position, and zip() stops at the shorter one, so a file with
# no corresponding entry is simply left out of the mapping.
coarse_found_locs = dict(zip(pred_files, found_related_locs))
pprint(coarse_found_locs)

{'astropy/modeling/core.py': ['class: CompoundModel\n'
                              'function: CompoundModel._make_leaflist\n'
                              'function: CompoundModel.traverse_postorder'],
 'astropy/modeling/models.py': [''],
 'astropy/modeling/separable.py': ['function: separability_matrix\n'
                                   'function: _coord_matrix\n'
                                   'function: _cstack']}


In [18]:
# Passed through as `context_window` below; presumably the number of
# surrounding lines shown to the model around each location (confirm in LLMFL).
context_window = 10

# Step 3: narrow the coarse class/function locations down to edit locations
# (functions and line numbers). A temperature of 0.0 with a single sample is
# presumably chosen to keep the result as repeatable as possible.
(
    found_edit_locs,
    additional_artifact_loc_edit_location,
    edit_loc_traj,
) = fl.localize_line_from_coarse_function_locs(
    pred_files,
    coarse_found_locs,
    context_window=context_window,
    add_space=False,
    no_line_number=False,
    sticky_scroll=False,
    temperature=0.0,
    num_samples=1,
)

Initializing a decoder model: gpt-4o-2024-05-13 ...
```
astropy/modeling/separable.py
function: _coord_matrix
line: 202
line: 204
line: 206
line: 208
line: 210
line: 213
line: 215

astropy/modeling/separable.py
function: _cstack
line: 245
```


In [19]:
pprint(edit_loc_traj) # The trajectory of the LLM (dictionary of prompt, response, and token usage)

{'prompt': '\n'
           'Please review the following GitHub problem description and '
           'relevant files, and provide a set of locations that need to be '
           'edited to fix the issue.\n'
           'The locations can be specified as class names, function or method '
           'names, or exact line numbers that require modification.\n'
           '\n'
           '### GitHub Problem Description ###\n'
           "Modeling's `separability_matrix` does not compute separability "
           'correctly for nested CompoundModels\n'
           'Consider the following model:\r\n'
           '\r\n'
           '```python\r\n'
           'from astropy.modeling import models as m\r\n'
           'from astropy.modeling.separable import separability_matrix\r\n'
           '\r\n'
           'cm = m.Linear1D(10) & m.Linear1D(5)\r\n'
           '```\r\n'
           '\r\n'
           "It's separability matrix as you might expect is a diagonal:\r\n"
           '\r\n'
           '```pyt

In [20]:
pprint(additional_artifact_loc_edit_location) # Raw, unparsed model output for the line-level step (one string per sample)

{'raw_output_loc': ['```\n'
                    'astropy/modeling/separable.py\n'
                    'function: _coord_matrix\n'
                    'line: 202\n'
                    'line: 204\n'
                    'line: 206\n'
                    'line: 208\n'
                    'line: 210\n'
                    'line: 213\n'
                    'line: 215\n'
                    '\n'
                    'astropy/modeling/separable.py\n'
                    'function: _cstack\n'
                    'line: 245\n'
                    '```']}


In [21]:
pprint(found_edit_locs) # Suspicious functions/lines; appears to hold one entry per file in `pred_files`, with [''] where nothing was flagged

[['function: _coord_matrix\n'
  'line: 202\n'
  'line: 204\n'
  'line: 206\n'
  'line: 208\n'
  'line: 210\n'
  'line: 213\n'
  'line: 215\n'
  'function: _cstack\n'
  'line: 245'],
 [''],
 ['']]
