### Import Libraries

In [13]:
import json
from pathlib import Path

import pandas as pd

### Load Data from dataset

In [14]:
# Read the raw dataset. The encoding is pinned to UTF-8 because open() otherwise
# falls back to the platform locale, which can mis-decode or reject non-ASCII text.
with open('usecase-testcase-dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

### Extract only the usecase part of the dataset

In [15]:
# Keep only the value stored under 'usecase' for each record, preserving order.
usecases = list(record['usecase'] for record in data)

### Convert to Dataframe

In [16]:
# Build the working frame from the extracted use cases.
df = pd.DataFrame(data=usecases)

### Handling anomalies in the dataset.
Merging the following fields
* precondition and preconditions into preconditions
* actors, Actors & actor into actors
* title, name into name
* scenerio, scenario, description into scenario

In [17]:
def merge_duplicate_column(frame, source, target):
    """Fold column `source` into column `target` and drop `source`.

    Rows where both columns hold a value are ambiguous and are removed.
    In the remaining rows, a value present only in `source` is copied into
    `target`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to clean; it is not modified.
    source : str
        Name of the misspelled/alternative column to eliminate.
    target : str
        Name of the canonical column that receives the values.

    Returns
    -------
    pandas.DataFrame
        A frame without `source`. If `source` is absent, `frame` is returned
        unchanged; if only `source` exists, it is renamed to `target`.
    """
    if source not in frame.columns:
        # Nothing to merge; tolerate absent columns instead of raising KeyError.
        return frame
    if target not in frame.columns:
        # Only the alternative spelling exists, so it simply becomes the target.
        return frame.rename(columns={source: target})

    both_present = frame[source].notna() & frame[target].notna()

    # .copy() gives an independent frame, so the .loc assignment below does not
    # write into a slice of the input (avoids SettingWithCopyWarning).
    merged = frame.loc[~both_present].copy()

    needs_fill = merged[source].notna() & merged[target].isna()
    merged.loc[needs_fill, target] = merged.loc[needs_fill, source]

    return merged.drop(columns=[source])


# (source, target) pairs, applied in order. Order matters: 'scenerio' is merged
# into 'scenario' before 'description' is, and 'actor' before 'Actors'.
COLUMN_MERGES = [
    ('precondition', 'preconditions'),  # 1. 'precondition' vs 'preconditions'
    ('title', 'name'),                  # 2. 'title' vs 'name'
    ('scenerio', 'scenario'),           # 3. 'scenerio' vs 'scenario'
    ('description', 'scenario'),        # 4. 'description' vs 'scenario'
    ('actor', 'actors'),                # 5. 'actor' vs 'actors'
    ('Actors', 'actors'),               # 6. 'Actors' vs 'actors'
]

for source_column, target_column in COLUMN_MERGES:
    df = merge_duplicate_column(df, source_column, target_column)

### Drop some unwanted fields: 'author', 'id', 'author_student_id', 'postconditions'
Only 9 rows have postconditions, so dropping this field has a negligible impact.

In [18]:
# Columns to discard; errors='ignore' skips any name that is not in the frame.
unwanted_columns = ['author', 'id', 'author_student_id', 'postconditions']
df = df.drop(columns=unwanted_columns, errors='ignore')

### Clean and format steps
`steps` is a list. Convert it into a single string in which each step is prefixed with its 1-based number.

In [19]:
def format_steps(steps):
    """Render a list of steps as one space-separated string of numbered entries.

    Each element becomes "<n>: <step>", with n starting at 1. Anything that is
    not a list (including missing values) yields an empty string.
    """
    if not isinstance(steps, list):
        return ""
    numbered = (f"{position}: {step}" for position, step in enumerate(steps, start=1))
    return " ".join(numbered)

# Replace each row's 'steps' value with its numbered single-string form.
steps_text = df['steps'].apply(format_steps)
df['steps'] = steps_text

In [20]:
# # -----------------------
# # 9. (Currently disabled) Normalize 'actors' field to always be a comma-separated string
# def normalize_actors(actors):
#     if isinstance(actors, list):
#         return ", ".join(str(actor).strip() for actor in actors)
#     elif isinstance(actors, str):
#         return actors.strip()
#     return ""

# df['actors'] = df['actors'].apply(normalize_actors)

### Finally Save the cleaned data

In [21]:
# Create the output directory first: to_json/to_csv do not create missing
# parent directories and would fail on a fresh checkout.
output_dir = Path('usecase2brd-dataset')
output_dir.mkdir(parents=True, exist_ok=True)

# Write the same cleaned frame twice: JSON as a list of records, CSV without the index.
df.to_json(output_dir / 'cleaned_usecases.json', orient='records', indent=2)
df.to_csv(output_dir / 'cleaned_usecases.csv', index=False)