In [25]:
import pandas as pd
import ollama

# Load the raw injury log.
df = pd.read_csv("injuries_2010-2020.csv")

# Drop every row whose note mentions "activated from il" or "placed on il".
# The phrases are matched case-insensitively (notes are lower-cased first) and
# as plain substrings (regex=False), not as regular expressions.
# na=False makes a missing note count as "no match"; without it the mask can
# contain NaN, which cannot be negated or used as a boolean row filter.
notes_lower = df['Notes'].str.lower()
is_il_note = (
    notes_lower.str.contains("activated from il", regex=False, na=False)
    | notes_lower.str.contains("placed on il", regex=False, na=False)
)
df = df[~is_il_note]

def extract_injury_location(note):
    """Ask the ``llama3`` model (through ``ollama.chat``) which body part a note refers to.

    Args:
        note: The free-text injury note; it is interpolated into the prompt as-is.

    Returns:
        The model's reply with surrounding whitespace stripped. If anything in
        the request or response handling raises, the exception is not
        propagated; a string of the form
        ``"Error extracting injury location: <exception>"`` is returned instead.
    """
    try:
        # Built inside the try so that a failure while formatting the note is
        # reported through the same error-string path as a failed model call.
        prompt = f"Extract just the body part injured in this note: '{note}'. Respond with only the body part, one or two words, and nothing else. Side of the body doesn't matter so if its left ankle it's just ankle"
        user_message = {"role": "user", "content": prompt}
        reply = ollama.chat(model="llama3", messages=[user_message])
        location = reply['message']['content'].strip()
    except Exception as err:
        return f"Error extracting injury location: {err}"
    else:
        return location

print(df.shape)

# Query the model once per distinct note rather than once per row: repeated
# notes no longer trigger repeated LLM calls, and identical notes are
# guaranteed to receive the same label.
location_by_note = {
    note: extract_injury_location(note) for note in df['Notes'].unique()
}

# Replace each note with its extracted body part. Assigning Notes directly
# (on a new frame via .assign) avoids the temporary 'Injury Location' column
# and the in-place drop that the copy-then-drop approach needed.
df = df.assign(Notes=df['Notes'].map(location_by_note))

output_file = "modified_injury_locations.csv"
df.to_csv(output_file, index=False)

print(f"Data has been saved to {output_file}")


(11364, 5)
Data has been saved to modified_injury_locations.csv


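Remove rows that describe no injury ('rest', 'dnp', 'none'), rows where the LLM call failed, and rows whose extracted description is longer than two words.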

In [34]:
import pandas as pd

# Load the LLM-processed notes and remember the starting size for the summary
df = pd.read_csv("modified_injury_locations.csv")
original_count = len(df)

# Coerce Notes to strings and trim surrounding whitespace
df['Notes'] = df['Notes'].astype(str).str.strip()

# Each cleaning step is (description used in the report, function that takes
# the current Notes column and returns a boolean mask of the rows to KEEP).
# Steps run in order, each one on the rows that survived the previous step.
cleaning_steps = [
    # Step 1: notes that are exactly 'rest', 'dnp' or 'none' (any casing)
    ("'rest', 'none', or 'dnp'",
     lambda notes: ~notes.str.lower().isin(['rest', 'dnp', 'none'])),
    # Step 2: notes that start with 'Error'
    ("LLM errors",
     lambda notes: ~notes.str.startswith("Error")),
    # Step 3: notes with more than 2 whitespace-separated words
    ("long descriptions",
     lambda notes: notes.str.split().str.len() <= 2),
]

# Running count of rows dropped across all steps
removed_total = 0
for description, rows_to_keep in cleaning_steps:
    kept = df[rows_to_keep(df['Notes'])]
    dropped = len(df) - len(kept)
    print(f"Removed {dropped} rows with {description}")
    removed_total += dropped
    df = kept

# Renumber rows 0..n-1 now that rows have been dropped
df = df.reset_index(drop=True)

# Persist the cleaned dataset
df.to_csv("injury_locations_cleaned.csv", index=False)

# Final summary
print(f"\nOriginal rows: {original_count}")
print(f"Final rows: {len(df)}")
print(f"Total removed: {removed_total}")


Removed 2038 rows with 'rest', 'none', or 'dnp'
Removed 0 rows with LLM errors
Removed 217 rows with long descriptions

Original rows: 11364
Final rows: 9109
Total removed: 2255


In [70]:
import pandas as pd

# Inspect the categorized dataset: print every distinct 'Injury Category' value.
df = pd.read_csv("injury_locations_categorized.csv")

# Coerce the category column to trimmed strings to avoid type issues below
category_text = df['Injury Category'].astype(str)
df['Injury Category'] = category_text.str.strip()

# Keep only rows whose category is not the text 'nan' (compared case-insensitively)
has_category = df['Injury Category'].str.lower().ne('nan')
df = df[has_category]

# Distinct categories in sorted order, one per line under a count header
unique_injuries = sorted(df['Injury Category'].unique())
print(f"Found {len(unique_injuries)} unique injury locations:\n")
for category_name in unique_injuries:
    print(category_name)


Found 27 unique injury locations:

abdomen
ankle
arm
back
chest
eye
face
finger
foot
glute
groin
hand
hip
internal
knee
leg
lungs
misc
muscle
neck
rib
seasonal/other
shoulder
teeth
throat
toe
wrist


In [88]:
# Re-load the cleaned notes and canonicalise them for dictionary lookup:
# string-typed, trimmed, lower-cased, with runs of whitespace collapsed to a
# single space.
df = pd.read_csv("injury_locations_cleaned.csv")
df['Notes'] = (
    df['Notes']
    .astype(str)
    .str.strip()
    .str.lower()
    .str.replace(r'\s+', ' ', regex=True)
)


# Category -> list of raw terms that belong to it.
# Every term must appear under exactly ONE category. Two terms used to be
# listed twice ("leg ankle" under both "ankle" and "leg", "hip knee" under both
# "knee" and "hip"); the lookup silently used the later category, so only that
# entry is kept here and the resulting categories are unchanged. The
# "ankle " entry (trailing space) was dropped because notes are stripped
# before lookup, so it could never match.
injury_mapping = {
    "ankle": [
        "ankle", "foot ankle", "foot/ankle", "tibia ankle",
        "soreness/ankle", "ankle.", "ankle/knee", "heel", "ankle elbow",
        "ankle shoulder", "back ankle"
    ],
    "knee": [
        "knee", "knees", "knee cap", "knee meniscus", "kneecap", "knee tendon",
        "knee tendons", "knee, ankle", "knee hip", "knee ankle", "meniscus knee",
        "patella", "meniscus", "calf knee", "ankle knee", "knee hamstring",
        "knee shoulder"
    ],
    "foot": [
        "foot", "foot.", "right foot", "arch", "metatarsal", "midfoot"
    ],
    "toe": ["toe", "big toe", "little toe", "toe thumb", "toenail", "thumb toe"],
    "finger": [
        "finger", "index finger", "middle finger", "ring finger", "pinky finger",
        "little finger", "hand/finger", "right hand/finger", "thumb"
    ],
    "hip": ["hip", "hip flexor", "hip pointer", "hip/knee", "pelvis", "tailbone", "symphysis", "si joint", "hip knee"],
    "shoulder": ["shoulder", "neck shoulder", "rotator cuff", "axilla", "collarbone", "labrum"],
    "back": ["back", "lower back", "upper back", "spinal cord", "lat"],
    "wrist": ["wrist", "hand/wrist", "wrist ankle", "wrist knee"],
    "hand": ["hand", "right hand", "hand/thumb"],
    "chest": ["chest", "chest muscle", "pectoral", "pectoral muscle", "pectoralis", "thorax", "sternum",
              "stemum", "chest shoulder", "ribs knee", "rib", "ribs"
    ],
    "abdomen": ["abdomen", "abdominal muscle", "gut", "stomach", "hernia", "oblique", "core"],
    "groin": ["groin", "pubic area", "adductor", "adductor muscle", "abductor", "testicle", "groin hip"],
    "leg": [
        "leg", "right leg", "left leg", "calf", "calf/shin", "shin", "tibia",
        "achilles", "fibula", "thigh", "quad", "quadriceps", "quadricep", "quadricap", "hamstring", "hamstrings",
        "leg ankle", "meniscus calf", "ankle calf", "calf elbow", "foot hamstring", "foot knee", "hamstring back",
        "hamstring knee", "leg tibia"
    ],
    "arm": ["arm", "bicep", "forearm", "tricep", "triceps", "elbow"],
    "face": ["face", "cheek", "cheekbone", "jaw", "mouth", "nose", "facial bone", "head", "brain", "ear", "forehead"],
    "neck": ["neck", "cervical"],
    "eye": ["eye", "cornea", "orbital", "orbital wall", "orbital bone", "orbital floor", "eyelid"],
    "lungs": ["lung", "lungs", "respiratory", "respiratory system"],
    "throat": ["throat", "tonsils", "sinus", "sinuses"],
    "teeth": ["tooth", "teeth", "wisdom tooth"],
    "glute": ["glute", "gluteus"],
    "muscle": ["fascia", "muscle"],
    "internal": ["intestine", "intestines", "append", "appendix", "heart", "heath"],
    "seasonal/other": ["season", "health", "nothing", "no injury", "root", "dtd", "blood", "skin", "nan", "other"]
}


# Invert the mapping into term -> category for O(1) lookup. Keys are cleaned
# the same way normalize() cleans a note (lower-cased, stripped). A term listed
# under two different categories is reported instead of being resolved by
# dictionary order, so the ambiguity cannot creep back in unnoticed.
flat_map = {}
_conflicting_terms = []
for category, terms in injury_mapping.items():
    for term in terms:
        key = term.lower().strip()
        first_category = flat_map.setdefault(key, category)
        if first_category != category:
            _conflicting_terms.append((key, first_category, category))
if _conflicting_terms:
    raise ValueError(
        f"Injury terms listed under more than one category: {_conflicting_terms}"
    )


def normalize(note):
    """Return the injury category for a raw note, or "misc" if the term is unknown.

    Args:
        note: The extracted injury term. Non-string values are converted with
            ``str()`` first, so a missing value (NaN) is looked up as "nan".

    Returns:
        The category whose term list contains the lower-cased, stripped note;
        "misc" when no category lists it.
    """
    note_clean = str(note).lower().strip()
    return flat_map.get(note_clean, "misc")

# Label every note with its category; normalize() falls back to "misc" for
# terms that are not in the lookup table.
df['Injury Category'] = df['Notes'].apply(normalize)

# Distinct notes that ended up in the "misc" fallback category
is_misc = df['Injury Category'].eq("misc")
uncategorized = df.loc[is_misc, 'Notes'].unique()

# Report the fallback terms in sorted order, one per line
print(f"\n⚠️ Found {len(uncategorized)} uncategorized injury terms:\n")
for unmapped_term in sorted(uncategorized):
    print(unmapped_term)

# Persist the categorized dataset, then report how many categories it contains
df.to_csv("injury_locations_categorized.csv", index=False)
print(f"\n✅ Final categorized dataset saved to injury_locations_categorized.csv with {df['Injury Category'].nunique()} categories.")



⚠️ Found 10 uncategorized injury terms:

forehead
groin hip
hamstring back
hamstring knee
knee shoulder
leg tibia
ribs knee
thumb toe
wrist ankle
wrist knee

✅ Final categorized dataset saved to injury_locations_categorized.csv with 27 categories.
