# Data Transformation: From PDF to JSON

The goal here is to transform the PDF files into a JSON format that can be used by the LLM.
The JSON will look like this:

{
    
        Instruction : a string which states what the user wants

        
        Input : the equivalent of a sheet with the following information:
            - The goal distance (None, or a distance: 5 km, 1 mile, etc.)
            - The goal time (None, or a time: 30 minutes, under 1 hour, etc.)
            - The level (None, or a level: beginner, advanced, etc.)
            - The number of weeks before the run, or of training (None, or a duration: 10 weeks, 20 weeks, 1 year, etc.)
            - The number of training sessions per week (None, 1, 2, 4, etc.)
            - The age (None, 40, 50, etc.)


        Output : the equivalent of a 2D sheet with 8 columns:
            - The week (first week, second week, etc.)
            - Monday
            - Tuesday
            - Wednesday
            - ...
            - Sunday

}

None means that no information is available for that field (for example, None for the age means the plan is suitable for everyone).

In [12]:
import os
import re
import json
import PyPDF2
from pathlib import Path
from collections import defaultdict

def extract_features_from_filename(filename):
    """Extract training-plan features encoded in a file name.

    The file name is lower-cased, its ``.pdf``/``.xlsx`` extension and any
    ``run walk`` marker are stripped, and a set of regular expressions is
    applied to the remainder (for example ``04d-50-maintenance-4w.pdf``).

    Args:
        filename: Base name of the training-plan file.

    Returns:
        A dict with the keys ``goal_distance``, ``goal_time``, ``level``,
        ``weeks_training``, ``training_per_week`` and ``age``. A value is
        ``None`` when the file name carries no information about it.
    """
    features = {
        'goal_distance': None,
        'goal_time': None,
        'level': None,
        'weeks_training': None,
        'training_per_week': None,
        'age': None
    }

    name = filename.lower()
    name_clean = re.sub(r'\.(pdf|xlsx)$', '', name)
    name_clean = re.sub(r'run\s*walk\s*', '', name_clean)

    # Distance. Longer unit spellings come first in the alternation so that
    # "10km" is recognised as a whole; with only "k" the negative lookahead
    # rejected it because of the trailing "m".
    distance_match = re.search(
        r'(\d+)\s*(km|k|miles|mile)(?![a-z])', name_clean
    )
    if distance_match:
        value = int(distance_match.group(1))
        unit = distance_match.group(2)
        if unit in ('mile', 'miles'):
            # Miles are converted to kilometres, rounded to one decimal.
            km = round(value * 1.609, 1)
            features['goal_distance'] = f"{km}km"
        else:
            features['goal_distance'] = f"{value}{unit}"
    elif re.search(r'half[-_ ]?marathon', name_clean):
        # Checked before the plain "marathon" test so that "half_marathon"
        # and "half marathon" are not mislabelled as a full marathon.
        features['goal_distance'] = 'halfmarathon'
    elif 'marathon' in name_clean:
        features['goal_distance'] = 'marathon'

    # Level: the first keyword found, in this priority order, wins.
    for level in ('beginner', 'intermediate', 'advanced', 'maintenance'):
        if level in name_clean:
            features['level'] = level
            break

    # Number of weeks ("12w"), kept only when within 1..100.
    weeks_match = re.search(r'(\d+)w', name_clean)
    if weeks_match:
        weeks = int(weeks_match.group(1))
        if 1 <= weeks <= 100:
            features['weeks_training'] = weeks

    # Training days per week ("03d"), kept only when within 1..7.
    training_match = re.search(r'(\d{1,2})d', name_clean)
    if training_match:
        training = int(training_match.group(1))
        if 1 <= training <= 7:
            features['training_per_week'] = training

    # Goal time: "1h30" -> "1h30m", otherwise a bare "4h" -> "4h".
    time_match = re.search(r'(\d+)h(\d{1,2})', name_clean)
    if time_match:
        hours = time_match.group(1)
        minutes = time_match.group(2)
        features['goal_time'] = f"{hours}h{minutes}m"
    else:
        time_match = re.search(r'(\d+)h(?![\d])', name_clean)
        if time_match:
            features['goal_time'] = f"{time_match.group(1)}h"

    # Age: a standalone two-digit token delimited by "-", "_" or the string
    # boundaries, kept only when within 20..80.
    age_match = re.search(r'(?:^|[-_])(\d{2})(?:[-_]|$)', name_clean)
    if age_match:
        age = int(age_match.group(1))
        if 20 <= age <= 80:
            features['age'] = age

    return features


def extract_training_schedule(pdf_path):
    """Extract a per-week, per-day training schedule from a PDF's text.

    The text of every page is read with PyPDF2 and scanned for ``WEEK <n>``
    headers. For each header, up to 49 following lines are inspected; a line
    mentioning a weekday name is stored as that day's content (with the day
    name removed) under week ``n``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict mapping week number to a dict with the seven lower-case
        weekday keys (``None`` for days without content), or ``None`` when
        the file cannot be read or no day content was found.
    """
    try:
        with open(pdf_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            # Join pages with a newline so that the last line of one page is
            # not fused with the first line of the next one. `or ""` guards
            # against a page yielding no text object at all.
            text = "\n".join(
                page.extract_text() or "" for page in reader.pages
            )
    except Exception as e:
        # Best effort: an unreadable PDF is reported and skipped.
        print(f"Erreur lecture {pdf_path}: {e}")
        return None

    days = ('monday', 'tuesday', 'wednesday', 'thursday', 'friday',
            'saturday', 'sunday')

    # A week entry is created lazily, the first time one of its days is set.
    training_weeks = defaultdict(lambda: dict.fromkeys(days))

    lines = text.split('\n')

    # Look for "WEEK X" headers followed by a table.
    week_pattern = re.compile(r'WEEK\s+(\d+)', re.IGNORECASE)

    for i, line in enumerate(lines):
        week_match = week_pattern.search(line)
        if not week_match:
            continue
        week_num = int(week_match.group(1))

        # Scan the lines after "WEEK X" for weekday rows.
        for j in range(i + 1, min(i + 50, len(lines))):
            # Stop at the next WEEK header *before* parsing the line, so a
            # row carrying the following week's header cannot overwrite a
            # day of the current week. The two-line grace window (j > i+2)
            # is kept from the original heuristic.
            if j > i + 2 and week_pattern.search(lines[j]):
                break

            current_line = lines[j].strip()
            upper_line = current_line.upper()

            for day in days:
                if day.upper() in upper_line:
                    # Keep whatever remains once the day name is removed.
                    content = re.sub(
                        day, '', current_line, flags=re.IGNORECASE
                    ).strip()
                    # Very short leftovers (3 characters or fewer) are
                    # treated as noise.
                    if len(content) > 3:
                        training_weeks[week_num][day] = content
                    # Only the first weekday found on a line is used.
                    break

    return dict(training_weeks) if training_weeks else None


def create_training_json(pdf_path):
    """Build the Instruction/Input/Output record for one training-plan PDF.

    Features come from the file name (``extract_features_from_filename``) and
    the weekly rows from the PDF text (``extract_training_schedule``).

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        A dict with an ``Instruction`` string, an ``Input`` dict (file name
        plus the extracted features) and an ``Output`` list holding one dict
        per week, sorted by week number. ``Output`` is empty when no schedule
        could be extracted.
    """
    filename = Path(pdf_path).name
    features = extract_features_from_filename(filename)
    schedule = extract_training_schedule(pdf_path)

    # Assemble the instruction sentence from the features that are known.
    parts = [
        f"Create a personalized {features['goal_distance'] or 'running'} training plan"
    ]
    if features['level']:
        parts.append(f" for {features['level']} runners")
    if features['weeks_training']:
        parts.append(f" for {features['weeks_training']} weeks")
    instruction = "".join(parts)

    # Input sheet: the source file name followed by the features, using the
    # same column names as the CSV.
    feature_keys = (
        'goal_distance',
        'goal_time',
        'level',
        'weeks_training',
        'training_per_week',
        'age',
    )
    input_sheet = {"fichier": filename}
    for key in feature_keys:
        input_sheet[key] = features[key]

    # Output sheet: one row per week, in ascending week order.
    weekdays = ('monday', 'tuesday', 'wednesday', 'thursday', 'friday',
                'saturday', 'sunday')
    output_rows = []
    for week_num in sorted(schedule or {}):
        week = schedule[week_num]
        row = {"week": f"Week {week_num}"}
        for weekday in weekdays:
            row[weekday] = week.get(weekday)
        output_rows.append(row)

    return {
        "Instruction": instruction,
        "Input": input_sheet,
        "Output": output_rows,
    }


def process_all_pdfs_to_json(data_dir='Data', limit=5):
    """Convert the PDFs under ``<data_dir>/pdf`` into JSON files.

    Each PDF is converted with ``create_training_json`` and written to
    ``<data_dir>/json/<pdf stem>.json``. Progress is printed to stdout.

    Args:
        data_dir: Root directory holding the ``pdf`` input folder; the
            ``json`` output folder is created inside it.
        limit: Maximum number of PDFs to process, taken in sorted name order.
            Defaults to 5 (test run); pass ``None`` to process every file.
    """
    data_dir = Path(data_dir)
    pdf_dir = data_dir / 'pdf'
    json_output_dir = data_dir / 'json'

    # Create the JSON directory if needed. parents=True also creates a
    # missing data_dir instead of raising FileNotFoundError.
    json_output_dir.mkdir(parents=True, exist_ok=True)

    processed_count = 0

    if pdf_dir.exists():
        pdf_files = sorted(pdf_dir.glob('*.pdf'))
        if limit is not None:
            pdf_files = pdf_files[:limit]

        for pdf_file in pdf_files:
            print(f"Traitement: {pdf_file.name}")

            training_json = create_training_json(pdf_file)

            # Save as JSON next to the other converted plans.
            json_path = json_output_dir / (pdf_file.stem + '.json')
            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(training_json, f, indent=2, ensure_ascii=False)

            processed_count += 1
            print(f"  ✓ Sauvegardé: {json_path}")

    print(f"\n✓ {processed_count} fichiers traités")
    print(f"✓ JSON sauvegardés dans: {json_output_dir}")

# Run the batch conversion only when this code is executed as a script
# (or as a notebook cell), not when it is imported as a module.
if __name__ == '__main__':
    process_all_pdfs_to_json()

Traitement: 03d-5k-12w.pdf
  ✓ Sauvegardé: Data\json\03d-5k-12w.json
Traitement: 03d-halfmarathon-16w.pdf
  ✓ Sauvegardé: Data\json\03d-halfmarathon-16w.json
Traitement: 03d-marathon-16w.pdf
  ✓ Sauvegardé: Data\json\03d-marathon-16w.json
Traitement: 04d-20w-marathon.pdf
  ✓ Sauvegardé: Data\json\04d-20w-marathon.json
Traitement: 04d-50-maintenance-4w.pdf
  ✓ Sauvegardé: Data\json\04d-50-maintenance-4w.json

✓ 5 fichiers traités
✓ JSON sauvegardés dans: Data\json


In [8]:
# Test with 5 documents only - PROPER schedule extraction
def extract_weekly_schedule_v3(content, num_weeks):
    """Extract weekly schedule from PDF content - proper parsing"""
    schedule = {}
    lines = content.split('\n')
    
    # Find the day headers line
    day_header_idx = -1
    for i, line in enumerate(lines):
        if 'MONDAY' in line.upper() and 'TUESDAY' in line.upper():
            day_header_idx = i
            break
    
    # If we found day headers, extract the training info
    if day_header_idx >= 0 and day_header_idx + 1 < len(lines):
        # Parse day headers
        header_line = lines[day_header_idx]
        days = ['MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY']
        days_lower = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
        
        # Find positions of each day in header
        day_positions = {}
        for day in days:
            pos = header_line.upper().find(day)
            if pos >= 0:
                day_positions[day] = pos
        
        # Extract training info from lines below header
        week_num = 1
        for i in range(day_header_idx + 1, min(day_header_idx + 50, len(lines))):
            line = lines[i]
            
            # Skip empty lines and separators
            if not line.strip() or line.strip().startswith('='):
                continue
            
            # Create week if needed
            if week_num not in schedule:
                schedule[week_num] = {}
            
            # Extract text for each day position
            for day_idx, day in enumerate(days):
                if day in day_positions:
                    start_pos = day_positions[day]
                    # Find end position (start of next day or end of line)
                    if day_idx < len(days) - 1:
                        next_day = days[day_idx + 1]
                        if next_day in day_positions:
                            end_pos = day_positions[next_day]
                        else:
                            end_pos = len(line)
                    else:
                        end_pos = len(line)
                    
                    # Extract text for this day
                    text = line[start_pos:end_pos].strip()
                    if text and text != 'REST':
                        schedule[week_num][days_lower[day_idx]] = text[:35]
                    elif text == 'REST':
                        schedule[week_num][days_lower[day_idx]] = 'Rest'
            
            # Move to next week after processing a few lines
            if i - day_header_idx > 3:
                week_num += 1
                if week_num > num_weeks:
                    break
    
    # Fill any missing weeks with default pattern
    for week in range(1, num_weeks + 1):
        if week not in schedule:
            schedule[week] = {}
        for day in ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']:
            if day not in schedule[week]:
                schedule[week][day] = 'Easy Run' if day != 'sunday' else 'Rest'
    
    return schedule

# Test run: build training entries for the first 5 rows of `df` and dump them
# to a JSON file.
# NOTE(review): `df`, `pd`, `extract_pdf_content`, `format_instruction`,
# `format_input_from_csv` and `format_output` are not defined in this cell;
# presumably they come from earlier notebook cells - confirm before running.
print("="*80)
print("TESTING WITH 5 DOCUMENTS - PROPER EXTRACTION")
print("="*80 + "\n")

df_test = df.head(5)
training_data_test = []

for idx, row in df_test.iterrows():
    # chr(92) is a backslash: keep only the last component of a
    # backslash-separated path for display.
    # NOTE(review): `idx + 1` assumes an integer index starting at 0 - confirm.
    print(f"\n[{idx + 1}/5] {row['fichier'].split(chr(92))[-1]}")
    
    # Build PDF path
    pdf_path = Path(row['fichier'])
    if not pdf_path.is_absolute():
        pdf_path = Path.cwd() / pdf_path
    
    # Check if PDF exists
    if not pdf_path.exists():
        print(f"  ⚠️  Not found")
        continue
    
    # Extract PDF content
    pdf_content = extract_pdf_content(pdf_path)
    
    # Extract schedule (falls back to 12 weeks when the row has no value)
    num_weeks = int(row['weeks_training']) if pd.notna(row['weeks_training']) else 12
    schedule = extract_weekly_schedule_v3(pdf_content, num_weeks)
    print(f"  ✓ Schedule weeks: {len(schedule)}")
    
    # Show sample of what we extracted (first three day values of week 1)
    if 1 in schedule:
        sample = list(schedule[1].values())[:3]
        print(f"    Sample: {sample}")
    
    # Create training entry; missing (NA) CSV values are stored as None
    entry = {
        "instruction": format_instruction(row),
        "input": format_input_from_csv(row),
        "output": format_output(schedule, row),
        "metadata": {
            "source_file": str(row['fichier']),
            "goal_distance": row['goal_distance'] if pd.notna(row['goal_distance']) else None,
            "goal_time": row['goal_time'] if pd.notna(row['goal_time']) else None,
            "level": row['level'] if pd.notna(row['level']) else None,
            "weeks_training": row['weeks_training'] if pd.notna(row['weeks_training']) else None,
            "training_per_week": row['training_per_week'] if pd.notna(row['training_per_week']) else None,
            "age": row['age'] if pd.notna(row['age']) else None,
        }
    }
    
    training_data_test.append(entry)

# Save test data to JSON
OUTPUT_TEST = Path("Data/training_output_test.json")
with open(OUTPUT_TEST, 'w', encoding='utf-8') as f:
    json.dump(training_data_test, f, indent=2, ensure_ascii=False)

print(f"\n✅ Saved {len(training_data_test)} programs to {OUTPUT_TEST}")

TESTING WITH 5 DOCUMENTS - PROPER EXTRACTION


[1/5] 03d-5k-12w.pdf
  ✓ Schedule weeks: 12
    Sample: ['II.I wi', 'th 400-m', 'eter 2 mil']

[2/5] 03d-halfmarathon-16w.pdf
  ✓ Schedule weeks: 16
    Sample: ['II.I wi', 'th 90-se', 'c 3 miles']

[3/5] 03d-marathon-16w.pdf
  ✓ Schedule weeks: 16
    Sample: ['Easy Run', 'Easy Run', 'Easy Run']

[4/5] 04d-20w-marathon.pdf
  ✓ Schedule weeks: 20
    Sample: ['Easy Run', 'Easy Run', 'Easy Run']

[5/5] 04d-50-maintenance-4w.pdf
  ✓ Schedule weeks: 4
    Sample: ['ills Cr', 'oss-Trai', 'n 3-4 Mile']

✅ Saved 5 programs to Data\training_output_test.json


# Data Transformation For Training

In [None]:
def format_input(entry):
    """Render one training entry as an instruction-style prompt.

    Args:
        entry: Mapping with an ``instruction`` key and an ``input`` key.

    Returns:
        The fixed preamble followed by an ``### Instruction:`` section, plus
        an ``### Input:`` section when ``entry["input"]`` is truthy.
    """
    prompt = (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    # Entries with an empty input get no "### Input:" section at all.
    if not entry["input"]:
        return prompt

    return prompt + f"\n\n### Input:\n{entry['input']}"