## Data Cleaning

In [11]:
import pandas as pd
import os

# Configuration
base_path = 'lista_certificados'

# 1. Identify a sample file
files = [f for f in os.listdir(base_path) if f.endswith('.xlsx')]
if files:
    sample_file = files[0]
    sample_file_path = os.path.join(base_path, sample_file)

    print(f"Sample file selected: {sample_file}")

    # 2. Read the file sheets
    xls = pd.ExcelFile(sample_file_path)
    sheet_names = xls.sheet_names
    print(f"Sheets found: {sheet_names}")

    # 3. Read each sheet individually
    # Assuming there are two sheets as per description
    if len(sheet_names) >= 2:
        df_hoja1 = pd.read_excel(sample_file_path, sheet_name=0)
        df_hoja2 = pd.read_excel(sample_file_path, sheet_name=1)

        print("\n--- Sheet 1 Preview ---")
        display(df_hoja1.head())
        print(f"Dimensions: {df_hoja1.shape}")

        print("\n--- Sheet 2 Preview ---")
        display(df_hoja2.head())
        print(f"Dimensions: {df_hoja2.shape}")
    else:
        print("The file does not have at least 2 sheets.")
else:
    print("No .xlsx files found in the folder.")

Sample file selected: 29. INFORME ITEC GENERAL junio 2 - COLEGIO DE PROFESORES.xlsx
Sheets found: ['Sheet1']
The file does not have at least 2 sheets.


In [12]:
import os
import openpyxl

# Configuration
base_path = 'lista_certificados'

print("Starting the process of deleting 'Hoja1' from all Excel files...")

# Iterate through all files in the directory
files = [f for f in os.listdir(base_path) if f.endswith('.xlsx')]

processed_count = 0
errors = []

for filename in files:
    file_path = os.path.join(base_path, filename)
    
    try:
        # Load the workbook
        wb = openpyxl.load_workbook(file_path)
        
        # Check if 'Hoja1' exists
        if 'Hoja1' in wb.sheetnames:
            # Get the sheet and remove it
            std = wb['Hoja1']
            wb.remove(std)
            
            # Save the file
            wb.save(file_path)
            print(f"Processed: {filename} (Removed 'Hoja1')")
            processed_count += 1
        else:
            print(f"Skipped: {filename} ('Hoja1' not found)")
            
    except Exception as e:
        print(f"Error processing {filename}: {e}")
        errors.append(filename)

print(f"\nProcess completed.")
print(f"Total files processed: {processed_count}")
print(f"Total errors: {len(errors)}")

Starting the process of deleting 'Hoja1' from all Excel files...
Skipped: 29. INFORME ITEC GENERAL junio 2 - COLEGIO DE PROFESORES.xlsx ('Hoja1' not found)
Skipped: 5. INFORME ITEC GENERAL  FEBRERO 2.xlsx ('Hoja1' not found)
Skipped: 57. INFORME ITEC GENERAL setiembre 4 - CORLAD.xlsx ('Hoja1' not found)
Skipped: 47. INFORME CORLAD GENERAL agosto 3 - CORLAD.xlsx ('Hoja1' not found)
Skipped: 2. INFORME GENERAL 10 AL 21 ENERO 22.xlsx ('Hoja1' not found)
Skipped: 18. INFORME ITEC GENERAL mayo 3 - COLEGIO DE PROFESORES.xlsx ('Hoja1' not found)
Skipped: 45. INFORME CORLAD GENERAL agosto 2 - CORLAD (1).xlsx ('Hoja1' not found)
Skipped: 6. INFORME ITEC GENERAL FEBRERO 3.xlsx ('Hoja1' not found)
Skipped: 41. INFORME CORLAD GENERAL julio 4 - CORLAD.xlsx ('Hoja1' not found)
Skipped: 34. INFORME ITEC GENERAL julio 1 - CORLAD.xlsx ('Hoja1' not found)
Skipped: 48. INFORME DOCENTES GENERAL agosto 3 - COLEGIO DE PROFESORES.xlsx ('Hoja1' not found)
Skipped: 40. INFORME ITEC GENERAL julio 4 - COLEGIO DE

In [13]:
# Validation: Check that all files have exactly 1 sheet
files = [f for f in os.listdir(base_path) if f.endswith('.xlsx')]
issues = []

print("Validating sheet counts...")
for f in files:
    wb = openpyxl.load_workbook(os.path.join(base_path, f), read_only=True)
    if len(wb.sheetnames) != 1:
        issues.append(f"{f}: {len(wb.sheetnames)} sheets ({wb.sheetnames})")
    wb.close()

if issues:
    print(f"Found {len(issues)} files with unexpected sheet counts:")
    for i in issues:
        print(i)
else:
    print(f"SUCCESS: All {len(files)} files have exactly 1 sheet.")

Validating sheet counts...
SUCCESS: All 61 files have exactly 1 sheet.


In [None]:
print("Generating a preview of the first 3 rows of all files...")

files = [f for f in os.listdir(base_path) if f.endswith('.xlsx')]
preview_frames = []

for filename in files:
    try:
        df = pd.read_excel(os.path.join(base_path, filename))
        # Add a column to identify the source file
        df.insert(0, 'Source File', filename)
        # Take the first 3 rows
        preview_frames.append(df.head(3))
    except Exception as e:
        print(f"Error reading {filename}: {e}")

if preview_frames:
    all_previews = pd.concat(preview_frames, ignore_index=True)
    display(all_previews)
else:
    print("No files to preview.")

In [14]:
s

NameError: name 's' is not defined

In [None]:
import openpyxl

print("Removing the first row and first column from all files...")

processed_count = 0
files = [f for f in os.listdir(base_path) if f.endswith('.xlsx')]

for filename in files:
    file_path = os.path.join(base_path, filename)
    try:
        wb = openpyxl.load_workbook(file_path)
        ws = wb.active
        
        # 1. Delete the first row (Header with title)
        ws.delete_rows(1)
        
        # 2. Delete the first column (Index/Numbering)
        ws.delete_cols(1)
        
        wb.save(file_path)
        processed_count += 1
        
    except Exception as e:
        print(f"Error processing {filename}: {e}")

print(f"Completed. Processed {processed_count} files.")

# Verify with a sample
if files:
    sample = files[0]
    print(f"\nVerifying sample: {sample}")
    df_check = pd.read_excel(os.path.join(base_path, sample))
    display(df_check.head())

Removing the first row and first column from all files...
Completed. Processed 61 files.

Verifying sample: 29. INFORME ITEC GENERAL junio 2 - COLEGIO DE PROFESORES.xlsx


Unnamed: 0,19,DIECINUEVE,23 - mayo - 2022 al 03 - junio - 2022,120 horas,ICA
0,15,QUINCE,07 - junio - 2022 al 18 - junio - 2022,120 horas,LIMA
1,19,DIECINUEVE,07 - junio - 2022 al 18 - junio - 2022,120 horas,LIMA
2,19,DIECINUEVE,07 - junio - 2022 al 18 - junio - 2022,120 horas,LIMA
3,19,DIECINUEVE,07 - junio - 2022 al 18 - junio - 2022,120 horas,LIMA
4,17,DIECISIETE,07 - junio - 2022 al 18 - junio - 2022,120 horas,LIMA
