In [1]:
import pandas as pd
import re

In [2]:
import pandas as pd
import numpy as np

def parse_baseball_data(file_path):
    # Define the standard columns
    all_columns = [
        "Year", "League", "Tm", "#Bat", "BatAge", "R/G", "G", "PA", "AB", "R", "H", "2B", "3B", "HR", "RBI", 
        "SB", "CS", "BB", "SO", "BA", "OBP", "SLG", "OPS", "OPS+", "TB", "GDP", "HBP", "SH", "SF", "IBB", "LOB"
    ]
    
    # Read the file and split by double line breaks
    with open(file_path, 'r') as f:
        data = f.read()
    sections = data.strip().split("\n\n")  # Split by double line breaks
    
    parsed_data = []
    
    for section_index, section in enumerate(sections):
        lines = section.strip().split("\n")
        if len(lines) < 3:
            print(f"Skipping section {section_index + 1} (too few lines)")
            continue  # Skip sections without enough lines
        
        # Extract year and league from the first line
        header_line = lines[0].strip()
        try:
            year, league = header_line.split(maxsplit=1)
        except ValueError:
            raise ValueError(
                f"Malformed header line in section {section_index + 1}: '{header_line}'.\n"
                f"Content of section {section_index + 1}:\n{section}"
            )
        
        # Extract column headers from the second line
        column_headers = lines[1].strip().split(",")
        
        # Parse data rows
        data_rows = lines[2:]
        for row in data_rows:
            row_values = row.strip().split(",")
            row_dict = dict(zip(column_headers, row_values))
            row_dict["Year"] = year
            row_dict["League"] = league
            
            # Fill missing columns
            for col in all_columns:
                if col not in row_dict:
                    row_dict[col] = np.nan
            
            parsed_data.append(row_dict)
    
    # Create the DataFrame
    final_df = pd.DataFrame(parsed_data, columns=all_columns)
    return final_df


In [3]:
# Path to the data file
file_path = '../raw_data/MLB_Batting.txt'

# Parse the data
df = parse_baseball_data(file_path)

# Display the resulting DataFrame
print(df.head())

# Save to a CSV file (optional)
df.to_csv("../combined_data/batting_combined.csv", index=False)

   Year League                       Tm #Bat BatAge   R/G   G    PA    AB  \
0  1930    NNL  Birmingham Black Barons   30   23.9  4.81  74  2563  2400   
1  1930    NNL  Chicago American Giants   34   28.8  4.35  81  2862  2618   
2  1930    NNL         Cuban Stars West   14   23.9  4.38  48  1731  1575   
3  1930    NNL            Detroit Stars   19   27.4  5.18  72  2557  2364   
4  1930    NNL     Kansas City Monarchs   18   28.2  5.78  67  2524  2285   

     R  ...   SLG   OPS OPS+   TB GDP HBP SH SF IBB LOB  
0  356  ...  .400  .730   97  961       0                
1  352  ...  .343  .671   82  898       3                
2  210  ...  .331  .636   72  522       4                
3  373  ...  .394  .720   94  932       0                
4  387  ...  .398  .748  102  909       4                

[5 rows x 31 columns]
