In [11]:
import pandas as pd
import re
import numpy as np

In [12]:

def parse_baseball_data(file_path):
    """Parse a multi-section pitching text file into a single DataFrame.

    The file is read as a sequence of sections separated by blank lines.
    Within each section:

    * line 1 is a header whose first whitespace-delimited token is taken as
      the year and whose remainder is taken as the league label,
    * line 2 is a comma-separated list of column names,
    * every following line is a comma-separated data row.

    Parameters
    ----------
    file_path : str or path-like
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per data line, restricted to the columns in ``all_columns``
        (in that order). Cell values are the raw strings from the file;
        columns a section does not provide are NaN. File columns that are
        not listed in ``all_columns`` are dropped.

    Notes
    -----
    * Sections with fewer than two lines are skipped.
    * A header line with no league text yields an empty-string league
      instead of raising ``ValueError``.
    * If a row has more or fewer values than there are column names, the
      surplus on either side is ignored (``zip`` truncation).
    """
    # Define the standard columns
    all_columns = [
        "Year", "League", "Tm","#P","PAge","RA/G","W","L","W-L%","ERA","G","GS","GF","CG","tSho","cSho","SV","IP","H","R","ER","HR","BB","IBB","SO","HBP","BK","WP","BF","ERA+","FIP","WHIP","H9","HR9","BB9","SO9","SO/W","LOB"
    ]

    with open(file_path, 'r') as f:
        data = f.read()

    # Split on blank lines. A separator line may contain stray spaces or
    # tabs, which a plain "\n\n" split would miss and thereby merge sections.
    sections = re.split(r"\n\s*\n", data)

    parsed_data = []

    for section in sections:
        lines = section.strip().split("\n")
        if len(lines) < 2:
            continue  # Skip sections without a header line and a column line

        # First token is the year; everything after it is the league label.
        # A header with only a year must not abort the whole parse.
        header_parts = lines[0].split(maxsplit=1)
        year = header_parts[0]
        league = header_parts[1] if len(header_parts) > 1 else ""

        # Strip padding so names such as " W " still match all_columns
        column_headers = [header.strip() for header in lines[1].split(",")]

        # Everything after the column line is data
        for row in lines[2:]:
            row_dict = dict(zip(column_headers, row.split(",")))
            row_dict["Year"] = year
            row_dict["League"] = league
            parsed_data.append(row_dict)

    # columns= both orders the output and fills keys absent from a row with
    # NaN, so no per-row back-filling is needed.
    final_df = pd.DataFrame(parsed_data, columns=all_columns)
    return final_df


In [13]:

# Location of the raw pitching export, relative to this notebook
file_path = '../raw_data/MLB_Pitching.txt'

# Build the combined frame, then remove the 'Season: ' text from League
df = parse_baseball_data(file_path).assign(
    League=lambda frame: frame['League'].str.replace('Season: ', '')
)

# Plain-text preview of the first rows
print(df.head())

# Persist the combined table, leaving out the index column
df.to_csv("../combined_data/pitching_combined.csv", index=False)

   Year League                       Tm  #P  PAge  RA/G   W   L  W-L%   ERA  \
0  1930    NNL  Birmingham Black Barons  11  26.9  5.20  38  35  .521  4.95   
1  1930    NNL  Chicago American Giants  18  25.1  5.73  32  50  .390  4.84   
2  1930    NNL         Cuban Stars West   7  23.7  4.67  19  29  .396  3.89   
3  1930    NNL            Detroit Stars   6  22.6  4.74  38  34  .528  4.08   
4  1930    NNL     Kansas City Monarchs   7  25.2  4.82  38  29  .567  4.30   

   ... BF ERA+ FIP   WHIP    H9 HR9  BB9  SO9  SO/W LOB  
0  ...      96      1.327   9.5      2.5  4.5  1.83      
1  ...      99      1.490  10.0      3.4  4.2  1.24      
2  ...     123      1.371   9.4      2.9  4.0  1.37      
3  ...     116      1.232   9.1      2.0  4.0  1.96      
4  ...     111      1.381   9.6      2.8  5.6  1.97      

[5 rows x 38 columns]


In [14]:
# Rich display of the first five rows of the combined frame
df.head(n=5)

Unnamed: 0,Year,League,Tm,#P,PAge,RA/G,W,L,W-L%,ERA,...,BF,ERA+,FIP,WHIP,H9,HR9,BB9,SO9,SO/W,LOB
0,1930,NNL,Birmingham Black Barons,11,26.9,5.2,38,35,0.521,4.95,...,,96,,1.327,9.5,,2.5,4.5,1.83,
1,1930,NNL,Chicago American Giants,18,25.1,5.73,32,50,0.39,4.84,...,,99,,1.49,10.0,,3.4,4.2,1.24,
2,1930,NNL,Cuban Stars West,7,23.7,4.67,19,29,0.396,3.89,...,,123,,1.371,9.4,,2.9,4.0,1.37,
3,1930,NNL,Detroit Stars,6,22.6,4.74,38,34,0.528,4.08,...,,116,,1.232,9.1,,2.0,4.0,1.96,
4,1930,NNL,Kansas City Monarchs,7,25.2,4.82,38,29,0.567,4.3,...,,111,,1.381,9.6,,2.8,5.6,1.97,
