In [47]:
import pandas as pd
import re
import numpy as np

In [48]:

def parse_baseball_data(file_path):
    """Parse a multi-section standings text file into a single DataFrame.

    The file is read as UTF-8 text and split into sections on blank lines
    ("\\n\\n"). Within each section:

    * line 1 is split on its first run of whitespace into ``year`` and
      ``league`` (so the league name may itself contain spaces);
    * line 2 is a comma-separated list of column names for that section;
    * every remaining line is a comma-separated data row.

    Sections with fewer than two lines are skipped. Values are kept as the
    strings found in the file; no numeric conversion is done here.

    Parameters
    ----------
    file_path : str or path-like
        Location of the text file to parse.

    Returns
    -------
    pandas.DataFrame
        One row per data line, with exactly the columns in ``all_columns``
        (in that order). Columns a section does not provide are NaN; columns
        present in the file but absent from ``all_columns`` are dropped.

    Raises
    ------
    ValueError
        If a section's first line cannot be split into a year and a league.
    """
    # Fixed output schema; sections may supply any subset of these columns.
    all_columns = [
        "Year", "League", "Rk", "Tm", "W", "L", "W-L%", "GB", "GBsum", "R", "RA",
        "Rdiff", "SOS", "SRS", "pythWL", "Luck", "vEast", "vCent", "vWest", "Inter",
        "Home", "Road", "ExInn", "1Run", "vRHP", "vLHP", "≥.500", "<.500"
    ]

    # Decode explicitly as UTF-8: the schema contains the non-ASCII name
    # "≥.500", which a platform-default codec could mis-decode so that the
    # header no longer matches and the column silently turns into NaN.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = f.read()

    parsed_data = []

    for section in data.split("\n\n"):
        lines = section.strip().split("\n")
        if len(lines) < 2:
            continue  # Skip sections without enough data

        # First line: "<year> <league>".
        header_parts = lines[0].split(maxsplit=1)
        if len(header_parts) != 2:
            raise ValueError(
                f"Expected '<year> <league>' as section header, got: {lines[0]!r}"
            )
        year, league = header_parts

        # Second line: column names. Strip padding so "Rk, Tm" still maps
        # onto the schema names.
        column_headers = [name.strip() for name in lines[1].split(",")]

        for row in lines[2:]:
            if not row.strip():
                continue  # whitespace-only line carries no data
            # zip() stops at the shorter sequence, so short rows simply
            # leave their trailing columns unset (NaN in the result).
            row_dict = dict(zip(column_headers, row.split(",")))
            row_dict["Year"] = year
            row_dict["League"] = league
            parsed_data.append(row_dict)

    # Passing columns= both orders the output and fills any keys a row
    # lacks with NaN, so no per-row back-filling is needed.
    return pd.DataFrame(parsed_data, columns=all_columns)


In [49]:

# Path to the raw data file (input only)
file_path = '../raw_data/MLB_Detailed_Standings.txt'
# Parsed output goes to its own file: writing back to file_path would replace
# the raw standings with the parsed table and break any re-run of this cell.
output_path = '../raw_data/MLB_Detailed_Standings_parsed.tsv'

# Parse the data
df = parse_baseball_data(file_path)
# Display the resulting DataFrame
print(df.head())
# Save as tab-separated text (optional)
df.to_csv(output_path, sep='\t', index=False)

   Year League Rk                       Tm   W   L  W-L%    GB GBsum    R  \
0  1930    NNL  1          St. Louis Stars  69  25  .734    --        7.8   
1  1930    NNL  2     Kansas City Monarchs  40  23  .635  13.5  13.5  5.8   
2  1930    NNL  3            Detroit Stars  52  37  .584  14.5  15.5  5.2   
3  1930    NNL  4  Chicago American Giants  53  49  .520  20.0  32.0  4.3   
4  1930    NNL  5  Birmingham Black Barons  46  48  .489  23.0  44.0  4.8   

   ... vWest Inter Home Road ExInn 1Run vRHP vLHP ≥.500 <.500  
0  ...   NaN   NaN  NaN  NaN   NaN  NaN  NaN  NaN   NaN   NaN  
1  ...   NaN   NaN  NaN  NaN   NaN  NaN  NaN  NaN   NaN   NaN  
2  ...   NaN   NaN  NaN  NaN   NaN  NaN  NaN  NaN   NaN   NaN  
3  ...   NaN   NaN  NaN  NaN   NaN  NaN  NaN  NaN   NaN   NaN  
4  ...   NaN   NaN  NaN  NaN   NaN  NaN  NaN  NaN   NaN   NaN  

[5 rows x 28 columns]


In [51]:
# Preview the first rows of df.
# NOTE(review): the captured output for this cell has an "Unnamed: 0" column
# and numeric W-L% values (0.734 rather than ".734"), which the parsing cell
# does not produce — presumably df was re-read from a CSV in a cell that is
# not shown here; confirm and restore that step so the notebook runs top to bottom.
df.head()



Unnamed: 0,Year,League,Rk,Tm,W,L,W-L%,GB,GBsum,R,...,vWest,Inter,Home,Road,ExInn,1Run,vRHP,vLHP,≥.500,<.500
0,1930,NNL,1,St. Louis Stars,69,25,0.734,--,,7.8,...,,,,,,,,,,
1,1930,NNL,2,Kansas City Monarchs,40,23,0.635,13.5,13.5,5.8,...,,,,,,,,,,
2,1930,NNL,3,Detroit Stars,52,37,0.584,14.5,15.5,5.2,...,,,,,,,,,,
3,1930,NNL,4,Chicago American Giants,53,49,0.52,20.0,32.0,4.3,...,,,,,,,,,,
4,1930,NNL,5,Birmingham Black Barons,46,48,0.489,23.0,44.0,4.8,...,,,,,,,,,,
