In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import brier_score_loss, mean_squared_error

In [2]:
# Summarise every CSV in the data folder: one row per file with its column names
import os
import pandas as pd

folder_path = './kaggle_data'  # folder that holds the CSV files
csv_files = [name for name in os.listdir(folder_path) if name.endswith('.csv')]

file_info_list = []

for csv_file in csv_files:
    file_path = os.path.join(folder_path, csv_file)

    # nrows=0 parses only the header line, so no data rows are loaded
    df_header = pd.read_csv(file_path, nrows=0, encoding='cp949')
    columns = list(df_header.columns)

    # Record the file name together with its column list
    file_info_list.append(dict(file_name=csv_file, columns=columns))

# file_info_list is now [{'file_name': ..., 'columns': [...]}, ...]
summary_df = pd.DataFrame(file_info_list)
summary_df


Unnamed: 0,file_name,columns
0,Cities.csv,"[CityID, City, State]"
1,Conferences.csv,"[ConfAbbrev, Description]"
2,MConferenceTourneyGames.csv,"[Season, ConfAbbrev, DayNum, WTeamID, LTeamID]"
3,MGameCities.csv,"[Season, DayNum, WTeamID, LTeamID, CRType, Cit..."
4,MMasseyOrdinals.csv,"[Season, RankingDayNum, SystemName, TeamID, Or..."
5,MNCAATourneyCompactResults.csv,"[Season, DayNum, WTeamID, WScore, LTeamID, LSc..."
6,MNCAATourneyDetailedResults.csv,"[Season, DayNum, WTeamID, WScore, LTeamID, LSc..."
7,MNCAATourneySeedRoundSlots.csv,"[Seed, GameRound, GameSlot, EarlyDayNum, LateD..."
8,MNCAATourneySeeds.csv,"[Season, Seed, TeamID]"
9,MNCAATourneySlots.csv,"[Season, Slot, StrongSeed, WeakSeed]"


In [3]:
# Export the per-file column summary: flatten each column list into one
# comma-separated string so it fits in a single CSV field
summary_df['columns_str'] = summary_df['columns'].map(', '.join)
summary_df[['file_name', 'columns_str']].to_csv(
    'file_column_summary_default.csv',
    index=False,
)


In [4]:
# Men's games: attach team names and tournament seeds to every game row

# 1. Stack regular-season and NCAA-tournament compact results into one frame
m_games_df = pd.concat(
    [
        pd.read_csv("./kaggle_data/MRegularSeasonCompactResults.csv"),
        pd.read_csv("./kaggle_data/MNCAATourneyCompactResults.csv"),
    ],
    ignore_index=True,
)

# 2. Men's team table (TeamID / TeamName)
m_teams_df = pd.read_csv("./kaggle_data/MTeams.csv")

# 3. TeamID -> TeamName, first for the winner (W) and then for the loser (L).
#    Each pass left-joins on the side's TeamID column, renames the joined
#    TeamName to <side>TeamName and drops the helper TeamID key.
for side in ("W", "L"):
    m_games_df = (
        m_games_df
        .merge(
            m_teams_df[['TeamID', 'TeamName']],
            left_on=f"{side}TeamID",
            right_on="TeamID",
            how="left",
        )
        .rename(columns={"TeamName": f"{side}TeamName"})
        .drop(columns=["TeamID"])
    )

# 4. Seeds, joined on (Season, TeamID); rows without a matching seed entry
#    keep NaN because the join is a left join
m_seeds_df = pd.read_csv("./kaggle_data/MNCAATourneySeeds.csv")
for side in ("W", "L"):
    m_games_df = (
        m_games_df
        .merge(
            m_seeds_df[['Season', 'TeamID', 'Seed']],
            left_on=['Season', f"{side}TeamID"],
            right_on=['Season', 'TeamID'],
            how="left",
        )
        .rename(columns={"Seed": f"{side}Seed"})
        .drop(columns=["TeamID"])
    )

# 5. Tag every row as men's data
m_games_df["Sex"] = "M"

# Optional save (left disabled)
# m_games_df.to_csv("merged_men_games.csv", index=False)



In [5]:
import pandas as pd

# Reload the men's team table (TeamID / TeamName)
m_teams_df = pd.read_csv("./kaggle_data/MTeams.csv")

file_path = "./kaggle_data/SampleSubmissionStage1.csv"

# Read only the first 5 rows: this is a quick look at the layout,
# not a load of the whole submission file
df = pd.read_csv(file_path, nrows=5)

# The shape printed here is the shape of the 5-row preview, so it is labelled
# as a preview size ("미리보기 크기") rather than as the size of the file.
print(f"미리보기 크기: {df.shape[0]} 행, {df.shape[1]} 열")
print(df.head())

파일 크기: 5 행, 2 열
               ID  Pred
0  2021_1101_1102   0.5
1  2021_1101_1103   0.5
2  2021_1101_1104   0.5
3  2021_1101_1105   0.5
4  2021_1101_1106   0.5


In [6]:
import os
import pandas as pd

folder_path = "./kaggle_data/"  # folder that holds the CSV files
csv_files = [name for name in os.listdir(folder_path) if name.endswith('.csv')]

# Bind every CSV to a module-level variable named after the file.
# NOTE: a later cell reads `mconferencetourneygames` directly, so these
# injected globals are part of the notebook's state.
for csv_file in csv_files:
    file_path = os.path.join(folder_path, csv_file)

    # Variable name: file stem, lower-cased, with spaces and hyphens
    # replaced by underscores
    var_name = (
        os.path.splitext(csv_file)[0]
        .lower()
        .translate(str.maketrans({" ": "_", "-": "_"}))
    )

    # Read the CSV and publish it under that name
    globals()[var_name] = pd.read_csv(file_path, encoding='cp949')

    print(f"Loaded {csv_file} as variable: {var_name}")

# 예시 출력


Loaded Cities.csv as variable: cities
Loaded Conferences.csv as variable: conferences
Loaded MConferenceTourneyGames.csv as variable: mconferencetourneygames
Loaded MGameCities.csv as variable: mgamecities
Loaded MMasseyOrdinals.csv as variable: mmasseyordinals
Loaded MNCAATourneyCompactResults.csv as variable: mncaatourneycompactresults
Loaded MNCAATourneyDetailedResults.csv as variable: mncaatourneydetailedresults
Loaded MNCAATourneySeedRoundSlots.csv as variable: mncaatourneyseedroundslots
Loaded MNCAATourneySeeds.csv as variable: mncaatourneyseeds
Loaded MNCAATourneySlots.csv as variable: mncaatourneyslots
Loaded MRegularSeasonCompactResults.csv as variable: mregularseasoncompactresults
Loaded MRegularSeasonDetailedResults.csv as variable: mregularseasondetailedresults
Loaded MSeasons.csv as variable: mseasons
Loaded MSecondaryTourneyCompactResults.csv as variable: msecondarytourneycompactresults
Loaded MSecondaryTourneyTeams.csv as variable: msecondarytourneyteams
Loaded MTeamCoac

In [7]:
import os
import re
import pandas as pd

folder_path = "./kaggle_data/"  # folder that holds the CSV files


def read_csv_with_fallback(path, encodings=("utf-8", "latin1", "cp949")):
    """Read a CSV file, trying each encoding in order until one decodes.

    Parameters
    ----------
    path : str
        Path of the CSV file.
    encodings : iterable of str
        Candidate encodings, tried in the given order. With the default
        order "cp949" is never reached, because "latin1" maps every byte to
        a character and therefore never raises UnicodeDecodeError.

    Returns
    -------
    pandas.DataFrame
        The frame parsed with the first encoding that decodes the file.

    Raises
    ------
    ValueError
        If none of the candidate encodings can decode the file. Raising here
        (instead of falling through) guarantees the caller never stores a
        stale frame left over from a previous file.
    """
    tried = []
    for encoding in encodings:
        try:
            return pd.read_csv(path, encoding=encoding)
        except UnicodeDecodeError:
            tried.append(encoding)  # try the next candidate
    raise ValueError(f"could not decode '{path}' with any of {tried}")


# Make sure the folder exists before listing it
if not os.path.exists(folder_path):
    print(f"❌ Error: Folder '{folder_path}' not found.")
else:
    # Collect the CSV file names
    csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

    # Loaded frames, keyed by a sanitised version of the file name
    data_dict = {}

    for csv_file in csv_files:
        file_path = os.path.join(folder_path, csv_file)

        # Safe key: every character outside [a-zA-Z0-9_] becomes "_", then lower-case
        var_name = re.sub(r'[^a-zA-Z0-9_]', '_', os.path.splitext(csv_file)[0]).lower()

        try:
            df = read_csv_with_fallback(file_path)
        except Exception as e:
            # Best-effort load: report the failure and continue with the next file
            print(f"❌ Error loading {csv_file}: {e}")
        else:
            data_dict[var_name] = df
            print(f"✅ Loaded {csv_file} as variable: data_dict['{var_name}']")

    # List what was loaded
    print("\n📌 Available datasets:")
    for key, value in data_dict.items():
        print(f"  - {key}: {value.shape} columns: {list(value.columns)}")

    # Example access:
    # df_regular_season = data_dict["mregularseasondetailedresults"]


✅ Loaded Cities.csv as variable: data_dict['cities']
✅ Loaded Conferences.csv as variable: data_dict['conferences']
✅ Loaded MConferenceTourneyGames.csv as variable: data_dict['mconferencetourneygames']
✅ Loaded MGameCities.csv as variable: data_dict['mgamecities']
✅ Loaded MMasseyOrdinals.csv as variable: data_dict['mmasseyordinals']
✅ Loaded MNCAATourneyCompactResults.csv as variable: data_dict['mncaatourneycompactresults']
✅ Loaded MNCAATourneyDetailedResults.csv as variable: data_dict['mncaatourneydetailedresults']
✅ Loaded MNCAATourneySeedRoundSlots.csv as variable: data_dict['mncaatourneyseedroundslots']
✅ Loaded MNCAATourneySeeds.csv as variable: data_dict['mncaatourneyseeds']
✅ Loaded MNCAATourneySlots.csv as variable: data_dict['mncaatourneyslots']
✅ Loaded MRegularSeasonCompactResults.csv as variable: data_dict['mregularseasoncompactresults']
✅ Loaded MRegularSeasonDetailedResults.csv as variable: data_dict['mregularseasondetailedresults']
✅ Loaded MSeasons.csv as variable: d

In [8]:
# Preview the conference-tournament table; `mconferencetourneygames` is one of
# the module-level variables created by the globals()-based loader cell
print(mconferencetourneygames.head(n=5))


   Season ConfAbbrev  DayNum  WTeamID  LTeamID
0    2001      a_sun     121     1194     1144
1    2001      a_sun     121     1416     1240
2    2001      a_sun     122     1209     1194
3    2001      a_sun     122     1359     1239
4    2001      a_sun     122     1391     1273


In [9]:
# Pull the three game tables out of data_dict
regular_season_df = data_dict.get("mregularseasondetailedresults", None)
conference_tourney_df = data_dict.get("mconferencetourneygames", None)
ncaa_tourney_df = data_dict.get("mncaatourneydetailedresults", None)

# Fail early if any of them was not loaded
if regular_season_df is None or conference_tourney_df is None or ncaa_tourney_df is None:
    raise ValueError("❌ 필요한 데이터가 로드되지 않았습니다. 데이터 딕셔너리를 확인하세요.")

# Shared column layout (the conference-tournament table lacks most of these)
common_columns = [
    "Season", "DayNum", "WTeamID", "WScore", "LTeamID", "LScore", "WLoc", "NumOT",
    "WFGM", "WFGA", "WFGM3", "WFGA3", "WFTM", "WFTA", "WOR", "WDR", "WAst", "WTO",
    "WStl", "WBlk", "WPF", "LFGM", "LFGA", "LFGM3", "LFGA3", "LFTM", "LFTA", "LOR",
    "LDR", "LAst", "LTO", "LStl", "LBlk", "LPF"
]

# Placeholder values for the columns the conference table does not have:
# 0 for every missing column, scores and overtime forced to 0, and the
# location set to "N" (neutral court assumed).
# NOTE(review): these zeros are placeholders, not observed values; any
# aggregate over merged_data will treat them as real scores/stats. Consider
# NaN instead, and confirm these games are not already present in the
# regular-season table - TODO confirm against the data description.
placeholders = {
    col: 0 for col in common_columns if col not in conference_tourney_df.columns
}
placeholders.update(WScore=0, LScore=0, NumOT=0, WLoc="N")

# assign() returns new frames, so the tables stored in data_dict are not
# modified by this cell
conference_tourney_expanded = (
    conference_tourney_df
    .assign(**placeholders)[common_columns]   # add placeholders, align column order
    .assign(Tournament_Type="Conference")
)
regular_season_df = regular_season_df.assign(Tournament_Type="Regular")
ncaa_tourney_df = ncaa_tourney_df.assign(Tournament_Type="NCAA")

# Row-wise concatenation of the three sources
merged_data = pd.concat([
    regular_season_df,
    conference_tourney_expanded,
    ncaa_tourney_df
], ignore_index=True)

# Keep only the latest three seasons (2023-2025)
latest_season = 2025
selected_seasons = [latest_season - offset for offset in range(3)]

# .copy() makes filtered_data an independent frame, so later column
# assignments do not act on a slice of merged_data
filtered_data = merged_data[merged_data["Season"].isin(selected_seasons)].copy()

# Quick look at the result
print(filtered_data.head())

        Season  DayNum  WTeamID  WScore  LTeamID  LScore WLoc  NumOT  WFGM  \
102032    2023       7     1101      65     1238      56    H      0    23   
102033    2023       7     1103      81     1355      80    H      1    30   
102034    2023       7     1104      75     1255      54    H      0    27   
102035    2023       7     1112     117     1311      75    H      0    38   
102036    2023       7     1113      62     1470      59    H      0    21   

        WFGA  ...  LFTM  LFTA  LOR  LDR  LAst  LTO  LStl  LBlk  LPF  \
102032    57  ...    10    14    7   33     9   21     6     1   21   
102033    69  ...    14    17    5   28    11   12     3     4   15   
102034    69  ...    11    19   12   23     8    9     8     3   22   
102035    53  ...    10    22    8   14    11   16    17     1   29   
102036    62  ...    19    32    5   28    10   17    12     4   27   

        Tournament_Type  
102032          Regular  
102033          Regular  
102034          Regular  


In [10]:
# Fetch the MTeamSpellings table (TeamID paired with name spellings)
team_spellings_df = data_dict.get("mteamspellings")

# Stop if the table was not loaded
if team_spellings_df is None:
    raise ValueError("❌ MTeamSpellings 데이터가 로드되지 않았습니다.")

# TeamID -> first non-null TeamNameSpelling within each TeamID group
team_id_map = team_spellings_df.groupby("TeamID")["TeamNameSpelling"].first().to_dict()

# Columns that should appear first, with each team name placed right after its ID
leading_columns = [
    "Season", "DayNum",
    "WTeamID", "WTeamName", "WScore",
    "LTeamID", "LTeamName", "LScore",
    "WLoc", "NumOT",
]
# Everything else keeps its current relative order
trailing_columns = [col for col in filtered_data.columns if col not in leading_columns]

# Build a new frame with the name columns added, then reorder the columns
with_names = filtered_data.assign(
    WTeamName=filtered_data["WTeamID"].map(team_id_map),
    LTeamName=filtered_data["LTeamID"].map(team_id_map),
)
filtered_data = with_names[leading_columns + trailing_columns]

# Quick look at the result
print(filtered_data.head())


        Season  DayNum  WTeamID    WTeamName  WScore  LTeamID    LTeamName  \
102032    2023       7     1101  abilene chr      65     1238   jackson st   
102033    2023       7     1103        akron      81     1355  s dakota st   
102034    2023       7     1104      alabama      75     1255     longwood   
102035    2023       7     1112      arizona     117     1311     nicholls   
102036    2023       7     1113   arizona st      62     1470  tarleton st   

        LScore WLoc  NumOT  ...  LFTM  LFTA  LOR  LDR  LAst  LTO  LStl  LBlk  \
102032      56    H      0  ...    10    14    7   33     9   21     6     1   
102033      80    H      1  ...    14    17    5   28    11   12     3     4   
102034      54    H      0  ...    11    19   12   23     8    9     8     3   
102035      75    H      0  ...    10    22    8   14    11   16    17     1   
102036      59    H      0  ...    19    32    5   28    10   17    12     4   

        LPF  Tournament_Type  
102032   21        

In [12]:
import pandas as pd
import re

# Load the tournament seeds (adjust the folder path for your machine)
folder_path = "./kaggle_data/"  # folder that holds the CSV files
seeds_df = pd.read_csv(f"{folder_path}MNCAATourneySeeds.csv")

# Normalise key dtypes on both sides of the join. astype() with a mapping
# returns a new frame, which avoids assigning into a derived frame.
seeds_df = seeds_df.astype({"Season": int, "TeamID": int})
filtered_data = filtered_data.astype({"Season": int, "WTeamID": int, "LTeamID": int})

# Split the seed string: first character is the region code,
# the digits are the seed number (e.g. "W01" -> region "W", number 1)
seeds_df["SeedRegion"] = seeds_df["Seed"].str[0]
seeds_df["SeedNum"] = (
    seeds_df["Seed"].str.replace(r"\D", "", regex=True).astype("int64")
)

# Show which seasons each side covers, to sanity-check the join keys
print("✅ filtered_data Seasons:", filtered_data["Season"].unique())
print("✅ seeds_df Seasons:", seeds_df["Season"].unique())

# Make the cell safe to re-run: if the seed columns already exist from a
# previous execution, drop them first so the merge below does not create
# duplicate column names.
seed_columns = ["WTeamSeed", "WTeamSeedRegion", "LTeamSeed", "LTeamSeedRegion"]
filtered_data = filtered_data.drop(columns=seed_columns, errors="ignore")

# Attach seed number and region for the winner (W) and then the loser (L).
# Left join: rows with no (Season, TeamID) match in seeds_df keep NaN.
for side in ("W", "L"):
    filtered_data = (
        filtered_data
        .merge(
            seeds_df[["Season", "TeamID", "SeedNum", "SeedRegion"]],
            left_on=["Season", f"{side}TeamID"],
            right_on=["Season", "TeamID"],
            how="left",
        )
        .rename(columns={
            "SeedNum": f"{side}TeamSeed",
            "SeedRegion": f"{side}TeamSeedRegion",
        })
        .drop(columns=["TeamID"])
    )

# Quick look at the result
print(filtered_data.head())



✅ filtered_data Seasons: [2023 2024 2025]
✅ seeds_df Seasons: [1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998
 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012
 2013 2014 2015 2016 2017 2018 2019 2021 2022 2023 2024]
   Season  DayNum  WTeamID    WTeamName  WScore  LTeamID    LTeamName  LScore  \
0    2023       7     1101  abilene chr      65     1238   jackson st      56   
1    2023       7     1103        akron      81     1355  s dakota st      80   
2    2023       7     1104      alabama      75     1255     longwood      54   
3    2023       7     1112      arizona     117     1311     nicholls      75   
4    2023       7     1113   arizona st      62     1470  tarleton st      59   

  WLoc  NumOT  ...  LPF  Tournament_Type  WTeamSeed  WTeamSeedRegion  \
0    H      0  ...   21          Regular        NaN              NaN   
1    H      1  ...   15          Regular        NaN              NaN   
2    H      0  ...   22          Reg

In [14]:

folder_path = "./kenpom/"  # folder that holds the KenPom CSV files

# Check the folder BEFORE listing it: calling os.listdir() on a missing folder
# raises FileNotFoundError, which would make the message below unreachable.
if not os.path.exists(folder_path):
    print(f"❌ Error: Folder '{folder_path}' not found.")
else:
    # Collect the CSV file names
    csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

    # Loaded frames, keyed by a sanitised version of the file name.
    # NOTE: this rebinds `data_dict`, replacing the dictionary of Kaggle
    # tables built by the earlier loader cell.
    data_dict = {}

    for csv_file in csv_files:
        file_path = os.path.join(folder_path, csv_file)

        # Safe key: every character outside [a-zA-Z0-9_] becomes "_", then lower-case
        var_name = re.sub(r'[^a-zA-Z0-9_]', '_', os.path.splitext(csv_file)[0]).lower()

        try:
            # Try the candidate encodings in order. "latin1" accepts any byte
            # sequence, so "cp949" is only reached if the order is changed.
            for encoding in ["utf-8", "latin1", "cp949"]:
                try:
                    df = pd.read_csv(file_path, encoding=encoding)
                    break  # decoded successfully
                except UnicodeDecodeError:
                    continue  # try the next encoding
            else:
                # No encoding worked: fail this file instead of storing a
                # frame left over from a previous iteration
                raise ValueError("no candidate encoding could decode the file")

            # Store the frame under its key
            data_dict[var_name] = df
            print(f"✅ Loaded {csv_file} as variable: data_dict['{var_name}']")

        except Exception as e:
            print(f"❌ Error loading {csv_file}: {e}")

    # List what was loaded
    print("\n📌 Available datasets:")
    for key, value in data_dict.items():
        print(f"  - {key}: {value.shape} columns: {list(value.columns)}")


✅ Loaded 2020.csv as variable: data_dict['2020']
✅ Loaded 2021.csv as variable: data_dict['2021']
✅ Loaded 2022.csv as variable: data_dict['2022']
✅ Loaded 2023.csv as variable: data_dict['2023']
✅ Loaded 2024.csv as variable: data_dict['2024']
✅ Loaded 2025.csv as variable: data_dict['2025']

📌 Available datasets:
  - 2020: (353, 21) columns: ['Rk', 'Team', 'Conf', 'W - L', 'NetRtg', 'ORtg', 'Unnamed: 6', 'DRtg', 'Unnamed: 8', 'AdjT', 'Unnamed: 10', 'Luck', 'Unnamed: 12', 'Strength of Schedule NetRtg', 'Unnamed: 14', 'ORtg.1', 'Unnamed: 16', 'DRtg.1', 'Unnamed: 18', 'NCSOS NetRtg', 'Unnamed: 20']
  - 2021: (357, 21) columns: ['Rk', 'Team', 'Conf', 'W - L', 'NetRtg', 'ORtg', 'Unnamed: 6', 'DRtg', 'Unnamed: 8', 'AdjT', 'Unnamed: 10', 'Luck', 'Unnamed: 12', 'Strength of Schedule NetRtg', 'Unnamed: 14', 'ORtg.1', 'Unnamed: 16', 'DRtg.1', 'Unnamed: 18', 'NCSOS NetRtg', 'Unnamed: 20']
  - 2022: (358, 21) columns: ['Rk', 'Team', 'Conf', 'W - L', 'NetRtg', 'ORtg', 'Unnamed: 6', 'DRtg', 'Unnam

In [16]:
# Show which keys data_dict currently holds
data_dict.keys()

dict_keys(['2020', '2021', '2022', '2023', '2024', '2025'])

In [17]:
# Preview the "2025" table if it was loaded; otherwise report the missing key
if "2025" in data_dict:
    df = data_dict["2025"]
    print(df.head())  # quick look at the data
else:
    print("❌ '2025' 키가 존재하지 않습니다.")


   Rk       Team Conf W - L  NetRtg   ORtg  Unnamed: 6  DRtg  Unnamed: 8  \
0   1     Auburn  SEC  23-2   36.67  130.9           1  94.2          13   
1   2       Duke  ACC  23-3   36.39  127.4           2  91.0           4   
2   3    Houston  B12  22-4   35.20  125.3           7  90.1           3   
3   4    Florida  SEC  23-3   33.57  126.5           5  92.9           7   
4   5  Tennessee  SEC  21-5   31.01  118.3          30  87.3           1   

   AdjT  ...   Luck  Unnamed: 12  Strength of Schedule NetRtg  Unnamed: 14  \
0  68.4  ...  0.064           56                        17.00            1   
1  65.6  ... -0.040          286                         8.26           56   
2  60.9  ... -0.029          258                        10.84           32   
3  69.3  ...  0.011          155                        10.03           39   
4  63.8  ...  0.006          164                        12.34           18   

   ORtg.1  Unnamed: 16  DRtg.1  Unnamed: 18  NCSOS NetRtg  Unnamed: 20  
0