# functions and imports

In [43]:
from ollama import chat

def stream_function(question):
    stream = chat(
        model='llama3.2:1b',
        messages=[{'role': 'user', 'content': question}],
        stream=True,
    )

    for chunk in stream:
        print(chunk['message']['content'], end='', flush=True)

In [7]:
from kaggle.api.kaggle_api_extended import KaggleApi

# Initialize the Kaggle API
api = KaggleApi()
api.authenticate()

# Define the dataset path (this is the URL path of the dataset on Kaggle)
dataset_name = "jonathanpilafas/2024-march-madness-statistical-analysis"


# Download the dataset to your current working directory
api.dataset_download_files(dataset_name, path='.', unzip=True)

print("Dataset downloaded successfully!")


Dataset URL: https://www.kaggle.com/datasets/jonathanpilafas/2024-march-madness-statistical-analysis
Dataset downloaded successfully!


In [23]:
import pandas as pd
import zipfile
import re

# Create a dictionary to store your dataframes
dataframes = {}

# Your zip filename

match = re.search(r'/(?<=/)(.*)', dataset_name)
if match:
    result = match.group(1)
zip_filename = result + '.zip'

with zipfile.ZipFile(zip_filename, 'r') as zip_file:
    # Show what files are in the ZIP
    print("Files in the ZIP:", zip_file.namelist())
    
    # Read each CSV file from the ZIP into a DataFrame
    for filename in zip_file.namelist():
        if filename.endswith('.csv'):
            # Create a clean key by replacing spaces and special characters
            key = filename.replace('.csv', '').replace(' _ ', '_').replace(' ', '_')
            
            # Read CSV with low_memory=False to handle mixed types warning
            dataframes[key] = pd.read_csv(zip_file.open(filename), low_memory=False)
            print(f"Loaded: {filename}")




Files in the ZIP: ['DEV _ March Madness.csv', 'INT _ KenPom _ Defense.csv', 'INT _ KenPom _ Efficiency.csv', 'INT _ KenPom _ Height.csv', 'INT _ KenPom _ Miscellaneous Team Stats.csv', 'INT _ KenPom _ Offense.csv', 'INT _ KenPom _ Point Distribution.csv', 'INT _ KenPom _ Summary.csv', 'REF _ 2024 Post-Season Tournament Teams.csv', 'REF _ Current NCAAM Coaches.csv']
Loaded: DEV _ March Madness.csv
Loaded: INT _ KenPom _ Defense.csv
Loaded: INT _ KenPom _ Efficiency.csv
Loaded: INT _ KenPom _ Height.csv
Loaded: INT _ KenPom _ Miscellaneous Team Stats.csv
Loaded: INT _ KenPom _ Offense.csv
Loaded: INT _ KenPom _ Point Distribution.csv
Loaded: INT _ KenPom _ Summary.csv
Loaded: REF _ 2024 Post-Season Tournament Teams.csv
Loaded: REF _ Current NCAAM Coaches.csv


In [18]:
# Create individual variables for easier access
march_madness = dataframes['DEV_March_Madness']
kenpom_defense = dataframes['INT_KenPom_Defense']
kenpom_efficiency = dataframes['INT_KenPom_Efficiency']
kenpom_height = dataframes['INT_KenPom_Height']
kenpom_misc_stats = dataframes['INT_KenPom_Miscellaneous_Team_Stats']
kenpom_offense = dataframes['INT_KenPom_Offense']
kenpom_point_dist = dataframes['INT_KenPom_Point_Distribution']
kenpom_summary = dataframes['INT_KenPom_Summary']
tournament_teams = dataframes['REF_2024_Post-Season_Tournament_Teams']
current_coaches = dataframes['REF_Current_NCAAM_Coaches']

# Show what dataframes were created with more detail
print("\nAvailable dataframes:")
for key, df in dataframes.items():
    print(f"- {key}:")
    print(f"  Rows: {df.shape[0]}, Columns: {df.shape[1]}")
    print(f"  Memory usage: {df.memory_usage().sum() / 1024**2:.2f} MB")


Available dataframes:
- DEV_March_Madness:
  Rows: 8314, Columns: 147
  Memory usage: 9.32 MB
- INT_KenPom_Defense:
  Rows: 9268, Columns: 10
  Memory usage: 0.71 MB
- INT_KenPom_Efficiency:
  Rows: 9268, Columns: 19
  Memory usage: 1.34 MB
- INT_KenPom_Height:
  Rows: 6670, Columns: 50
  Memory usage: 2.54 MB
- INT_KenPom_Miscellaneous_Team_Stats:
  Rows: 8314, Columns: 35
  Memory usage: 2.22 MB
- INT_KenPom_Offense:
  Rows: 9268, Columns: 10
  Memory usage: 0.71 MB
- INT_KenPom_Point_Distribution:
  Rows: 9269, Columns: 14
  Memory usage: 0.99 MB
- INT_KenPom_Summary:
  Rows: 9268, Columns: 16
  Memory usage: 1.13 MB
- REF_2024_Post-Season_Tournament_Teams:
  Rows: 131, Columns: 6
  Memory usage: 0.01 MB
- REF_Current_NCAAM_Coaches:
  Rows: 364, Columns: 4
  Memory usage: 0.01 MB


In [27]:
for name, df in dataframes.items():
    print(f"{name} - Shape: {df.shape}")
    print(df.head(), "\n")


DEV_March_Madness - Shape: (8314, 147)
   Season Short Conference Name  Adjusted Temo  Adjusted Tempo Rank  \
0    2015                   SEC           62.4                  274   
1    2021                   WCC           73.8                    7   
2    2024                    BE           64.6                  330   
3    2025                   SEC           67.8                  143   
4    2025                   ACC           65.3                  268   

   Raw Tempo  Raw Tempo Rank  Adjusted Offensive Efficiency  \
0       63.8             242                          121.3   
1       74.3              14                          126.4   
2       66.0             305                          127.5   
3       68.6             164                          130.4   
4       66.4             276                          124.0   

   Adjusted Offensive Efficiency Rank  Raw Offensive Efficiency  \
0                                   6                     115.5   
1                    

In [28]:
pd.set_option('display.max_columns', None)


# March Madness Historical Dataset, Model Selection

Model Selection:
- Season (Last 6 years)
- Adjusted Tempo (Ranked offensive field goal efficiency percentage)
    -Should create a YoY%
-Adjusted Tempo Rank
    -Offensive turnover percentage

In [35]:
march_madness[(march_madness['Mapped ESPN Team Name']=='Kentucky') & (march_madness['Season'] >=2020)]

Unnamed: 0,Season,Short Conference Name,Adjusted Temo,Adjusted Tempo Rank,Raw Tempo,Raw Tempo Rank,Adjusted Offensive Efficiency,Adjusted Offensive Efficiency Rank,Raw Offensive Efficiency,Raw Offensive Efficiency Rank,Adjusted Defensive Efficiency,Adjusted Defensive Efficiency Rank,Raw Defensive Efficiency,Raw Defensive Efficiency Rank,Avg Possession Length (Offense),Avg Possession Length (Offense) Rank,Avg Possession Length (Defense),Avg Possession Length (Defense) Rank,eFGPct,RankeFGPct,TOPct,RankTOPct,ORPct,RankORPct,FTRate,RankFTRate,OffFT,RankOffFT,Off2PtFG,RankOff2PtFG,Off3PtFG,RankOff3PtFG,DefFT,RankDefFT,Def2PtFG,RankDef2PtFG,Def3PtFG,RankDef3PtFG,Tempo,RankTempo,AdjTempo,RankAdjTempo,OE,RankOE,AdjOE,RankAdjOE,DE,RankDE,AdjDE,RankAdjDE,AdjEM,RankAdjEM,FG2Pct,RankFG2Pct,FG3Pct,RankFG3Pct,FTPct,RankFTPct,BlockPct,RankBlockPct,OppFG2Pct,RankOppFG2Pct,OppFG3Pct,RankOppFG3Pct,OppFTPct,RankOppFTPct,OppBlockPct,RankOppBlockPct,FG3Rate,RankFG3Rate,OppFG3Rate,RankOppFG3Rate,ARate,RankARate,OppARate,RankOppARate,StlRate,RankStlRate,OppStlRate,RankOppStlRate,DFP,NSTRate,RankNSTRate,OppNSTRate,RankOppNSTRate,AvgHeight,RankAvgHeight,CenterHeight,RankCenterHeight,PFHeight,RankPFHeight,SFHeight,RankSFHeight,SGHeight,RankSGHeight,PGHeight,RankPGHeight,EffectiveHeight,RankEffectiveHeight,Experience,RankExperience,Bench,BenchRank,CenterPts,RankCenterPts,PFPts,RankPFPts,SFPts,RankSFPts,SGPts,RankSGPts,PGPts,RankPGPts,CenterOR,RankCenterOR,PFOR,RankPFOR,SFOR,RankSFOR,SGOR,RankSGOR,PGOR,RankPGOR,CenterDR,RankCenterDR,PFDR,RankPFDR,SFDR,RankSFDR,SGDR,RankSGDR,PGDR,RankPGDR,Net Rating,Net Rating Rank,Mapped Conference Name,Mapped ESPN Team Name,Current Coach,Full Team Name,Since,Active Coaching Length,Active Coaching Length Index,Seed,Region,Correct Team Name?,Post-Season Tournament,Post-Season Tournament Sorting Index
141,2022,SEC,67.5,150,69.3,105,120.0,5,114.1,9,94.3,36,95.9,41,16.9,89.0,17.7,206.0,53.116214,44,16.653579,61,37.671233,4,28.022493,248,16.068123,278,60.940392,3,22.991485,350,16.349978,257,54.087494,73,29.562528,240,69.2574,105,67.4665,150,114.128,9,120.001,5,95.9264,41,94.2785,36,25.7224,6,53.511053,45,34.731544,101,72.575251,149,11.119691,69,47.258687,73,30.843707,41,72.124756,198,6.501951,16,27.928772,351,35.827552,114,53.398058,109,46.826347,77,0.097813,129,0.086008,94,,8.052743,77.0,7.715456,307.0,76.83,188.0,-0.16,177.0,1.15,50.0,0.36,124.0,-0.06,174.0,-1.92,322.0,0.49,105.0,1.87,186.0,27.52,252.0,20.57,151.0,24.84,31.0,17.7,249.0,18.95,224.0,17.95,254.0,36.17,171.0,39.26,18.0,8.88,327.0,8.6,254.0,7.08,209.0,32.95,21.0,33.04,7.0,10.79,356.0,11.62,341.0,11.6,302.0,25.7,141,Southeastern Conference (SEC),Kentucky,Mark Pope,Kentucky Wildcats,202425.0,0 years,1.0,Not In a Post-Season Tournament,Not In a Post-Season Tournament,,Not In a Post-Season Tournament,5
288,2025,SEC,70.5,38,72.5,22,124.7,3,119.3,9,102.1,89,105.1,167,16.0,39.0,17.1,81.0,56.508653,18,14.626303,24,30.860927,163,34.161023,147,17.968323,244,46.968869,258,35.062807,92,17.932752,248,48.443337,222,33.62391,101,72.5366,22,70.4937,38,119.285,9,124.727,3,105.129,167,102.102,89,22.625,22,56.504599,26,37.676056,33,72.46696,172,12.734584,45,52.144772,231,29.70297,22,70.761671,126,8.147175,84,42.738901,93,44.822485,331,57.763975,58,47.100176,69,0.089456,240,0.080967,30,,6.529599,57.0,5.027792,364.0,78.56,22.0,1.19,59.0,1.55,38.0,1.42,31.0,1.37,46.0,1.2,75.0,1.37,33.0,3.19647,4.0,36.55,68.0,16.99,252.0,20.92,136.0,20.93,135.0,21.57,144.0,19.5,197.0,32.1,249.0,31.73,72.0,7.51,352.0,13.0,121.0,15.56,26.0,22.85,269.0,31.87,9.0,16.61,270.0,16.01,187.0,12.57,266.0,22.6,286,Southeastern Conference (SEC),Kentucky,Mark Pope,Kentucky Wildcats,202425.0,0 years,1.0,Not In a Post-Season Tournament,Not In a Post-Season Tournament,,Not In a Post-Season Tournament,5
535,2024,SEC,72.7,12,73.9,8,122.2,7,118.9,5,102.9,109,106.4,217,15.6,18.0,17.1,71.0,57.152908,5,14.274418,26,28.894691,174,30.440901,246,17.052417,281,49.557522,219,33.390061,100,19.429658,154,48.060837,271,32.509506,92,73.9155,8,72.6918,12,118.948,5,122.18,7,106.36,217,102.887,109,19.2927,23,54.654655,32,40.875,1,77.195686,24,16.184519,1,49.413604,135,33.216783,135,71.468531,147,8.708709,133,37.523452,179,40.149743,284,54.881517,75,52.235551,226,0.10057,114,0.085565,92,,5.717877,11.0,5.798982,354.0,78.78,9.0,2.34,12.0,1.32,36.0,1.63,22.0,0.83,67.0,2.11,20.0,1.83,13.0,1.75941,196.0,35.27,89.0,14.26,332.0,15.65,318.0,25.12,29.0,22.61,92.0,22.34,129.0,41.21,94.0,30.82,98.0,17.07,174.0,5.03,342.0,5.83,278.0,25.52,186.0,24.9,126.0,16.55,273.0,13.42,311.0,19.59,25.0,19.3,534,Southeastern Conference (SEC),Kentucky,Mark Pope,Kentucky Wildcats,202425.0,0 years,1.0,3,South,Kentucky,March Madness,1
679,2020,SEC,67.7,215,68.5,212,112.7,24,108.0,37,95.1,52,96.2,70,17.4,183.0,17.7,251.0,50.69646,123,18.093201,122,30.832477,81,40.800929,18,24.273949,3,55.309926,51,20.416125,349,19.570522,131,49.097121,218,31.332357,153,68.4704,212,67.6714,215,108.022,37,112.665,24,96.1866,70,95.0609,52,17.6045,29,50.434783,126,34.279476,115,79.658606,1,13.636364,18,44.83066,25,30.703013,51,68.782161,75,7.747036,81,26.581544,350,38.317757,202,52.201258,166,46.722455,66,0.075739,290,0.072466,22,,10.84657,275.0,9.771264,180.0,78.04,31.0,1.73,31.0,1.93,11.0,-0.02,194.0,0.06,168.0,2.41,13.0,1.83,17.0,0.99,335.0,25.66,281.0,22.47,86.0,15.23,331.0,19.36,192.0,21.45,109.0,21.47,135.0,39.9,123.0,33.51,73.0,16.65,167.0,4.86,337.0,5.04,282.0,26.4,168.0,21.23,256.0,17.2,231.0,17.42,126.0,17.72,52.0,17.6,677,Southeastern Conference (SEC),Kentucky,Mark Pope,Kentucky Wildcats,202425.0,0 years,1.0,Not In a Post-Season Tournament,Not In a Post-Season Tournament,,Not In a Post-Season Tournament,5
705,2023,SEC,65.8,256,66.8,238,116.2,17,111.0,26,98.8,68,100.9,114,17.6,181.0,18.1,299.0,50.635697,178,17.485383,127,39.202658,1,31.93154,162,18.206951,183,56.556082,30,25.236967,323,18.611714,147,52.494577,121,28.893709,239,66.7883,238,65.7815,256,110.982,26,116.198,17,100.918,114,98.7991,68,17.3987,27,50.0,195,34.747145,137,70.597243,228,10.048232,103,48.633441,100,32.408759,86,74.350087,305,5.23743,2,29.97555,330,35.510627,127,55.113025,69,48.367594,114,0.095732,142,0.089613,156,,8.524124,140.0,6.994153,352.0,77.64,88.0,-0.26,192.0,1.83,18.0,0.58,103.0,1.1,61.0,-0.48,237.0,0.79,82.0,1.86,207.0,27.81,248.0,17.91,248.0,24.83,28.0,19.68,169.0,22.45,87.0,15.13,338.0,23.07,349.0,51.55,1.0,13.35,254.0,8.18,253.0,3.85,342.0,24.54,234.0,35.88,2.0,17.89,229.0,13.01,318.0,8.68,359.0,17.4,702,Southeastern Conference (SEC),Kentucky,Mark Pope,Kentucky Wildcats,202425.0,0 years,1.0,Not In a Post-Season Tournament,Not In a Post-Season Tournament,,Not In a Post-Season Tournament,5
1012,2021,SEC,68.1,181,70.0,132,108.0,84,99.7,198,93.1,35,99.4,113,17.1,151.0,17.2,147.0,47.086721,289,19.756581,217,32.644178,41,34.485095,90,21.067575,47,51.675185,145,27.25724,266,19.042189,123,52.90764,103,28.050171,265,70.0251,132,68.0633,181,99.738,198,108.047,84,99.3547,113,93.0778,35,14.9687,49,45.5,312,33.613445,172,72.888016,102,14.859438,4,46.586345,56,31.238095,49,74.05765,301,9.1,206,32.249322,292,34.516765,86,50.894309,196,49.044586,112,0.088876,191,0.087178,136,,11.038777,273.0,9.057458,249.0,78.95,5.0,2.18,18.0,1.95,14.0,2.13,6.0,1.61,34.0,2.31,12.0,2.06,14.0,1.0,335.0,31.72,158.0,24.02,52.0,16.46,282.0,22.76,73.0,17.29,273.0,19.43,195.0,34.0,206.0,25.17,211.0,25.12,35.0,7.37,276.0,8.29,157.0,26.28,152.0,24.5,130.0,21.83,85.0,14.46,243.0,12.9,234.0,14.9,1013,Southeastern Conference (SEC),Kentucky,Mark Pope,Kentucky Wildcats,202425.0,0 years,1.0,Not In a Post-Season Tournament,Not In a Post-Season Tournament,,Not In a Post-Season Tournament,5


# Bringing in Ollama to figure out how to shape this model

In [42]:
def create_analysis_prompt(df_slice):
    # First get basic statistics and info about the data
    column_info = df_slice.dtypes.to_dict()
    sample_size = len(df_slice)
    
    prompt = f"""
    Analyze this college basketball dataset for Kentucky (2020-present) to recommend a machine learning model structure.
    
    Dataset Overview:
    - Number of records: {sample_size}
    - Features available: {', '.join(df_slice.columns.tolist())}
    
    Data Sample:
    {df_slice.head(3).to_string()}
    
    Column Data Types:
    {pd.Series(column_info).to_string()}
    
    Statistical Summary:
    {df_slice.describe().to_string()}
    
    Based on this data:
    1. What type of machine learning model would be most appropriate?
    2. Which features appear most relevant for prediction?
    3. What data preprocessing steps would you recommend?
    4. Are there any potential issues with the data that need addressing?
    5. What evaluation metrics would be most appropriate for this basketball data?
    """
    
    return prompt

# Generate the prompt with your data slice
analysis_prompt = create_analysis_prompt(march_madness[(march_madness['Mapped ESPN Team Name']=='Kentucky') & 
                                                      (march_madness['Season'] >=2020)])


print(analysis_prompt)


    Analyze this college basketball dataset for Kentucky (2020-present) to recommend a machine learning model structure.
    
    Dataset Overview:
    - Number of records: 6
    - Features available: Season, Short Conference Name, Adjusted Temo, Adjusted Tempo Rank, Raw Tempo, Raw Tempo Rank, Adjusted Offensive Efficiency, Adjusted Offensive Efficiency Rank, Raw Offensive Efficiency, Raw Offensive Efficiency Rank, Adjusted Defensive Efficiency, Adjusted Defensive Efficiency Rank, Raw Defensive Efficiency, Raw Defensive Efficiency Rank, Avg Possession Length (Offense), Avg Possession Length (Offense) Rank, Avg Possession Length (Defense), Avg Possession Length (Defense) Rank, eFGPct, RankeFGPct, TOPct, RankTOPct, ORPct, RankORPct, FTRate, RankFTRate, OffFT, RankOffFT, Off2PtFG, RankOff2PtFG, Off3PtFG, RankOff3PtFG, DefFT, RankDefFT, Def2PtFG, RankDef2PtFG, Def3PtFG, RankDef3PtFG, Tempo, RankTempo, AdjTempo, RankAdjTempo, OE, RankOE, AdjOE, RankAdjOE, DE, RankDE, AdjDE, RankAdjDE, AdjE

In [44]:
stream_function(analysis_prompt)

Here are the answers to your questions:

**1. What type of machine learning model would be most appropriate?**

Based on the provided data, a logistic regression or decision tree model seems like a good fit. The data appears to be a classification problem with multiple features and target variable (number of points scored in a game). Logistic regression is suitable for multi-class classification problems, while decision trees can handle both categorical and continuous features.

**2. Which features appear most relevant for prediction?**

After examining the feature names, I would say that the following three features are most relevant:

* `points`: This feature has a strong relationship with the target variable (number of points scored in a game).
* `turns`: This feature also appears to be related to the target variable, although its significance is not as clear.
* `fg%`: This feature suggests that shot percentage might be an important factor in predicting scores.

**3. What data prepr