In [1]:
# !pip install psycopg2 psycopg2-binary sqlalchemy
import pandas as pd
import psycopg2 as psql
from sqlalchemy import create_engine

In [2]:
conn_string = "postgresql+psycopg2://postgres:Northeastern2025!@127.0.0.1/group1_nba"
db = create_engine(conn_string)
conn = db.connect()
connection = psql.connect(
    dbname="group1_nba",
    user="postgres",
    password="Northeastern2025!",
    host="127.0.0.1",
    port="5432"
)
connection.autocommit = True

# Create a cursor object
cursor = connection.cursor()

### Teams Table

In [3]:
df = pd.read_csv("./Data/teams.csv")
df.rename(columns={
    "id": "team_id",
    "nickname": "team_name", 
    "abbreviation": "team_abbr",
    "Conference": "conference",
    "Division": "division"
}, inplace=True)
df = df[["team_abbr", "team_id", "team_name", "city", "state", 
         "year_founded", "conference", "division"]]
df.head()

Unnamed: 0,team_abbr,team_id,team_name,city,state,year_founded,conference,division
0,ATL,1610612737,Hawks,Atlanta,Georgia,1949,East,Southeast
1,BOS,1610612738,Celtics,Boston,Massachusetts,1946,East,Atlantic
2,CLE,1610612739,Cavaliers,Cleveland,Ohio,1970,East,Central
3,NOP,1610612740,Pelicans,New Orleans,Louisiana,2002,West,Southwest
4,CHI,1610612741,Bulls,Chicago,Illinois,1966,East,Central


In [4]:
df.shape

(30, 8)

In [5]:
teams_query = """
DROP TABLE IF EXISTS teams;
CREATE TABLE teams (
    team_abbr CHAR(3) NOT NULL PRIMARY KEY,
    team_id CHAR(10),
    team_name VARCHAR(20),
    city VARCHAR(20),
    state VARCHAR(20),
    year_founded INT,
    conference VARCHAR(20),
    division VARCHAR(20)
);
"""
cursor.execute(teams_query)
_ = df.to_sql("teams", db, if_exists="replace", index=False)

### Games Table

In [6]:
df = pd.read_csv("./Data/games.csv")
df.rename(columns={
    "GAME_ID": "game_id",
    "SEASON_ID": "season_id",
    "GAME_DATE": "date",
    "HOME_POINTS": "home_points", 
    "AWAY_POINTS": "away_points", 
    "IS_PLAYOFFS": "is_playoff",
    "HOME_TEAM_ABBR": "home_team_abbr",
    "AWAY_TEAM_ABBR": "away_team_abbr"
}, inplace=True)
df = df[["game_id", "season_id", "date", "home_points", "away_points", 
         "is_playoff", "home_team_abbr", "away_team_abbr"]]
df.head()

Unnamed: 0,game_id,season_id,date,home_points,away_points,is_playoff,home_team_abbr,away_team_abbr
0,22401192,2024,2025-04-13,126.0,133,False,MIL,DET
1,22401197,2024,2025-04-13,111.0,118,False,SAS,TOR
2,22401186,2024,2025-04-13,93.0,105,False,ATL,ORL
3,22401194,2024,2025-04-13,62.0,97,False,MEM,DAL
4,22401200,2024,2025-04-13,87.0,98,False,SAC,PHX


In [7]:
df.shape

(6427, 8)

In [8]:
games_query = """
DROP TABLE IF EXISTS games;
CREATE TABLE games (
    game_id CHAR(8) NOT NULL PRIMARY KEY,
    season_id CHAR(4),
    date DATE,
    home_points FLOAT,
    away_points FLOAT,
    is_playoff BOOL,
    home_team_abbr CHAR(3),
    away_team_abbr CHAR(3)
);
"""
cursor.execute(games_query)
_ = df.to_sql("games", db, if_exists="replace", index=False)

### Players Table

In [9]:
df = pd.read_csv("./Data/players.csv")
df.rename(columns={
    "PLAYER_ID": "player_id",
    "TeamID": "team_id",
    "SEASON": "season_id", 
    "PLAYER": "name",
    "POSITION": "position",
    "BIRTH_DATE": "date_of_birth",
    "HEIGHT": "height", 
    "WEIGHT": "weight"
}, inplace=True)
df["height"] = df.height.map(lambda x: int(x.split("-")[0]) * 12 + int(x.split("-")[1]))
df = df[["player_id", "team_id", "season_id", "name", "position", 
         "date_of_birth", "height", "weight"]]
df.head()

Unnamed: 0,player_id,team_id,season_id,name,position,date_of_birth,height,weight
0,1629164,1610612737,2020,Brandon Goodwin,G,1995-10-02,72,180.0
1,1630233,1610612737,2020,Nathan Knight,F-C,1997-09-20,82,253.0
2,1628989,1610612737,2020,Kevin Huerter,G-F,1998-08-27,79,190.0
3,1630219,1610612737,2020,Skylar Mays,G,1997-09-05,76,205.0
4,101150,1610612737,2020,Lou Williams,G,1986-10-27,73,175.0


In [10]:
players = df[["player_id", "name", "position", "date_of_birth", "height", "weight"]]\
    .drop_duplicates(subset="player_id", keep="last")
players.head()

Unnamed: 0,player_id,name,position,date_of_birth,height,weight
11,203524,Solomon Hill,F,1991-03-18,78,226.0
19,1630536,Sharife Cooper,G,2001-06-11,73,176.0
23,101150,Lou Williams,G,1986-10-27,74,175.0
33,1630602,Chaundee Brown Jr.,F,1998-12-04,77,215.0
34,1631495,Donovan Williams,G,2001-09-06,78,190.0


In [11]:
players.shape

(885, 6)

In [12]:
players_query = """
DROP TABLE IF EXISTS players;
CREATE TABLE players (
    player_id VARCHAR(8) NOT NULL PRIMARY KEY,
    name VARCHAR(60),
    position VARCHAR(5),
    date_of_birth DATE,
    height INT,
    weight FLOAT
);
"""
cursor.execute(players_query)
_ = players.to_sql("players", db, if_exists="replace", index=False)

### Contracts Table

In [13]:
salaries = pd.read_csv("./Data/player_salaries.csv")
salaries = df.merge(salaries, left_on=["player_id", "season_id"], 
                    right_on=["PLAYER_ID", "SEASON"])\
            .merge(pd.read_csv("./Data/teams.csv"),
                    left_on="team_id", right_on="id")[[
                        "player_id", "abbreviation", "season_id", "Salary"
]].rename(columns={"Salary": "salary", "abbreviation": "team_abbr"})
salaries.head()

Unnamed: 0,player_id,team_abbr,season_id,salary
0,1629164,ATL,2020,1701593
1,1628989,ATL,2020,2761920
2,101150,ATL,2020,8000000
3,201568,ATL,2020,19500000
4,1629027,ATL,2020,6571800


In [14]:
salaries.shape

(2181, 4)

In [15]:
salaries_query = """
DROP TABLE IF EXISTS contracts;
CREATE TABLE contracts (
    player_id VARCHAR(8) NOT NULL,
    team_abbr CHAR(3) NOT NULL,
    season_id INT NOT NULL,
    salary INT
);
"""
cursor.execute(salaries_query)
_ = salaries.to_sql("contracts", db, if_exists="replace", index=False)

### Player Performance Table

In [16]:
df = pd.read_csv("./Data/players_game_stats.csv")
df = df.rename(columns={
    "PLAYER_ID": "player_id", 
    "GAME_ID": "game_id", 
    "MIN": "minutes",
    "PTS": "points", 
    "FGM": "field_goals_made",
    "FG_PCT": "field_goal_pct",
    "FG3M": "three_pointers_made",
    "FG3_PCT": "three_pointers_pct",
    "FTM": "free_throws_made",
    "FT_PCT": "free_throws_pct",
    "REB": "rebounds",
    "AST": "assists", 
    "STL": "steals", 
    "BLK": "blocks",
    "TOV": "turnovers"
})[[
    "player_id", "game_id", "minutes", "points", 
    "field_goals_made", "field_goal_pct", "three_pointers_made",
    "three_pointers_pct", "free_throws_made", "free_throws_pct",
    "rebounds", "assists", "steals", "blocks", "turnovers"
]]
df["player_id"] = df.player_id.astype("str").str.rstrip(".0")
df.head()

Unnamed: 0,player_id,game_id,minutes,points,field_goals_made,field_goal_pct,three_pointers_made,three_pointers_pct,free_throws_made,free_throws_pct,rebounds,assists,steals,blocks,turnovers
0,162899,22001080,10,0,0,0.0,0,0.0,0,,0,0,0,0,0
1,1629023,22001080,30,11,4,0.4,3,0.333,0,,9,4,2,1,2
2,1630173,22001069,42,23,10,0.625,0,,3,0.429,10,0,2,1,2
3,203932,22001076,12,2,1,0.25,0,0.0,0,,2,0,0,0,0
4,1626181,22001076,33,19,5,0.556,3,0.6,6,1.0,3,1,0,0,2


In [17]:
df.shape

(136668, 15)

In [18]:
player_performances_query = """
DROP TABLE IF EXISTS player_performances;
CREATE TABLE player_performances (
    player_id VARCHAR(8) NOT NULL,
    game_id CHAR(8) NOT NULL,
    minutes INT,
    points INT,
    field_goals_made INT,
    field_goals_pct FLOAT,
    three_pointers_made INT,
    three_pointers_pct FLOAT,
    free_throws_made INT,
    free_throws_pct FLOAT,
    rebounds INT,
    assists INT,
    blocks INT,
    turnovers INT
);
"""
cursor.execute(player_performances_query)
_ = df.to_sql("player_performances", db, if_exists="replace", index=False)

### Team Outcomes Table

In [19]:
df = pd.read_csv("./Data/team_season_stats.csv")
df = df.merge(pd.read_csv("./Data/teams.csv"), 
              left_on="team_id", right_on="id")[[
                  "abbreviation", "season_id", "wins", "losses", "made_playoffs"
              ]].rename(columns={"abbreviation": "team_abbr"})
df.head()


Unnamed: 0,team_abbr,season_id,wins,losses,made_playoffs
0,ATL,2020,41,31,True
1,BOS,2020,36,36,True
2,CLE,2020,22,50,False
3,NOP,2020,31,41,False
4,CHI,2020,31,41,False


In [20]:
df.shape

(150, 5)

In [21]:
team_outcomes_query = """
DROP TABLE IF EXISTS teams_outcomes;
CREATE TABLE teams_outcomes (
    team_abbr CHAR(3) NOT NULL,
    wins INT,
    losses INT,
    made_playoffs BOOL
);
"""
cursor.execute(team_outcomes_query)
_ = df.to_sql("teams_outcomes", db, if_exists="replace", index=False)

### Seasons

In [22]:
df = pd.DataFrame({
    "season_id": [2020, 2021, 2022, 2023, 2024],
    "season_start": [2020, 2021, 2022, 2023, 2024],
    "season_end": [2021, 2022, 2023, 2024, 2025]
})
df

Unnamed: 0,season_id,season_start,season_end
0,2020,2020,2021
1,2021,2021,2022
2,2022,2022,2023
3,2023,2023,2024
4,2024,2024,2025


In [23]:
df.shape

(5, 3)

In [24]:
seasons_query = """
DROP TABLE IF EXISTS seasons;
CREATE TABLE seasons (
    season_id INT NOT NULL,
    season_start INT,
    season_end INT
);
"""
cursor.execute(seasons_query)
_ = df.to_sql("seasons", db, if_exists="replace", index=False)