## Import Modules and Read Data

In [1]:
from collections import defaultdict
from itertools import combinations

import pandas

# Locations of the two input CSV files.
stats_dataset_path = "../data/player_stats_season.csv"
players_dataset_path = "../data/players.csv"

# Destinations for the node list and edge list written later in the notebook.
node_list_output_path = "../data/gephi-node-list.csv"
edge_list_output_path = "../data/gephi-edge-list.csv"

# Read both inputs into DataFrames.
stats_df, players_df = (
    pandas.read_csv(csv_path)
    for csv_path in (stats_dataset_path, players_dataset_path)
)

# Show the column names of both frames, separated by a blank line.
print(stats_df.columns, "\n")
print(players_df.columns)

Index(['season', 'season_type', 'player_id', 'player_name',
       'player_display_name', 'position', 'position_group', 'headshot_url',
       'games', 'recent_team', 'completions', 'attempts', 'passing_yards',
       'passing_tds', 'interceptions', 'sacks', 'sack_yards', 'sack_fumbles',
       'sack_fumbles_lost', 'passing_air_yards', 'passing_yards_after_catch',
       'passing_first_downs', 'passing_epa', 'passing_2pt_conversions', 'pacr',
       'dakota', 'carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles',
       'rushing_fumbles_lost', 'rushing_first_downs', 'rushing_epa',
       'rushing_2pt_conversions', 'receptions', 'targets', 'receiving_yards',
       'receiving_tds', 'receiving_fumbles', 'receiving_fumbles_lost',
       'receiving_air_yards', 'receiving_yards_after_catch',
       'receiving_first_downs', 'receiving_epa', 'receiving_2pt_conversions',
       'racr', 'target_share', 'air_yards_share', 'wopr', 'special_teams_tds',
       'fantasy_points', 'fantasy_poin

## Clean Data

In [2]:
# Keep only the rows whose season_type is the combined "REG+POST" value.
is_reg_post = stats_df["season_type"].eq("REG+POST")
stats_df = stats_df.loc[is_reg_post]

In [3]:
# Take the college attributes for each player and rename the key column to
# "player_id", the name of the id column in the stats frame.
colleges_df = (
    players_df[["gsis_id", "college_name", "college_conference"]]
    .rename(columns={"gsis_id": "player_id"})
)

In [4]:
# Attach the college columns to the stats rows. This is an inner join (the
# pandas default), so rows without a matching player_id on the other side
# are dropped.
merged_df = stats_df.merge(colleges_df, how="inner", on="player_id")

In [5]:
# Columns carried forward into the node and edge lists.
columns = [
    "player_id",
    "player_display_name",
    "season",
    "recent_team",
    "position",
    "college_name",
    "college_conference",
    "fantasy_points",
]

# Restrict to those columns and discard every row with a missing value.
data = merged_df.loc[:, columns].dropna()
print("Number of players:", data["player_id"].nunique())

Number of players: 1962


## Create Node List

In [6]:
# One row per player: when a player_id occurs several times, only its final
# occurrence (in the current row order of `data`) is kept.
is_earlier_duplicate = data.duplicated(subset="player_id", keep="last")
node_list_df = data[~is_earlier_duplicate]

In [7]:
# Total fantasy points per player, summed over every row that player has in
# `data`. A grouped sum replaces the row-by-row `iterrows` accumulation and
# the per-player boolean-mask lookup, which scanned the whole node list once
# for each player.
total_fantasy_points = data.groupby("player_id")["fantasy_points"].sum()

# Totals are truncated toward zero to whole numbers (what int() did before)
# and cast back to float so the column keeps its floating-point dtype.
# `assign` returns a new frame instead of writing into a frame obtained by
# filtering `data`, which can raise SettingWithCopyWarning.
node_list_df = node_list_df.assign(
    fantasy_points=node_list_df["player_id"]
    .map(total_fantasy_points)
    .astype("int64")
    .astype("float64")
)

In [8]:
# Remove the season column and switch to the ID / name / team headers used in
# the exported node list.
node_list_df = node_list_df.drop(columns="season").rename(
    columns={
        "player_id": "ID",
        "player_display_name": "name",
        "recent_team": "team",
    }
)
node_list_df

Unnamed: 0,ID,name,team,position,college_name,college_conference,fantasy_points
2949,00-0016919,Adam Vinatieri,NE,K,South Dakota State,Missouri Valley Football Conference,4.0
3746,00-0022531,Jason Peters,BUF,T,Arkansas,Southeastern Conference,12.0
4177,00-0022045,Terence Newman,DAL,CB,Kansas State,Big Twelve Conference,6.0
4266,00-0022923,DeAngelo Hall,ATL,FS,Virginia Tech,Atlantic Coast Conference,0.0
4755,00-0023252,Robbie Gould,CHI,K,Penn State,Big Ten Conference,0.0
...,...,...,...,...,...,...,...
15166,00-0039917,Michael Penix,ATL,QB,Washington,Pacific Twelve Conference,44.0
15167,00-0039918,Caleb Williams,CHI,QB,Southern California,Pacific Twelve Conference,254.0
15168,00-0039919,Rome Odunze,CHI,WR,Washington,Pacific Twelve Conference,90.0
15169,00-0039920,Malachi Corley,NYJ,WR,Western Kentucky,Conference USA,4.0


In [9]:
# Write the node list to disk; the DataFrame index is not included in the file.
node_list_df.to_csv(path_or_buf=node_list_output_path, index=False)

## Create Edge List

In [10]:
# Map each (team, season) pair to the list of player ids that have a row in
# `data` for that team in that season, in row order.
teams = defaultdict(list)
roster_rows = zip(data["recent_team"], data["season"], data["player_id"])
for team_code, season, player_id in roster_rows:
    teams[(team_code, season)].append(player_id)

In [11]:
# Weighted teammate edges: for every (team, season) roster, each unordered
# pair of players has its weight increased by one, so the final weight is the
# number of rosters that contain both players.
edges = defaultdict(int)
for roster in teams.values():
    # sorted(set(...)) builds a fresh, de-duplicated list: the rosters stored
    # in `teams` are left untouched, and a player_id repeated within one
    # roster can no longer create a self-loop or count a pair twice.
    # Sorting gives each pair a canonical (smaller id, larger id) order so the
    # same two players always map to the same key.
    unique_players = sorted(set(roster))
    for pair in combinations(unique_players, 2):
        edges[pair] += 1

In [12]:
# Flatten the {(source, target): weight} mapping into one row per edge.
edge_list = [
    [source_id, target_id, pair_weight]
    for (source_id, target_id), pair_weight in edges.items()
]
edge_list_df = pandas.DataFrame(edge_list, columns=["Source", "Target", "Weight"])
edge_list_df

Unnamed: 0,Source,Target,Weight
0,00-0020531,00-0021547,3
1,00-0021206,00-0022084,3
2,00-0016919,00-0019596,1
3,00-0016919,00-0022943,1
4,00-0019596,00-0022943,7
...,...,...,...
42753,00-0038797,00-0039811,1
42754,00-0038809,00-0039811,1
42755,00-0038996,00-0039811,1
42756,00-0039144,00-0039811,1


In [13]:
# Write the edge list to disk; the DataFrame index is not included in the file.
edge_list_df.to_csv(path_or_buf=edge_list_output_path, index=False)