In [20]:
from pathlib import Path
import numpy as np
import pandas as pd

import re

# FRC Raw Data Processing
The raw data downloaded from The Blue Alliance's repository contains
information from all matches that have been played up to the 2019 season.
This data must be transformed to allow the SciKit-learn libraries to understand them.
This will be done using the Pandas Python library.

In [21]:
from main import filter_qual_matches
import math

match_table_header = ["event_and_match_key", "red1", "red2", "red3", "blue1", "blue2", "blue3", "red_score", "blue_score"]
all_event_years_folder = Path('../data/tba_raw_data/events')

processed_data_dtypes = np.dtype([
    ('red1_stdscr_won_event', float),
    ('red2_stdscr_won_event', float),
    ('red3_stdscr_won_event', float),
    ('blue1_stdscr_won_event', float),
    ('blue2_stdscr_won_event', float),
    ('blue3_stdscr_won_event', float),
    ('red1_number', int),
    ('red2_number', int),
    ('red3_number', int),
    ('blue1_number', int),
    ('blue2_number', int),
    ('blue3_number', int),
    ('red1_stdscr_stdscr_win_diff', float),
    ('red2_stdscr_stdscr_win_diff', float),
    ('red3_stdscr_stdscr_win_diff', float),
    ('blue1_stdscr_stdscr_win_diff', float),
    ('blue2_stdscr_stdscr_win_diff', float),
    ('blue3_stdscr_stdscr_win_diff', float),
    ('red1_stdscr_stdscr_lose_diff', float),
    ('red2_stdscr_stdscr_lose_diff', float),
    ('red3_stdscr_stdscr_lose_diff', float),
    ('blue1_stdscr_stdscr_lose_diff', float),
    ('blue2_stdscr_stdscr_lose_diff', float),
    ('blue3_stdscr_stdscr_lose_diff', float),
])

matches_won_by_red = pd.Series(dtype=bool)

data_layout = np.empty(0, dtype=processed_data_dtypes)

all_matches = pd.DataFrame(data_layout)

for year in range(2005, 2019, 1):
    year_folder = all_event_years_folder / str(year)
    for event_folder in year_folder.iterdir():
        event_match_data_file = event_folder / "{}_matches.csv".format(event_folder.name)

        if (not event_match_data_file.exists()): continue

        event_match_data = pd.read_csv(event_match_data_file, names=match_table_header)

        ordered_match_table = filter_qual_matches(event_match_data)

        team_prt_stats = {}

        for match in ordered_match_table.iterrows():
            c_row = match[1]
            previous_matches = ordered_match_table.iloc[:match[0],:]

            current_teams = [c_row["red1"], c_row["red2"], c_row["red3"], c_row["blue1"], c_row["blue2"], c_row["blue3"]]
            score_differences = []
            positive_team_score_diff = {}
            negative_team_score_diff = {}
            matches_won_by_team = {}

            for match2 in previous_matches.iterrows():
                row = match2[1]

                red_won = row["red_score"] > row["blue_score"]
                difference = abs(row["red_score"] - row["blue_score"])

                score_differences.append(abs(match2[1]["red_score"] - match2[1]["blue_score"]))

                for team in ["red1", "red2", "red3"]:
                    if row[team] not in positive_team_score_diff: positive_team_score_diff[row[team]] = []
                    if row[team] not in negative_team_score_diff: negative_team_score_diff[row[team]] = []
                    if row[team] not in matches_won_by_team: matches_won_by_team[row[team]] = 0
                    if red_won:
                        matches_won_by_team[row[team]] += 1
                        positive_team_score_diff[row[team]].append(difference)
                    else:
                        negative_team_score_diff[row[team]].append(difference)

                for team in ["blue1", "blue2", "blue3"]:
                    if row[team] not in positive_team_score_diff: positive_team_score_diff[row[team]] = []
                    if row[team] not in negative_team_score_diff: negative_team_score_diff[row[team]] = []
                    if row[team] not in matches_won_by_team: matches_won_by_team[row[team]] = 0
                    if red_won:
                        negative_team_score_diff[row[team]].append(difference)
                    else:
                        matches_won_by_team[row[team]] += 1
                        positive_team_score_diff[row[team]].append(difference)

            new_match_processed_row = []

            # zscore of matches won by team
            std = np.std(list(matches_won_by_team.values()))
            avg = np.average(list(matches_won_by_team.values()))
            try:
                for team in current_teams:
                    new_match_processed_row.append((matches_won_by_team[team] - avg) / std)
                # team numbers
                for team in current_teams:
                    team_number_pattern = re.compile("frc(\d*)")
                    matched = team_number_pattern.match(team)
                    new_match_processed_row.append(int(matched.group(1)))
            except:
                continue

            # zscore of differences
            diff_std = np.std(score_differences)
            diff_avg = np.average(score_differences)
            for team in current_teams:
                if len(positive_team_score_diff[team]) == 0:
                    new_match_processed_row.append(0)
                    continue
                team_diff_avg = np.average(positive_team_score_diff[team])
                team_diff_avg = 0 if math.isnan(team_diff_avg) else team_diff_avg
                new_match_processed_row.append(0 if diff_std == 0 else (team_diff_avg - diff_avg) / diff_std)
            for team in current_teams:
                if len(negative_team_score_diff[team]) == 0:
                    new_match_processed_row.append(0)
                    continue
                team_diff_avg = np.average(negative_team_score_diff[team])
                team_diff_avg = 0 if math.isnan(team_diff_avg) else team_diff_avg
                new_match_processed_row.append(0 if diff_std == 0 else (team_diff_avg - diff_avg) / diff_std)

            current_index = all_matches.index.max()
            current_index = current_index if not math.isnan(current_index) else -1
            all_matches.loc[current_index + 1] = new_match_processed_row
            matches_won_by_red.loc[current_index + 1] = c_row["red_score"] > c_row["blue_score"]
        print("{} finished".format(event_folder))
    print("{} finished".format(year))
print("Finished processing")
all_matches.to_csv(Path("../data/all_match_data.csv"))
matches_won_by_red.to_csv(Path("../data/all_match_result_data.csv"))


../data/tba_raw_data/events/2005/2005on finished
../data/tba_raw_data/events/2005/2005nj finished
../data/tba_raw_data/events/2005/2005ga finished
../data/tba_raw_data/events/2005/2005glr finished
../data/tba_raw_data/events/2005/2005cur finished
../data/tba_raw_data/events/2005/2005cmp finished
../data/tba_raw_data/events/2005/2005il finished
../data/tba_raw_data/events/2005/2005oh finished
../data/tba_raw_data/events/2005/2005new finished
../data/tba_raw_data/events/2005/2005wat finished
../data/tba_raw_data/events/2005/2005ny finished
../data/tba_raw_data/events/2005/2005co finished
../data/tba_raw_data/events/2005/2005pa finished
../data/tba_raw_data/events/2005/2005sc finished
../data/tba_raw_data/events/2005/2005wa finished
../data/tba_raw_data/events/2005/2005gal finished
../data/tba_raw_data/events/2005/2005wpi finished
../data/tba_raw_data/events/2005/2005arc finished
../data/tba_raw_data/events/2005/2005mi finished
../data/tba_raw_data/events/2005/2005tx finished
../data/tba_