In [1]:
import requests
import json
import re
import numpy as np
import pandas as pd

sex = "Men "
course = "Yard "

In [2]:
def separate_events(year):
    new_list = []
    for event in year:
        if "Time Trial" not in event:
            if "Relay" not in event:
                if "Swim-off" not in event:
                    if "Points" in event:
                        event = event.replace("\\\n", "\n ")
                        new_list.append(event)
    return new_list


In [3]:
with open('yr2009.rtf', 'r') as myfile:
    yr2009 = myfile.read()
    yr2009 = yr2009.split("Event ")
    
yr = separate_events(yr2009)
yr[1]



In [4]:
def split_event_by_line(string):
    if "Points" in string:
        string = string.split("Points")[1]
        string = string.replace("\r","")
        if "Bradley\n" in string:
            string = string.replace("Bradley\n", "Bradley")
        list_of_strings = string.split("\n")
        return list_of_strings

fr50 = split_event_by_line(yr[3])
fr50

[' ',
 ' === A - Final === ',
 ' 1 Dickerson, Daniel JR Princeton 295.70 317.85A 32 ',
 ' 2 Kambe, C.J. JR Brown 284.50 290.95 28 ',
 ' 3 Cragg, Jeff FR Pennsylvania 276.80 286.60 27 ',
 ' 4 Sheppard, Michael SO Cornell 252.85 276.50 26 ',
 ' 5 Donohoe, Chris SR Cornell 269.90 262.00 25 ',
 ' 6 Fraiman, Yarden SR Princeton 257.00 259.90 24 ',
 ' 7 Staab, Colton FR Yale 268.65 257.60 23 ',
 ' 8 Teer, Drew JR Yale 259.25 248.70 22 ',
 ' === B - Final === ',
 ' 9 Ranta, Zac SO Harvard 242.15 283.15 20 ',
 ' 10 Plante, Chris FR Dartmouth 238.55 260.55 17 ',
 ' 11 Levkoff, David SO Columbia 229.85 255.05 16 ',
 ' 12 Olson, Eric SO Yale 239.75 249.95 15 ',
 ' 13 DeMarle, Pat SO Cornell 228.70 244.10 14 ',
 ' 14 Feldman, Jonathan FR Brown 228.50 243.25 13 ',
 ' 15 Brown, Mikey SO Dartmouth 230.80 235.80 12 ',
 ' 16 Wells, Tom FR Princeton 239.60 219.90 11 ',
 ' === Preliminaries === ',
 ' 17 Winslow, Henry SO Harvard 219.70 9 ',
 ' 18 Hull, Jeff SR Columbia 204.25 7 ',
 ' 19 Marple, Alex FR P

In [8]:
def extract_rank_from_line(line):
    rank1 = re.compile(r'\ \d \ ')
    rank2 = re.compile(r'\ \d\d \ ')
    if line[:3]:
        try:
            return int(line[:3])
        except ValueError:
            return 0
    else:
        return 0

def extract_name_from_line(line):
    comma = re.search(r'\,', line).start()
    last_name = line[comma::-1].split(" ")[0][::-1]
    first_name = line[comma+2:].split(" ")[0]
    name = last_name + " " + first_name
    return name

def extract_year_from_line(line):
    years = [" FR ", " SO ", " JR ", " SR "]
    for year in years:
        if year in line:
            return year.replace(" ", "")
        if not any(yr in line for yr in years):
            ix = re.search(r'\d', line[6:]).start()
            return line[ix+6:ix+8]

def extract_school_from_line(line):
    year = extract_year_from_line(line) + " "
    school = line.split(year)[1]
    school = school.split(" ")[0]
    return school

def extract_times_from_line(line):
    time1 = re.compile(r'\d*\:\d\d\.\d\d')
    time2 = re.compile(r'\d\d\d\.\d\d')
    time3 = re.compile(r'\d\d\.\d\d')
    line_ix = [(m.start(0), m.end(0)) for m in re.finditer(time1, line)]
    if len(line_ix) == 0:
        line_ix = [(m.start(0), m.end(0)) for m in re.finditer(time2, line)]
        if len(line_ix) == 0:
            line_ix = [(m.start(0), m.end(0)) for m in re.finditer(time3, line)]        
    if len(line_ix) == 2:
        p_start = line_ix[0][0]
        p_end = line_ix[0][1]
        f_start = line_ix[1][0]
        f_end = line_ix[1][1]
        prelims = line[p_start:p_end]
        finals = line[f_start:f_end]
    elif len(line_ix) == 1:
        f_start = line_ix[0][0]
        f_end = line_ix[0][1]
        finals = line[f_start:f_end]
        prelims = "N/A"
    return [prelims, finals]

def extract_points_from_line(line):
    try: 
        return float(line[-3:])
    except ValueError:
        return 0

def extract_data_from_line(line):
    rank = extract_rank_from_line(line)
    if rank:
        name = extract_name_from_line(line)
        year = extract_year_from_line(line)
        school = extract_school_from_line(line)
        prelim = extract_times_from_line(line)[0]
        final = extract_times_from_line(line)[1]
        points = extract_points_from_line(line)
        return [rank, name, year, school, prelim, final, points]

def extract_all_data_from_event(event):
    list_of_lines = split_event_by_line(event)
    nlist = []
    for line in list_of_lines:
        if line:
            d = extract_data_from_line(line)
            if d:
                nlist.append(d)
    return nlist

extracted_event = extract_all_data_from_event(yr[3])
extracted_event

[[1, 'Dickerson, Daniel', 'JR', 'Princeton', '295.70', '317.85', 32.0],
 [2, 'Kambe, C.J.', 'JR', 'Brown', '284.50', '290.95', 28.0],
 [3, 'Cragg, Jeff', 'FR', 'Pennsylvania', '276.80', '286.60', 27.0],
 [4, 'Sheppard, Michael', 'SO', 'Cornell', '252.85', '276.50', 26.0],
 [5, 'Donohoe, Chris', 'SR', 'Cornell', '269.90', '262.00', 25.0],
 [6, 'Fraiman, Yarden', 'SR', 'Princeton', '257.00', '259.90', 24.0],
 [7, 'Staab, Colton', 'FR', 'Yale', '268.65', '257.60', 23.0],
 [8, 'Teer, Drew', 'JR', 'Yale', '259.25', '248.70', 22.0],
 [9, 'Ranta, Zac', 'SO', 'Harvard', '242.15', '283.15', 20.0],
 [10, 'Plante, Chris', 'FR', 'Dartmouth', '238.55', '260.55', 17.0],
 [11, 'Levkoff, David', 'SO', 'Columbia', '229.85', '255.05', 16.0],
 [12, 'Olson, Eric', 'SO', 'Yale', '239.75', '249.95', 15.0],
 [13, 'DeMarle, Pat', 'SO', 'Cornell', '228.70', '244.10', 14.0],
 [14, 'Feldman, Jonathan', 'FR', 'Brown', '228.50', '243.25', 13.0],
 [15, 'Brown, Mikey', 'SO', 'Dartmouth', '230.80', '235.80', 12.0],
 

In [9]:
def get_event_name_num(event):
    if "Diving" not in event:
        try:
            event_num = int(event[:2])
        except ValueError:
            event_num = "NaN"
        dist = int(event.split(sex)[1].split(" ")[0])
        stroke = event.split(course)[1].split("=")[0]
        if (type(event_num) == int) and event_num < 25:
            return event_num, dist, stroke
    else:
        try:
            event_num = int(event[:2])
        except ValueError:
            event_num = "NaN"
        dist = int(event.split(sex)[1].split(" ")[0])
        stroke = "Diving"
        if (type(event_num) == int) and event_num < 25:
            return event_num, dist, stroke

get_event_name_num(yr[2])

(4, 50, 'Freestyle')

In [10]:
def pandize_meet(year):
    meet_results = pd.DataFrame(columns = ["Ev#", "Distance", "Stroke", "Rank", "Name", "Yr", "School", "Prelim", "Final", "Points"])

    events_list = separate_events(year)    
    for event in events_list:
        event_name_num = get_event_name_num(event)
        print(event_name_num)
        event_data = extract_all_data_from_event(event)
        pandized_event = pd.DataFrame(event_data)
        pandized_event.columns = ["Rank", "Name", "Yr", "School", "Prelim", "Final", "Points"]
        pandized_event["Ev#"] = event_name_num[0]
        pandized_event["Distance"] = event_name_num[1]
        pandized_event["Stroke"] = event_name_num[2]
        meet_results = meet_results.append(pandized_event)
    return meet_results

pandize_meet(yr2009)

(2, 500, 'Freestyle')
(3, 200, 'IM')
(4, 50, 'Freestyle')
(5, 1, 'Diving')
(8, 1000, 'Freestyle\n ')
(9, 400, 'IM\n ')
(10, 100, 'Butterfly\n ')
(11, 200, 'Freestyle\n ')
(12, 100, 'Breaststroke\n ')
(13, 100, 'Backstroke\n ')
(15, 1650, 'Freestyle\n ')
(16, 200, 'Backstroke\n ')
(17, 100, 'Freestyle\n ')
(18, 200, 'Breaststroke\n ')
(19, 200, 'Butterfly\n ')
(20, 3, 'Diving')


Unnamed: 0,Distance,Ev#,Final,Name,Points,Prelim,Rank,School,Stroke,Yr
0,500,2,4:18.04,"McNamara, Travis",32,4:20.93,1,Princeton,Freestyle,FR
1,500,2,4:19.26,"Newman, Wes",28,4:21.15,2,Cornell,Freestyle,SR
2,500,2,4:19.91,"Lynch, Eric",27,4:21.27,3,Harvard,Freestyle,SR
3,500,2,4:20.19,"Biggs, Patrick",26,4:22.33,4,Princeton,Freestyle,SO
4,500,2,4:22.82,"Meyer, Alex",25,4:22.25,5,Harvard,Freestyle,JR
5,500,2,4:25.33,"Lewkowitz, Blake",24,4:22.44,6,Harvard,Freestyle,SO
6,500,2,4:25.68,"Griest, Robert",23,4:24.06,7,Princeton,Freestyle,SR
7,500,2,4:25.96,"Eckel, Dan",22,4:23.09,8,Princeton,Freestyle,SR
8,500,2,4:22.69,"Fee, James",20,4:25.48,9,Pennsylvania,Freestyle,JR
9,500,2,4:23.88,"Hanna, Colin",17,4:25.16,10,Princeton,Freestyle,SO


In [None]:
get_event_name_num(yr[3])

In [13]:
rank1 = re.compile(r'\d')
rank1 = " " + rank1 + " "
rank1

TypeError: Can't convert '_sre.SRE_Pattern' object to str implicitly