In [2]:
import pandas as pd
import numpy as np
import os
import json
import time
import datetime
from dateutil.relativedelta import relativedelta
from chessratings import uscf_elo


In [3]:
all_wine_reviews = []
for f in os.listdir('raw_data'):
    filename = 'raw_data/' + f
    with open(filename, 'r') as contents:
        user_wine_reviews = json.loads(contents.read())
        user_id = f.split('.')[0]
        scrape_date_unix = os.path.getmtime('raw_data/4057966.json')
        scrape_date = datetime.datetime.fromtimestamp(scrape_date_unix)
        for u in user_wine_reviews:
            u['reviewer'] = user_id
            u['scrape_date'] = scrape_date

            all_wine_reviews.append(u)

wine_review_df = pd.json_normalize(all_wine_reviews)

In [4]:
def nearest(items, pivot):
    return min(items, key=lambda x: abs(x - pivot))


def compute_date(scrape_date, review_date, review_time_ago):
    review_month = review_date[5:8]
    review_day = review_date[-20:-18].strip()

    if 'over' in review_time_ago:
        crop_offset_string = review_time_ago.split('over')[1].strip()
        offset_period = int(crop_offset_string[:2].strip())

        min_date = scrape_date - relativedelta(months=12*(offset_period+1))
        max_date = scrape_date - relativedelta(months=12*offset_period)

        candidate_years = [min_date.year, max_date.year]
        candidate_dates = [datetime.datetime.strptime(review_day + ' ' + review_month + ' ' + str(y), '%d %b %Y') for y in candidate_years]

        final_review_date = [d for d in candidate_dates if d > min_date and d < max_date][0]

    elif 'almost' in review_time_ago:
        crop_offset_string = review_time_ago.split('almost')[1].strip()
        offset_period = int(crop_offset_string[:2].strip())

        min_date = scrape_date - relativedelta(months=12*offset_period)
        max_date = scrape_date

        candidate_years = [min_date.year, max_date.year]
        candidate_dates = [datetime.datetime.strptime(review_day + ' ' + review_month + ' ' + str(y), '%d %b %Y') for y in candidate_years]

        final_review_date = [d for d in candidate_dates if d > min_date and d < max_date][0]      

    else:
        if 'about' in review_time_ago:
            crop_offset_string = review_time_ago.split('about')[1].strip()
            offset_period = int(crop_offset_string[:2].strip())
        else:
            offset_period = int(review_time_ago[:2].strip())

        if 'month' in review_time_ago:
            offset_scrape_date = scrape_date - relativedelta(months=offset_period)
        elif 'year' in review_time_ago:
            offset_scrape_date = scrape_date - relativedelta(months=12*offset_period)
        else:
            offset_scrape_date = scrape_date
        
        candidate_years = [offset_scrape_date.year - 1, offset_scrape_date.year, offset_scrape_date.year + 1]
        try:
            candidate_dates = [datetime.datetime.strptime(review_day + ' ' + review_month + ' ' + str(y), '%d %b %Y') for y in candidate_years]
        # in some fringe cases, we may be dealing with February 29th, which only exists on leap years
        except ValueError:
            candidate_dates = [datetime.datetime.strptime(str(int(review_day) - 1) + ' ' + review_month + ' ' + str(y), '%d %b %Y') for y in candidate_years]
        
        final_review_date = nearest(candidate_dates, offset_scrape_date)
    
    return final_review_date


# function to create a compound ID that uniquely identifies a wine by its vintage, review year 
def create_wine_year_id(wine_id, vintage, review_year):
    compound_id = str(wine_id) + '-' + str(vintage) + '-' + str(review_year)
    return compound_id


def clean_wine_reviews(review_df):
    review_df['final_review_date'] = review_df.apply(lambda x: compute_date(scrape_date, x['review_date'], x['review_time_ago']), axis=1)
    review_df['review_year'] = review_df['final_review_date'].apply(lambda x: x.year)

    # drop any reviews that don't have a vintage specified. N.V. is acceptable, but blank vintage is not. 
    review_df['vintage'].replace({'': np.nan}, inplace=True)
    review_df.dropna(subset=['vintage'], axis=0, inplace=True)

    review_df['wine_year_id'] = review_df.apply(lambda x: create_wine_year_id(x['wine_id'], x['vintage'], x['review_year']), axis=1)

    just_reviews = review_df[['reviewer', 'wine_year_id', 'vintage', 'review_year', 'wine_id', 'rating', 'final_review_date']]
    return just_reviews


In [8]:

from itertools import combinations


def compute_head_to_head_result(wine_0, wine_1, rating_0, rating_1):
    if rating_0 > rating_1:
        return wine_0
    elif rating_0 < rating_1:
        return wine_1
    else:
        return np.nan


def player_info_lookup(wine_year_id, score_lookup_table):
    score_lookup_table_filtered = score_lookup_table.loc[score_lookup_table['wine_year_id'] == wine_year_id]
    if score_lookup_table_filtered.empty:
        elo_rating = None
        tournament_number = 0
        nr_games_played = 0
        nr_wins = 0
        nr_losses = 0
    else:
        score_lookup = score_lookup_table_filtered.loc[score_lookup_table_filtered['tournament_number'] == max(score_lookup_table_filtered['tournament_number'])].iloc[0]
        elo_rating = score_lookup['elo_rating']
        tournament_number = score_lookup['tournament_number']
        nr_games_played = sum(score_lookup_table_filtered['nr_games_played'])
        nr_wins = sum(score_lookup_table_filtered['nr_wins'])
        nr_losses = sum(score_lookup_table_filtered['nr_losses'])
    
    return elo_rating, tournament_number, nr_games_played, nr_wins, nr_losses

def match_format(combo, review_table):
    id_0 = combo[0]
    id_1 = combo[1]
    rating_0 = review_table.at[combo[0], 'rating']
    rating_1 = review_table.at[combo[1], 'rating']

    # note: still need to fix this
    if isinstance(rating_0, float) and isinstance(rating_1, float):
        result = compute_head_to_head_result(id_0, id_1, rating_0, rating_1)
        match_result = ((id_0, id_1), result)
    
        return match_result


def run_tournaments(review_df, score_lookup_table):
    review_dates = sorted(list(set(review_df['final_review_date'])))
    for r in review_dates:
        review_df_date = review_df.loc[review_df['final_review_date'] == r]
        reviewers = sorted(list(set(review_df_date['reviewer'])))
        for u in reviewers:
            review_df_slice = review_df_date.loc[review_df_date['reviewer'] == u]
            # In some rare cases, an individual may have rated an individual wine more than once in one day. In this case, we eliminate one of these reviews
            review_df_slice = review_df_slice[~review_df_slice.index.duplicated(keep='first')]
            
            players = []
            unique_players = list(set(review_df_slice.index))

            for u in unique_players:
                rating, tournament_number, nr_games_played, nr_wins, nr_losses = player_info_lookup(u, score_lookup_table)
                p = uscf_elo.Player(u, rating, nr_games_played, nr_wins, nr_losses, tournament_number)
                players.append(p)

            combos = list(combinations(review_df_slice.index, 2))
            tournament_results = []
            for c in combos:
                match_result = match_format(c, review_df_slice)
                tournament_results.append(match_result)

            tournament = uscf_elo.Tournament(players=players, tournament_results=tournament_results, tournament_date=r)
            if tournament.valid_tournament:
                try:
                    updated_scores = tournament.run_tournament()                        
                    score_lookup_entry_table = pd.DataFrame(updated_scores, columns=['wine_year_id', 'tournament_date', 'tournament_number', 'nr_games_played', 'nr_wins', 'nr_draws', 'nr_losses', 'elo_rating'])
                    score_lookup_table = score_lookup_table.append(score_lookup_entry_table)
                except:
                    continue
    return score_lookup_table
                # [print(u) for u in updated_scores]

wine_reviews = clean_wine_reviews(wine_review_df).head(10000)
wine_reviews = wine_reviews.set_index(['wine_year_id'])
score_lookup_table_columns = ['wine_year_id', 'tournament_date', 'tournament_number', 'elo_rating', 'nr_games_played', 'nr_wins', 'nr_losses']
score_lookup_table = pd.DataFrame(columns=score_lookup_table_columns)
all_results = run_tournaments(wine_reviews, score_lookup_table)

# all_results.to_csv('all_results.csv')
print(all_results.shape)
print(all_results.sort_values(by=['tournament_number'], ascending=False).head(30))

(6606, 8)
          wine_year_id tournament_date tournament_number   elo_rating  \
0    1414284-2011-2013      2013-04-07                 0  1100.000000   
0  161613482-2018-2020      2020-07-13                 0  1700.000000   
3  164394152-2019-2020      2020-07-15                 0  1600.000000   
2  156570680-2018-2020      2020-07-15                 0  1600.000000   
1  156220915-2018-2020      2020-07-15                 0  1100.000000   
0  161083957-2019-2020      2020-07-15                 0  1100.000000   
2  147820657-2016-2020      2020-07-13                 0  1700.000000   
1    4702409-2012-2020      2020-07-13                 0  1300.000000   
0  150116080-2013-2020      2020-07-13                 0   900.000000   
3  161363487-2019-2020      2020-07-13                 0  1166.666667   
2  159813766-2018-2020      2020-07-13                 0  1166.666667   
1  159786937-2018-2020      2020-07-13                 0  1166.666667   
4  159542061-2019-2020      2020-07-12   

In [6]:
all_results.loc[all_results['wine_year_id'] == '156566236-2018-2021']

Unnamed: 0,wine_year_id,tournament_date,tournament_number,elo_rating,nr_games_played,nr_wins,nr_losses,nr_draws


In [7]:
day_results = results.loc[results['date'] == '2015-05-31']
print(day_results)

NameError: name 'results' is not defined

In [12]:
player = uscf_elo.Player(rating=None, nr_games_played=7, nr_wins=0, nr_losses=7)
print(player.initialized_rating)

1300
