In [1]:
import pandas as pd
from rapidfuzz import process, fuzz
from concurrent.futures import ThreadPoolExecutor
import math

In [3]:
# Load the known-player export. The optional 'is_guest' column is dropped
# *before* de-duplicating so that rows which differ only in that flag collapse
# into one; errors='ignore' keeps this working when the column is absent.
df = (
    pd.read_csv('new_players.csv')
    .drop(columns='is_guest', errors='ignore')
    .drop_duplicates()
)
df.head()

Unnamed: 0,user_id,event_id,entrant_name
0,248637.0,1115951,LAGr | Nain Jr
1,2235362.0,1115951,TACOLOREADO | WORF
2,2261338.0,1115951,BOD | Reina Rata
3,734197.0,1115951,TwT | Pooters
4,2207214.0,1115951,BungeeGumHN


In [5]:
# Pools
# NOTE(review): pool entrants are tagged user_id 0 while bracket entrants are
# tagged -1. The matching cell below treats only -1 as "needs matching", so
# pool rows end up in the master list with user_id 0 -- confirm this is intended.
df2 = (
    pd.read_csv('pools.csv')[['Player', 'Event_Id']]
    .rename(columns={'Player': 'entrant_name', 'Event_Id': 'event_id'})
    .assign(user_id=0)
)

# Brackets: read the file once and take each player column in turn instead of
# parsing the same CSV twice.
brackets = pd.read_csv('brackets.csv')
df3 = (
    brackets[['Player 1', 'Event Id']]
    .rename(columns={'Player 1': 'entrant_name', 'Event Id': 'event_id'})
    .assign(user_id=-1)
)
df4 = (
    brackets[['Player 2', 'Event Id']]
    .rename(columns={'Player 2': 'entrant_name', 'Event Id': 'event_id'})
    .assign(user_id=-1)
)

df = pd.concat([df, df2, df3, df4], axis=0, ignore_index=True)

# Normalise the id columns first: missing event ids become 0 and missing user
# ids become -1.
df['event_id'] = df['event_id'].fillna(0).astype(int)
df['user_id'] = df['user_id'].fillna(-1).astype(int)

# De-duplicate only after normalisation, so rows that become identical once
# NaN ids are replaced by their sentinel values are removed as well.
df = df.drop_duplicates()
df.tail(10)

Unnamed: 0,user_id,event_id,entrant_name
33413,-1,51,Glypherson
33414,-1,51,Noka
33415,-1,51,CODEXgameing
33416,-1,51,ReverendSID
33417,-1,51,Fl0w
33418,-1,51,TheSaltySultan
33419,-1,51,Pogchampcringe
33420,-1,51,UchihaJorg
33421,-1,51,Solar
33422,-1,51,DrButtons


In [6]:
# Keep only the text after the last '|' separator, trimmed and lower-cased.
# The .str accessor propagates missing names as NaN, whereas calling
# x.split(...) inside apply() raises AttributeError on a NaN entrant_name.
# A single-character pattern is split literally, not as a regular expression.
df['clean_name'] = (
    df['entrant_name']
    .str.split('|')
    .str[-1]
    .str.strip()
    .str.lower()
)
# First character of the cleaned name.
df['name_ind'] = df['clean_name'].str[0]
df.head()

Unnamed: 0,user_id,event_id,entrant_name,clean_name,name_ind
0,248637,1115951,LAGr | Nain Jr,nain jr,n
1,2235362,1115951,TACOLOREADO | WORF,worf,w
2,2261338,1115951,BOD | Reina Rata,reina rata,r
3,734197,1115951,TwT | Pooters,pooters,p
4,2207214,1115951,BungeeGumHN,bungeegumhn,b


In [8]:
# Rows whose user_id is the -1 sentinel are the ones to be matched; every
# other row forms the list of candidates. reset_index() keeps the previous
# index as an 'index' column in both frames.
new_df = df.loc[df['user_id'].eq(-1)].reset_index()
master_list = df.loc[df['user_id'].ne(-1)].reset_index()

# Function to perform fuzzy matching and maintain associated data
def fuzzy_match(row, master_list):
    # Perform the fuzzy matching
    best_match = process.extractOne(row['clean_name'], master_list['clean_name'], scorer=fuzz.WRatio, score_cutoff=70)
    # Return match details along with event_id and entrant_name
    if best_match:
        matched_index = best_match[2]  # Get the index of the matched entry
        matched_data = master_list.iloc[matched_index]
        
        # Return match details along with event_id, entrant_name, and user_id from both sides
        return pd.Series([best_match[0], best_match[1], row['event_id'], row['entrant_name'], row['user_id'],
                          matched_data['entrant_name'], matched_data['user_id']],
                         index=['matched_name', 'score', 'event_id', 'entrant_name_input', 'user_id_input',
                                'entrant_name_matched', 'user_id_matched'])
    else:
        # Return NaNs or some form of indication for no match found
        return pd.Series([None, None, row['event_id'], row['entrant_name'], row['user_id'], None, None],
                         index=['matched_name', 'score', 'event_id', 'entrant_name_input', 'user_id_input',
                                'entrant_name_matched', 'user_id_matched'])

# Match the unknown entrants in slices of `batch_size` rows so each slice's
# results are collected as a separate frame.
batch_size = 10000
results = []
for start in range(0, new_df.shape[0], batch_size):
    # .iloc clamps the slice end at the frame length, so the final (short)
    # batch needs no special handling.
    df_batch = new_df.iloc[start:start + batch_size]
    batch_results = df_batch.apply(fuzzy_match, axis=1, master_list=master_list)
    results.append(batch_results)

# Concatenate all batch results into a single DataFrame
if results:
    final_results = pd.concat(results)
else:
    # pd.concat raises ValueError on an empty list; when there is nothing to
    # match, still produce an empty result with the columns fuzzy_match returns.
    final_results = pd.DataFrame(
        columns=['matched_name', 'score', 'event_id', 'entrant_name_input', 'user_id_input',
                 'entrant_name_matched', 'user_id_matched']
    )
final_results.to_csv('all_matches.csv')