In [1]:
import spacy 
import pandas as pd 
from collections import Counter
import en_core_web_lg
import numpy as np
import regex as re

#### Defining Function to Find Entities

In [2]:
def find_persons(text):
    """Return the text of every entity labelled 'Player' found in ``text``.

    Parameters
    ----------
    text : str
        Raw text to scan. ``None`` and float NaN (what pandas yields for an
        empty CSV cell) are treated as "no text".

    Returns
    -------
    list of str
        The matched entity strings, in the order they appear in ``doc.ents``.
        Empty when nothing matched or when there was no text to scan.
    """
    # Missing values are not text; report "no entities" instead of handing
    # them to the pipeline (NaN is the only float that differs from itself).
    if text is None or (isinstance(text, float) and text != text):
        return []

    # `nlp` is the module-level pipeline; it has to exist by the time this
    # function is called, not when it is defined.
    doc = nlp(text)

    # NOTE(review): the player patterns are transliterated with unidecode but
    # `text` is matched as-is -- confirm the input text is already ASCII.
    return [ent.text for ent in doc.ents if ent.label_ == 'Player']

In [3]:
def find_clubs(text):
    """Return the text of every entity labelled 'CLUB' found in ``text``.

    Parameters
    ----------
    text : str
        Raw text to scan. ``None`` and float NaN (what pandas yields for an
        empty CSV cell) are treated as "no text".

    Returns
    -------
    list of str
        The matched entity strings, in the order they appear in ``doc.ents``.
        Empty when nothing matched or when there was no text to scan.
    """
    # Missing values are not text; report "no entities" instead of handing
    # them to the pipeline (NaN is the only float that differs from itself).
    if text is None or (isinstance(text, float) and text != text):
        return []

    # `nlp` is the module-level pipeline; it has to exist by the time this
    # function is called, not when it is defined.
    doc = nlp(text)

    # NOTE(review): the club patterns are transliterated with unidecode but
    # `text` is matched as-is -- confirm the input text is already ASCII.
    return [ent.text for ent in doc.ents if ent.label_ == 'CLUB']

#### Clubs NER 

In [4]:
# Raw club dictionary: every cell of this sheet is treated as a club name below.
clubs = pd.read_csv(
    filepath_or_buffer="C:/Users/Titoran/Documents/Dictionaries/Football_Leagues.csv"
)

In [5]:
# Pre-processing the clubs dictionary (entries are partly maintained by hand in Excel).

# Flatten every cell into one sequence. np.asarray accepts both the DataFrame
# loaded above and the plain list this cell produces, so the cell can be re-run.
club_cells = np.asarray(clubs, dtype=object).ravel()

# Drop missing cells (NaN / None) before any string conversion, so a missing
# value is never turned into a literal "nan" or "None" pattern.
club_cells = [cell for cell in club_cells if not pd.isna(cell)]

# Replace non-breaking spaces with ordinary spaces and trim surrounding whitespace.
club_names = [str(cell).replace('\xa0', ' ').strip() for cell in club_cells]

# Discard entries that are empty after trimming: a blank string is not a club name.
clubs = [name for name in club_names if name]

In [6]:
# Transliterate every club name with unidecode so only plain ASCII letters remain
from unidecode import unidecode
clubs = list(map(unidecode, clubs))

In [7]:
# Import EntityRuler class
from spacy.pipeline import EntityRuler
from spacy.language import Language
# Start from a blank English pipeline; this is the module-level `nlp` that
# find_persons / find_clubs call.
nlp = spacy.blank("en")

# Add an entity ruler component and keep a handle on it so the club and player
# patterns can be registered on it in the cells below.
ruler = nlp.add_pipe("entity_ruler")

In [8]:
# Register every club name from the imported club data set as a pattern
# labelled "CLUB".
# The patterns are built first and handed to the ruler in a single call: the
# ruler ends up with the same patterns as when they are added one by one, but
# the per-call overhead of add_patterns is paid once instead of once per name.
club_patterns = [{"label": "CLUB", "pattern": word} for word in clubs]
if club_patterns:
    ruler.add_patterns(club_patterns)

#### Players NER

In [9]:
# Cleaned player tables, one per year; the first CSV column is used as the index.
players_2020 = pd.read_csv(
    filepath_or_buffer="C:/Users/Titoran/Documents/Cleaned Players Data/Players_2020.csv",
    index_col=[0],
)
players_2021 = pd.read_csv(
    filepath_or_buffer="C:/Users/Titoran/Documents/Cleaned Players Data/Players_2021.csv",
    index_col=[0],
)
players_2022 = pd.read_csv(
    filepath_or_buffer="C:/Users/Titoran/Documents/Cleaned Players Data/Players_2022.csv",
    index_col=[0],
)

# All "Player" values of the three tables, in table order (duplicates are kept here).
player_list = [
    name
    for yearly_table in (players_2020, players_2021, players_2022)
    for name in yearly_table["Player"]
]

In [10]:
# Keep only the multi-word player names, i.e. those containing at least one space
temp = [name for name in player_list if np.char.count(name, " ") > 0]

In [11]:
# Single-word player names (no space), taken from the 2021 table only.
# NOTE(review): temp2 is not referenced by any later cell visible here.
temp2 = [name for name in players_2021["Player"] if np.char.count(name, " ") == 0]

In [12]:
# Single-word player names added by hand on top of the multi-word names
manual_entries = [
    "Thiago", "Alisson", "Neymar", "Denilson",
    "Vitinho", "Martinelli", "Hulk", "Koke",
    "Raphinha", "Casemiro", "Marcelo", "Jorginho",
    "Ederson", "Willian", "Antony", "Marquinhos",
]

In [13]:
# Replace the raw list: multi-word names first, then the hand-picked single-word names.
player_list = [*temp, *manual_entries]

In [14]:
#Since we add all the players from different years, we have many duplicate names to delete. Unique returns only one instance of each value in a list
def unique(list1):
    """Return the distinct values of ``list1`` in first-seen order.

    Parameters
    ----------
    list1 : iterable of hashable
        Values that may contain duplicates.

    Returns
    -------
    list
        One instance of each value, ordered by first occurrence.
    """
    # dict keys are unique and keep insertion order, so the result is the same
    # on every run; a set round-trip gives an arbitrary order that can differ
    # between interpreter sessions for strings.
    return list(dict.fromkeys(list1))

In [15]:
# Collapse names that occur more than once in the combined list to a single entry.
player_list = unique(player_list)

In [16]:
# Transliterate every player name with unidecode so only plain ASCII letters remain.
# Names that differed only by accents become identical after this step, so the
# list is de-duplicated again here; dict.fromkeys keeps the first-seen order.
player_list = list(dict.fromkeys(unidecode(w) for w in player_list))

In [17]:
# Register every player name from the imported player data sets as a pattern
# labelled "Player" (the label find_persons filters on).
# The patterns are built first and handed to the ruler in a single call: the
# ruler ends up with the same patterns as when they are added one by one, but
# the per-call overhead of add_patterns is paid once instead of once per name.
player_patterns = [{"label": "Player", "pattern": word} for word in player_list]
if player_patterns:
    ruler.add_patterns(player_patterns)

#### Storing the clubs and players

In [18]:
# Rumour texts to annotate; the first CSV column is used as the index.
df = pd.read_csv(
    filepath_or_buffer="C:/Users/Titoran/Documents/1.Rumours/Rumours.csv",
    index_col=[0],
)

In [19]:
# One list of detected club names per row, read from the first column of df
clubs_involved = [find_clubs(rumour_text) for rumour_text in df.iloc[:, 0]]

In [20]:
# One list of detected player names per row, read from the first column of df
players_involved = [find_persons(rumour_text) for rumour_text in df.iloc[:, 0]]

#### Consolidating the Data + Filter

In [21]:
# Attach the per-row detection results to the rumours table, clubs first
for column_name, detected in (("Clubs", clubs_involved), ("Player", players_involved)):
    df[column_name] = detected

In [22]:
# Save the unfiltered annotations (every row, including those without matches)
df.to_csv(path_or_buf='Accuracy_Testing.csv', encoding='utf-8')

In [23]:
# Debugging aid (left disabled): lists the rumours in which no player was detected.
#list(df.loc[df["Player"].apply(len) == 0, "Rumour"])

In [24]:
# Keep the rows in which exactly one player was detected ...
exactly_one_player = df["Player"].apply(len).eq(1)
df = df.loc[exactly_one_player]

# ... and, of those, the rows in which more than one club was detected
several_clubs = df["Clubs"].apply(len).gt(1)
df = df.loc[several_clubs]

In [25]:
# Save the filtered rumours (one player, more than one club)
df.to_csv(path_or_buf='Rumours_NER.csv', encoding='utf-8')