In [1]:
from ast import literal_eval
from collections import Counter

import en_core_web_lg
import numpy as np
import pandas as pd
import regex as re
import spacy

#### Defining Functions to Find Entities

In [2]:
def find_persons(text):
    """Return the text of every entity labelled 'Player' found in `text`.

    The text is processed with the module-level `nlp` pipeline; the result
    is a list of entity strings in the order they appear in `doc.ents`.
    """
    parsed = nlp(text)

    player_mentions = []
    for entity in parsed.ents:
        if entity.label_ == 'Player':
            player_mentions.append(entity.text)
    return player_mentions

In [3]:
def find_clubs(text):
    """Return the text of every entity labelled 'CLUB' found in `text`.

    The text is processed with the module-level `nlp` pipeline; the result
    is a list of entity strings in the order they appear in `doc.ents`.
    """
    entities = nlp(text).ents
    club_entities = filter(lambda entity: entity.label_ == 'CLUB', entities)
    return [entity.text for entity in club_entities]

#### Clubs NER 

In [4]:
# Load the club-name dictionary CSV into a DataFrame.
# NOTE(review): machine-specific absolute path — consider a configurable data directory.
clubs = pd.read_csv("C:/Users/Titoran/Documents/Dictionaries/Football_Leagues.csv")

In [5]:
# Collapse the whole table into one flat array (kept as `arr`), then rebind
# `clubs` from the DataFrame to a plain list holding every cell of that array.
arr = clubs.to_numpy().flatten()
clubs = [cell for cell in arr]

In [6]:
# Clean the raw cells into usable club names:
#   * missing cells (NaN) are dropped by value instead of by comparing against the
#     string 'nan';
#   * whitespace is normalised rather than deleted — str.split() treats U+00A0
#     (non-breaking space) as whitespace, so "Real\xa0Madrid" becomes "Real Madrid"
#     instead of the unmatchable "RealMadrid", and leading/trailing padding is removed;
#   * cells that are empty after cleaning are skipped so no empty pattern is created.
clubs = [
    cleaned
    for cleaned in (" ".join(str(cell).split()) for cell in clubs if not pd.isna(cell))
    if cleaned
]

In [7]:
from unidecode import unidecode
# Pass every club name through unidecode so names are stored in their ASCII form
clubs = list(map(unidecode, clubs))

In [8]:
# Import EntityRuler class
from spacy.pipeline import EntityRuler
from spacy.language import Language
# Start from a blank English pipeline; `find_persons` / `find_clubs` use this `nlp` object.
nlp = spacy.blank("en")


# Add an entity-ruler component; club and player patterns are registered on `ruler` in later cells.
ruler = nlp.add_pipe("entity_ruler")

In [9]:
# Register every club name as a "CLUB" phrase pattern.
# All patterns go in through a single add_patterns() call: adding them one at a time
# repeats the ruler's per-call setup for every name, which is needlessly slow on long lists.
ruler.add_patterns([{"label": "CLUB", "pattern": word} for word in clubs])

#### Players NER

In [10]:
# Load the 2020 players table (its "Player" column is used below).
# NOTE(review): machine-specific absolute path.
players_2020 = pd.read_csv("C:/Users/Titoran/Documents/Cleaned Players Data/Players_2020.csv")

In [11]:
# Load the 2021 players table (its "Player" column is used below).
# NOTE(review): machine-specific absolute path.
players_2021 = pd.read_csv("C:/Users/Titoran/Documents/Cleaned Players Data/Players_2021.csv")

In [12]:
# Load the 2022 players table (its "Player" column is used below).
# NOTE(review): machine-specific absolute path.
players_2022 = pd.read_csv("C:/Users/Titoran/Documents/Cleaned Players Data/Players_2022.csv")

In [13]:
# Gather the "Player" column of all three seasons into one list (2020, then 2021, then 2022)
player_list = [
    name
    for season in (players_2020, players_2021, players_2022)
    for name in season["Player"]
]

In [14]:
def unique(list1):
    """Return the distinct items of `list1` as a list, in first-seen order.

    Deduplicating through a dict (rather than a set) keeps the order of first
    occurrence, so the result is the same on every run instead of depending on
    hash ordering. Items must be hashable.
    """
    return list(dict.fromkeys(list1))

In [15]:
# Drop repeated names — the three season lists were concatenated above, so duplicates are possible
player_list = unique(player_list)

In [16]:
# Pass every player name through unidecode so names are stored in their ASCII form
player_list = list(map(unidecode, player_list))

In [17]:
# Register every player name as a "Player" phrase pattern.
# All patterns go in through a single add_patterns() call: adding them one at a time
# repeats the ruler's per-call setup for every name, which is needlessly slow on long lists.
ruler.add_patterns([{"label": "Player", "pattern": word} for word in player_list])

#### Storing the clubs and players

In [18]:
# Load the rumours table, using the file's first column as the index.
# NOTE(review): machine-specific absolute path.
df = pd.read_csv("C:/Users/Titoran/Documents/1.Rumours/Rumours.csv", index_col = [0] )

In [19]:
# Pass every rumour text through unidecode, the same normalisation applied to the club and player names
df["Rumour"] = list(map(unidecode, df["Rumour"]))

In [20]:
# One list of detected club names per rumour, in row order.
# The text column is addressed by name ("Rumour") instead of by position, so the
# extraction keeps reading the right column if the column order ever changes.
clubs_involved = [find_clubs(rumour) for rumour in df["Rumour"]]

In [21]:
# One list of detected player names per rumour, in row order.
# The text column is addressed by name ("Rumour") instead of by position, so the
# extraction keeps reading the right column if the column order ever changes.
players_involved = [find_persons(rumour) for rumour in df["Rumour"]]

#### Consolidating the Data

In [22]:
# Attach the per-rumour entity lists as new columns; each cell holds a Python list of names
df["Clubs"] = clubs_involved
df["Player"] = players_involved

In [28]:
# Show the rumour text of every row in which no player was detected
df.loc[df["Player"].map(len).eq(0), "Rumour"]

16      Arsenal are prepared to spend nearly PS250m on...
36      Liverpool have completed the signing of Ghanai...
41      Swansea manager Steve Cooper is top of Fulham'...
67      Crystal Palace have held positive talks with L...
75      Liverpool have no plans to sell Kostas Tsimika...
                              ...                        
3295    However, Leeds want at least PS35m for Harriso...
3297    Nottingham Forest are also considering a late ...
3298    Women's Super League champions Chelsea are try...
3317    Bournemouth are close to agreeing a fee with S...
3318    The Cherries are also in talks with Dynamo Kyi...
Name: Rumour, Length: 376, dtype: object

In [30]:
# NOTE(review): `testing` is not assigned in any cell visible in this notebook, and this
# lookup raised the KeyError shown in the output below — looks like leftover scratch work;
# confirm and remove so the notebook survives Restart & Run All.
testing[0,1]

KeyError: 'key of type tuple not found and not a MultiIndex'

In [23]:
# Keep only the rumours in which exactly one player was detected
single_player_mask = df["Player"].map(len).eq(1)
df = df.loc[single_player_mask]

In [24]:
# Keep only the rumours that mention more than one club
multiple_clubs_mask = df["Clubs"].map(len).gt(1)
df = df.loc[multiple_clubs_mask]

In [25]:
# Persist the filtered rumours (with their "Clubs" / "Player" list columns) to the working directory
df.to_csv('Rumours_NER.csv', encoding='utf-8')

#### Adding The Player Info

In [26]:
# Reload the file written above, using its first column as the index.
# The "Clubs" / "Player" cells are handled as text from here on (see the string slicing and literal_eval below).
df_NER = pd.read_csv("Rumours_NER.csv", index_col = [0] )

In [27]:
#Unlisting the players to then join them
#df_NER["Player"] = [str(''.join(map(str, l))) for l in df_NER['Player']]

In [28]:
def _single_player_name(cell):
    """Return the player name held in a "Player" cell.

    A cell that still holds the text of a list (e.g. "['Sergio Ramos']") is parsed
    and its first element returned ("" for an empty list). Anything else — including
    a cell that was already unwrapped — is returned unchanged, so re-running the
    cell no longer strips further characters from each name.
    """
    if isinstance(cell, str) and cell.startswith("[") and cell.endswith("]"):
        names = literal_eval(cell)
        return names[0] if names else ""
    return cell


# Unwrap the one-element player list of each row into the bare name so it can be
# used as a merge key. Rows were filtered to exactly one player before saving.
df_NER["Player"] = df_NER["Player"].apply(_single_player_name)

In [29]:
# Rumours whose "Year" is 2020
df2020 = df_NER[df_NER["Year"].eq(2020)]

In [30]:
# Rumours whose "Year" is 2021
df2021 = df_NER[df_NER["Year"].eq(2021)]

In [31]:
# Rumours whose "Year" is 2022
df2022 = df_NER[df_NER["Year"].eq(2022)]

In [32]:
# Attach the 2020 player details to each 2020 rumour; the left join keeps rumours with no matching player row
df0 = pd.merge(df2020, players_2020, on='Player', how='left')

In [33]:
# Discard the `Year_y` column and restore the plain `Year` name on `Year_x`
df0 = df0.drop(columns=['Year_y']).rename(columns={'Year_x': 'Year'})

In [34]:
# Attach the 2021 player details to each 2021 rumour; the left join keeps rumours with no matching player row
df1 = pd.merge(df2021, players_2021, on='Player', how='left')

In [35]:
# Discard the `Year_y` column and restore the plain `Year` name on `Year_x`
df1 = df1.drop(columns=['Year_y']).rename(columns={'Year_x': 'Year'})

In [36]:
# Attach the 2022 player details to each 2022 rumour; the left join keeps rumours with no matching player row
df2 = pd.merge(df2022, players_2022, on='Player', how='left')

In [37]:
# Discard the `Year_y` column and restore the plain `Year` name on `Year_x`
df2 = df2.drop(columns=['Year_y']).rename(columns={'Year_x': 'Year'})

In [38]:
# Stack the three per-year frames row-wise into a single table (2020, 2021, 2022 order)
yearly_frames = [df0, df1, df2]
df_players = pd.concat(yearly_frames, axis=0)

#### Finding the interested club

In [40]:
from difflib import SequenceMatcher

def similar(a, b):
    """Return the difflib similarity ratio of sequences `a` and `b` (a float in [0, 1])."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

In [43]:
# Each "Clubs" cell is text here, so the "PSG" abbreviation is expanded with a plain string replace
df_players["Clubs"] = [
    clubs_text.replace("PSG", "Paris Saint-Germain")
    for clubs_text in df_players["Clubs"]
]

In [51]:
from ast import literal_eval

In [52]:
# For each rumour, keep the mentioned clubs that are NOT the player's current club.
# A mention is treated as the current club when it is a near match (similarity >= 0.7)
# or a substring of the "Club" value (e.g. 'Eindhoven' inside 'PSV Eindhoven').
# Columns are addressed by name rather than by position, and rows are paired with zip
# instead of a hand-maintained counter.
interested_club = []
for clubs_text, current_club in zip(df_players["Clubs"], df_players["Club"]):
    # str() keeps the comparison working when the club value is not a string
    # (presumably NaN for rumours the left merge could not match — verify).
    current_club = str(current_club)
    interested_club.append([
        club
        for club in literal_eval(clubs_text)  # "Clubs" cells hold the text of a list
        if similar(club, current_club) < 0.7 and club not in current_club
    ])

In [54]:
# Store the per-rumour lists of interested clubs; the list is assigned positionally, one entry per row
df_players["Interested_clubs"] = interested_club

In [55]:
# Spot-check ten rows; per the output below, positions 3, 4, 12, 13 are Clubs, Player, Club, Interested_clubs
df_players.iloc[200:210,[3,4,12,13]]

Unnamed: 0,Clubs,Player,Club,Interested_clubs
200,"['Watford', 'Torino']",Lyanco,Torino,[Watford]
201,"['Manchester United', 'Flamengo']",Andreas Pereira,SS Lazio,"[Manchester United, Flamengo]"
202,"['Tottenham', 'Lyon', 'Juventus']",Houssem Aouar,Lyon,"[Tottenham, Juventus]"
203,"['Arsenal', 'Juventus', 'Sassuolo']",Manuel Locatelli,US Sassuolo,"[Arsenal, Juventus]"
204,"['Tottenham', 'Eindhoven']",Noni Madueke,PSV Eindhoven,[Tottenham]
205,"['Arsenal', 'Chelsea', 'Roma']",Granit Xhaka,Arsenal,"[Chelsea, Roma]"
206,"['Tottenham', 'Juventus']",Weston McKennie,Juventus,[Tottenham]
207,"['Manchester United', 'Wolves']",Ruben Neves,Wolves,[Manchester United]
208,"['Burnley', 'Lyon']",Maxwel Cornet,Lyon,[Burnley]
209,"['Roma', 'Barcelona']",Clement Lenglet,Barcelona,[Roma]


#### Checking for the Transfer

In [57]:
# Load the transfers table (its "player_name" / "club_name" columns are used below), first column as index.
# NOTE(review): machine-specific absolute path.
transfers = pd.read_csv("C:/Users/Titoran/Documents/Transfers Data/transfers.csv", index_col = [0])

In [58]:
# Label each rumour "True" when at least one of the player's clubs in the transfers
# table matches one of the interested clubs, and "False" otherwise.
# A match is a similarity above 0.7 or the transfer club name being a substring of
# the interested club. Labels stay as the strings "True"/"False", as before.
#
# Columns are addressed by name and rows are paired with zip, so the positional
# lookup (and the `except IndexError` that only guarded it) is no longer needed.
Outcome = []
for player, interested_clubs in zip(df_players["Player"], df_players["Interested_clubs"]):
    # Look the player's transfer clubs up once per rumour, not once per interested club
    transfer_clubs = transfers.loc[transfers["player_name"] == player, "club_name"].tolist()
    matched = any(
        similar(club, str(interested)) > 0.7 or club in str(interested)
        for interested in interested_clubs
        for club in transfer_clubs
    )
    Outcome.append("True" if matched else "False")

In [59]:
# Store the per-rumour labels; values are the strings "True" / "False", assigned positionally
df_players["Outcome"] = Outcome

In [61]:
#df_players.loc[df_players["Outcome"] == "True",]

Unnamed: 0,Rumour,Year,Outlet,Clubs,Player,League,Player Link,Position,Age,Country,Join,Market Value,Club,Interested_clubs,Outcome
2,Sergio Ramos is a target for Manchester United...,2020,90min,"['Manchester United', 'Paris Saint-Germain', '...",Sergio Ramos,LaLiga,https://www.transfermarkt.co.uk/sergio-ramos/p...,Centre-Back,35.0,Spain,6148.0,10.0,Real Madrid,"[Manchester United, Paris Saint-Germain, Sevilla]",True
3,Real Madrid and France defender Raphael Varane...,2020,Goal,"['Real Madrid', 'Manchester United', 'Chelsea']",Raphael Varane,LaLiga,https://www.transfermarkt.co.uk/raphael-varane...,Centre-Back,28.0,France,4018.0,70.0,Real Madrid,"[Manchester United, Chelsea]",True
8,"Arsenal's French midfielder Matteo Guendouzi, ...",2020,Fabrizio Romano via Sun,"['Arsenal', 'Hertha', 'Marseille']",Matteo Guendouzi,Bundesliga,https://www.transfermarkt.co.uk/matteo-guendou...,Central Midfield,22.0,France,634.0,20.0,Hertha,"[Arsenal, Marseille]",True
14,Manchester United have made a new bid of more ...,2020,Mail on Sunday,"['Manchester United', 'Dortmund']",Jadon Sancho,Bundesliga,https://www.transfermarkt.co.uk/jadon-sancho/p...,Left Winger,21.0,England,1765.0,100.0,Borussia Dortmund,[Manchester United],True
18,"Portugal and Wolves goalkeeper Rui Patricio, 3...",2020,Sky Sports - in Italian,"['Wolves', 'Roma']",Rui Patricio,Premier League,https://www.transfermarkt.co.uk/rui-patricio/p...,Goalkeeper,33.0,Portugal,1461.0,10.0,Wolves,[Roma],True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
880,Crystal Palace are interested in bringing Chel...,2022,Sky Sports,"['Crystal Palace', 'Chelsea']",Conor Gallagher,Premier League,https://www.transfermarkt.co.uk/conor-gallaghe...,Central Midfield,22.0,England,336.0,25.0,Crystal Palace,[Chelsea],True
891,Newcastle and Tottenham could move for 29-year...,2022,Calciomercato - in Italian,"['Tottenham', 'Inter Milan', 'Chelsea']",Romelu Lukaku,Premier League,https://www.transfermarkt.co.uk/romelu-lukaku/...,Centre-Forward,29.0,Belgium,323.0,70.0,Chelsea,"[Tottenham, Inter Milan]",True
904,Sporting Lisbon will replace Porro with fellow...,2022,Fabrizio Romano,"['Sporting', 'Barcelona']",Hector Bellerin,Premier League,https://www.transfermarkt.co.uk/hector-belleri...,Right-Back,27.0,Spain,2922.0,20.0,Arsenal,"[Sporting, Barcelona]",True
910,Southampton have agreed to meet the PS26.2m re...,2022,Football Insider,"['Southampton', 'Braga']",Vitinha,Liga Portugal,https://www.transfermarkt.co.uk/vitinha/profil...,Central Midfield,22.0,Portugal,904.0,30.0,Porto,"[Southampton, Braga]",True


In [63]:
# Write the combined table (rumour, player details, interested clubs, outcome) to the working directory
df_players.to_csv('Uncleaned_Data.csv', encoding='utf-8')