In [1]:
import pandas as pd 
import numpy as np
import regex as re

In [2]:
# Load the cleaned outlet data; the first CSV column is used as the row index.
df = pd.read_csv(
    filepath_or_buffer="Outlets_Cleaned.csv",
    index_col=[0],
)

#### Handling NA Values

In [17]:
# Some rows have no join date. Rather than removing those rows, the gaps are
# filled with the median (not the mean) of the join dates that are present.
df.loc[df["Join"].isna(), "Join"] = df["Join"].median()

In [18]:
# Discard every row that still contains a missing value, then renumber the rows from 0.
df = df.dropna()
df = df.reset_index(drop=True)

In [3]:
# Remove rows in which the same outlet reports the same rumour again
# (same outlet, year, player and interested clubs); the first occurrence is kept.
df = df.drop_duplicates(
    subset=["Outlet", "Year", "Player", "Interested_clubs"],
    keep="first",
)

#### Converting the positions to Attack, Midfield, Defence

In [6]:
# Detailed positions grouped into the three broad roles used below.
# Note that goalkeepers are counted as defenders.
Attackers = [
    "Left Winger",
    "Centre-Forward",
    "Right Winger",
    "Second Striker",
]
Midfielders = [
    "Attacking Midfield",
    "Central Midfield",
    "Left Midfield",
    "Defensive Midfield",
    "Right Midfield",
]
Defenders = [
    "Left-Back",
    "Right-Back",
    "Centre-Back",
    "Goalkeeper",
]

In [7]:
# Overwrite each detailed position with its broad role. The groups are applied
# in the same order as before: attackers, midfielders, defenders.
for role_label, role_positions in (
    ("Attacker", Attackers),
    ("Midfielder", Midfielders),
    ("Defender", Defenders),
):
    df.loc[df["Position"].isin(role_positions), "Position"] = role_label

#### Adding Player Agents

In [8]:
from bs4 import BeautifulSoup
import requests

In [9]:
# HTTP headers passed to requests.get() by get_data().
# A 'Host' header ('https://www.zim.com/') was previously tried and is left disabled.
headers = dict(
    [
        ("Connection", "keep-alive"),
        (
            "User-Agent",
            "Chrome/102.0.5005.63 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.5 (KHTML, like Gecko) Safari/536.5",
        ),
        ("Accept", "*/*"),
        ("Accept-Encoding", "gzip, deflate"),
        ("Content-Type", "text"),
        ("Accept-Language", "en-US,en;q=0.8"),
    ]
)

def get_data(URL, timeout=30):
    """Download a page and return the agent text found on it.

    The page is fetched with the module-level ``headers`` and parsed with the
    ``lxml`` parser. The function then looks for a ``<span>`` and an ``<a>``
    element whose ``onclick`` attribute equals
    ``tmEvent("spielerprofil", "click", "berater-spielerdaten")``.

    Parameters
    ----------
    URL : str
        Address of the page to download.
    timeout : float or None, default 30
        Seconds to wait for the server before ``requests`` gives up. ``None``
        waits indefinitely, which was the behaviour before this parameter existed.

    Returns
    -------
    str
        Text of the matching element with newline characters removed. When both
        a ``<span>`` and an ``<a>`` match, the ``<a>`` text is returned. When
        neither is present the string ``'Unknown'`` is returned.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated unchanged if the download fails or times out.
    """
    response = requests.get(URL, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'lxml')

    selector = {'onclick': 'tmEvent("spielerprofil", "click", "berater-spielerdaten")'}
    agent = 'Unknown'
    # 'a' is checked last so that it overrides a 'span' match, as before.
    # A missing element is detected explicitly instead of through a bare
    # `except:`, which would also have hidden unrelated errors and Ctrl-C.
    for tag_name in ('span', 'a'):
        element = soup.find(tag_name, selector)
        if element is not None:
            agent = element.text.replace('\n', '')

    return agent

In [10]:
#In case a line causes problems
#df = df.drop(df.index[395])

In [11]:
# Look up the agent of every row with get_data() and store the results as the
# fourth column ("Agent").
# `agent` was not defined by any cell in this notebook, so on a fresh kernel the
# insert raised NameError; it is rebuilt here from the "Player Link" column.
# NOTE(review): assumes "Player Link" holds absolute URLs that get_data() can
# request directly - confirm. This performs one HTTP request per row.
agent = [get_data(player_url) for player_url in df["Player Link"]]
df.insert(3, "Agent", agent)

In [231]:
# Persist the table, including the new "Agent" column, for the later sections.
df.to_csv(
    path_or_buf='Agents_Data.csv',
    encoding='utf-8',
)

#### Data for Regression

In [3]:
# Reload the saved agent data; the first CSV column is used as the row index.
df_reg = pd.read_csv(
    filepath_or_buffer='Agents_Data.csv',
    index_col=[0],
)

In [14]:
# Keep only the first row for each (outlet, year, player) combination.
df_reg = df_reg.drop_duplicates(
    subset=["Outlet", "Year", "Player"],
    keep="first",
)

In [15]:
# Number of rows left after removing duplicates.
df_reg.shape[0]

1638

In [16]:
# Minimum number of rows a League, Country or Outlet value must have to keep its
# own label; values occurring fewer than n times are renamed to "Other" below.
n = 20

In [17]:
# Rename every category that occurs fewer times than its threshold to "Other".
# League, Country and Outlet use n; Agent uses a lower cut-off of 10.
for column, min_count in (("League", n), ("Country", n), ("Outlet", n), ("Agent", 10)):
    # Frequency of each row's own category. value_counts() leaves out missing
    # values, so indexing it directly with the column raises KeyError as soon as
    # one value is missing; map() yields NaN for those rows instead, and
    # NaN < min_count is False, so they are simply left unchanged.
    occurrences = df_reg[column].map(df_reg[column].value_counts())
    df_reg.loc[(occurrences < min_count).to_numpy(), column] = "Other"

In [18]:
#n = 14
#df_reg = df_reg.groupby('League').filter(lambda x : len(x)>n)
#df_reg = df_reg.groupby('Country').filter(lambda x : len(x)>n)
#df_reg = df_reg.groupby('Outlet').filter(lambda x : len(x)>n)
#df_reg = df_reg.groupby('Agent').filter(lambda x : len(x)>7)

In [19]:
# Set the numeric columns aside; they are joined back onto the dummy-encoded
# columns further down.
continuous = df_reg.loc[:, ["Market Value", "Join", "Age"]]

In [20]:
# Keep only the columns at positions 2, 3, 6, 8, 10 and 15; the result is passed
# to pd.get_dummies() in the next cell.
# NOTE(review): the selection is positional, so it depends on the column order of
# Agents_Data.csv - presumably these are the categorical columns; confirm, or
# select them by name. The commented line below is an earlier name-based variant.
df_reg = df_reg.iloc[:,[2,3,6,8,10,15]]
#df_reg = df_reg.drop(['Rumour', 'Year', 'Player', 'Club', 'Interested_clubs', "Market Value", "Join", "Age"], axis=1)

In [21]:
# Dummy-encode the remaining columns of df_reg.
features = pd.get_dummies(data=df_reg)

In [22]:
# Place the numeric columns and the dummy columns side by side (aligned on the row index).
df_reg = pd.concat(
    [continuous, features],
    axis="columns",
)

In [23]:
# Write the regression table to disk.
df_reg.to_csv(
    path_or_buf='Regression_Data.csv',
    encoding='utf-8',
)

#### Categorical Variables Thresholds

In [5]:
# Reload the saved agent data for the analysis; the first CSV column is the row index.
df_analysis = pd.read_csv(
    filepath_or_buffer='Agents_Data.csv',
    index_col=[0],
)

In [6]:
# Keep only the first row for each (outlet, year, player) combination.
df_analysis = df_analysis.drop_duplicates(
    subset=["Outlet", "Year", "Player"],
    keep="first",
)

In [7]:
# Remove the columns that are not used in the analysis.
df_analysis = df_analysis.drop(
    columns=[
        "Rumour",
        "Year",
        "Player",
        "Club",
        "Interested_clubs",
        "Clubs",
        "Player Link",
    ],
)

In [None]:
# Print the number of distinct values of each categorical column,
# followed by the total number of rows.
for column_name in ("League", "Country", "Outlet", "Agent"):
    print(len(df_analysis[column_name].value_counts()))
print(len(df_analysis))

In [None]:
# Minimum number of rows a League, Country or Outlet value must have to keep its
# own label; values occurring fewer than n times are renamed to "Other" below.
n = 20

In [None]:
# Rename every category that occurs fewer times than its threshold to "Other".
# League, Country and Outlet use n; Agent uses a lower cut-off of 10.
for column, min_count in (("League", n), ("Country", n), ("Outlet", n), ("Agent", 10)):
    # Frequency of each row's own category. value_counts() leaves out missing
    # values, so indexing it directly with the column raises KeyError as soon as
    # one value is missing; map() yields NaN for those rows instead, and
    # NaN < min_count is False, so they are simply left unchanged.
    occurrences = df_analysis[column].map(df_analysis[column].value_counts())
    df_analysis.loc[(occurrences < min_count).to_numpy(), column] = "Other"

In [None]:
#To remove the low-occurrence entries instead of renaming them, use:

#df_analysis = df_analysis.groupby('League').filter(lambda x : len(x)>n)
#df_analysis = df_analysis.groupby('Country').filter(lambda x : len(x)>n)
#df_analysis = df_analysis.groupby('Outlet').filter(lambda x : len(x)>n)
#df_analysis = df_analysis.groupby('Agent').filter(lambda x : len(x)>10)

In [None]:
# Show how many rows each outlet has.
df_analysis.loc[:, "Outlet"].value_counts()

In [None]:
# Shorten long outlet names: every outlet whose name contains the text on the
# left is relabelled with the short form on the right, in the order listed.
for name_fragment, short_label in (
    ("Independent Journalists", "Journalists"),
    ("Fabrizio Romano", "F. Romano"),
    ("Manchester Evening News", "Manchester N."),
    ("Evening Standard", "Evening Std"),
    ("Liverpool Echo", "Liverpool E."),
    ("Corriere dello Sport", "Corriere S."),
    ("Football Insider", "Insider"),
    ("Calcio mercato", "Calcio M."),
    ("Mundo Deportivo", "Mundo D."),
):
    df_analysis.loc[df_analysis["Outlet"].str.contains(name_fragment), "Outlet"] = short_label

In [12]:
# Write the analysis table to disk.
df_analysis.to_csv(
    path_or_buf='Analysis_Data.csv',
    encoding='utf-8',
)