Step 1: Data Preprocessing
Load the Data:

Load the news data and the three CSV files into your preferred data analysis tool (e.g., Python using pandas).
Clean the Data:

Handle missing values, remove duplicates, and correct any inconsistencies.
For the news data, focus on the content column for text analysis.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
%matplotlib inline
import seaborn as sns

import itertools
import re
import string
import pickle
import os

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.model_selection import KFold

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords, wordnet
from wordcloud import WordCloud
from copy import deepcopy

from IPython.display import (
    Markdown as md,
    Latex,
    HTML,
)

from tqdm.auto import tqdm
import json

# Set plot style: apply seaborn's default theme to subsequent figures.
sns.set()

# Colab-specific module; this notebook is written to run on Google Colab.
from google.colab import drive

# Mount Google Drive at /content/drive -- every CSV path used below lives under this mount point.
drive.mount('/content/drive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Mounted at /content/drive


In [None]:
# Read the three raw player tables from the mounted Drive folder in one pass;
# the order of the paths matches the order of the names on the left-hand side.
player_detail, transfer_history, player_market_value_development = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/nlp data/Copy of player_detail.csv',
        '/content/drive/MyDrive/nlp data/Copy of tm_player_transfer_history.csv',
        '/content/drive/MyDrive/nlp data/Copy of tm_player_market_value_development.csv',
    )
)

# The news feeds are not loaded in this section (kept for reference):
# miao_feed_all_docs = pd.read_csv('/content/drive/MyDrive/nlp data/Data/miao_feed_all_docs_dropped.csv')
# news_feed_all_docs = pd.read_csv('/content/drive/MyDrive/nlp data/Data/news_feed_all_docs_dropped.csv')

## DEAL WITH PLAYER INFORMATION

### Player Detail

In [None]:
# Display the first two rows of the player table as a quick look at its columns.
player_detail.head(2)

Unnamed: 0,Name,player_id,DOB,Age,Foot,Height (m),Nationality,Citizenship,Position,Other positions,Team,Joined,Contract expires
0,Thibaut Courtois,108390,1992-05-11,31.0,left,2.0,Belgium,['Belgium'],Goalkeeper,,Real Madrid,2018-08-09,2026-06-30
1,Andriy Lunin,404839,1999-02-11,25.0,right,1.91,Ukraine,['Ukraine'],Goalkeeper,,Real Madrid,2018-07-01,2025-06-30


In [None]:
# Summarise column dtypes and non-null counts.
# NOTE(review): the saved output below lists 'Height_m' and datetime64 columns, i.e. it was
# produced after the cleaning/renaming cells further down had run -- re-run top to bottom
# to get an output that matches this cell's position.
player_detail.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20319 entries, 0 to 20414
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Name              20319 non-null  object        
 1   player_id         20319 non-null  int64         
 2   DOB               20319 non-null  datetime64[ns]
 3   Age               20319 non-null  float64       
 4   Foot              20319 non-null  object        
 5   Height_m          20319 non-null  float64       
 6   Nationality       20319 non-null  object        
 7   Citizenship       20319 non-null  object        
 8   Position          20319 non-null  object        
 9   Other positions   20319 non-null  object        
 10  Team              20319 non-null  object        
 11  Joined            20319 non-null  datetime64[ns]
 12  Contract expires  20319 non-null  datetime64[ns]
dtypes: datetime64[ns](3), float64(2), int64(1), object(7)
memory usage: 2.2+ MB


In [None]:
# Count missing values per column; these are the gaps the next cleaning cell fills.
player_detail.isnull().sum()

Name                   0
player_id              0
DOB                    3
Age                    3
Foot                1885
Height (m)           853
Nationality            0
Citizenship            0
Position               0
Other positions     6561
Team                   0
Joined               134
Contract expires    2712
dtype: int64

In [None]:
# Cleaning player_detail.
# Every column is reassigned instead of being filled through
# `player_detail[col].fillna(..., inplace=True)`: that chained in-place form
# operates on a temporary Series, emits a FutureWarning on pandas 2.x and
# leaves the frame untouched once Copy-on-Write is enabled.

# Numeric gaps are filled with the column mean.
for numeric_col in ('Age', 'Height (m)'):
    player_detail[numeric_col] = player_detail[numeric_col].fillna(player_detail[numeric_col].mean())

# Categorical gaps get explicit placeholder labels.
player_detail['Foot'] = player_detail['Foot'].fillna('Unknown')
player_detail['Other positions'] = player_detail['Other positions'].fillna('None')

# Missing dates receive the sentinel 1900-01-01, then DOB, Joined and
# Contract expires are converted to datetime.
for date_col in ('DOB', 'Joined', 'Contract expires'):
    player_detail[date_col] = pd.to_datetime(player_detail[date_col].fillna('1900-01-01'))

In [None]:
# Filter rows where both DOB and Age are null
# null_dob_age = player_detail[player_detail['DOB'].isnull() & player_detail['Age'].isnull()]

# null_dob_age.head(6)

Unnamed: 0,Name,player_id,DOB,Age,Foot,Height (m),Nationality,Citizenship,Position,Other positions,Team,Joined,Contract expires


### PLAYER TRANSFER HISTORY


In [None]:
# Cleaning transfer_history.
# Columns are reassigned rather than filled with the chained
# `transfer_history[col].fillna(..., inplace=True)` form, which acts on a
# temporary Series, warns on pandas 2.x and is a no-op under Copy-on-Write.
#
# The coordinate columns (from_/to_ latitude/longitude) are left untouched
# here; fill_zero_coordinates below deals with missing coordinates.

# Text columns: mark gaps explicitly.
transfer_history['season'] = transfer_history['season'].fillna('Unknown')
transfer_history['fee'] = transfer_history['fee'].fillna('Unknown')

# Numeric fee: fill gaps with the column mean.
transfer_history['Transfer Fee'] = transfer_history['Transfer Fee'].fillna(transfer_history['Transfer Fee'].mean())

# Missing dates receive the sentinel 1900-01-01, then 'date' is converted to
# datetime. 'dateUnformatted' is NOT converted and stays as loaded.
transfer_history['date'] = pd.to_datetime(transfer_history['date'].fillna('1900-01-01'))

In [None]:
def fill_zero_coordinates(df, tolerance=1e-6):
    """Replace placeholder (near-zero) club coordinates with per-club means.

    A coordinate whose absolute value is below ``tolerance`` is treated as
    missing. Club names are normalised by stripping feeder-team markers
    (e.g. ``"Chelsea U19"`` -> ``"Chelsea"``); the 'from' and 'to' sides are
    then pooled and every missing latitude/longitude is filled with the mean
    of the known values for the same normalised club. Clubs with no known
    coordinate at all fall back to 0, so some zeros can remain afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``from_clubName``, ``to_clubName``, ``from_latitude``,
        ``from_longitude``, ``to_latitude`` and ``to_longitude``.
    tolerance : float, default 1e-6
        Absolute threshold below which a coordinate counts as zero.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. It is modified in place: the four
        coordinate columns are overwritten and the helper columns
        ``main_from_clubName`` / ``main_to_clubName`` are added.

    Notes
    -----
    Prints the zero counts before and after filling, plus the pooled frame.
    """
    coordinate_columns = ['from_latitude', 'from_longitude', 'to_latitude', 'to_longitude']

    # Feeder-team markers to strip; compiled once instead of once per row.
    common_suffixes = ['U19', 'U17', 'Youth', 'B', 'C', 'II', 'Reserves']
    suffix_patterns = [re.compile(rf'\b{suffix}\b', flags=re.IGNORECASE) for suffix in common_suffixes]

    def count_zeros(frame):
        # One near-zero count per coordinate column, in coordinate_columns order.
        return tuple((frame[column].abs() < tolerance).sum() for column in coordinate_columns)

    def report_zeros(header, counts):
        print(header)
        print(f"from_latitude: {counts[0]}, from_longitude: {counts[1]}, to_latitude: {counts[2]}, to_longitude: {counts[3]}")

    def get_main_club_name(club_name):
        # Missing / non-text club names cannot be normalised; pass them through
        # unchanged instead of letting the regex raise a TypeError.
        if not isinstance(club_name, str):
            return club_name
        for pattern in suffix_patterns:
            club_name = pattern.sub('', club_name).strip()
        return club_name

    def stack_side(side):
        # Uniform (clubName, latitude, longitude) view of one transfer side.
        return df[[f'main_{side}_clubName', f'{side}_latitude', f'{side}_longitude']].rename(
            columns={
                f'main_{side}_clubName': 'clubName',
                f'{side}_latitude': 'latitude',
                f'{side}_longitude': 'longitude',
            }
        )

    report_zeros("Number of zeros before filling:", count_zeros(df))

    # Turn near-zero coordinates into NaN so they are excluded from the means.
    for column in coordinate_columns:
        df.loc[df[column].abs() < tolerance, column] = np.nan

    df['main_from_clubName'] = df['from_clubName'].apply(get_main_club_name)
    df['main_to_clubName'] = df['to_clubName'].apply(get_main_club_name)

    # Pool both sides: 'from' rows first, then 'to' rows (relied upon when splitting below).
    combined = pd.concat([stack_side('from'), stack_side('to')], ignore_index=True)

    # Mean known coordinate per club. Rows without a club name match no group
    # and therefore keep whatever coordinate they already have.
    club_means = combined.groupby('clubName')[['latitude', 'longitude']].mean()
    for axis_name in ('latitude', 'longitude'):
        per_row_mean = combined['clubName'].map(club_means[axis_name])
        # Reassign instead of chained in-place fillna (a no-op under Copy-on-Write).
        # Clubs with no known coordinate at all fall back to 0.
        combined[axis_name] = combined[axis_name].fillna(per_row_mean).fillna(0)

    # Debug prints to check the filled results
    print("Combined DataFrame with filled NaNs:")
    print(combined)

    # Split the pooled frame back into its 'from' and 'to' halves.
    row_count = len(df)
    filled_from = combined.iloc[:row_count]
    filled_to = combined.iloc[row_count:]

    # .values bypasses index alignment: the halves are positionally aligned with df.
    df['from_latitude'] = filled_from['latitude'].values
    df['from_longitude'] = filled_from['longitude'].values
    df['to_latitude'] = filled_to['latitude'].values
    df['to_longitude'] = filled_to['longitude'].values

    report_zeros("Number of zeros after filling:", count_zeros(df))

    return df

In [None]:
# fill_zero_coordinates modifies the frame it receives and returns that same object,
# so transfer_history_filled and transfer_history refer to one DataFrame from here on.
transfer_history_filled = fill_zero_coordinates(transfer_history)

Number of zeros before filling:
from_latitude: 20340, from_longitude: 20340, to_latitude: 10557, to_longitude: 10557
Combined DataFrame with filled NaNs:
               clubName   latitude  longitude
0               Chelsea  51.477520  -0.201590
1       Atlético Madrid  40.401730  -3.720635
2               Chelsea  51.477520  -0.201590
3              KRC Genk  51.005026   5.533332
4              KRC Genk  51.005026   5.533332
...                 ...        ...        ...
302867   ACSM Poli Iasi  47.186059  27.562736
302868   ACSM Poli Iasi  47.186059  27.562736
302869     FV Eppelborn  49.403136   6.970649
302870   ACSM Poli Iasi  47.186059  27.562736
302871     Rapid Brodoc  46.639394  27.730951

[302872 rows x 3 columns]
Number of zeros after filling:
from_latitude: 15121, from_longitude: 15121, to_latitude: 7585, to_longitude: 7585


In [None]:
# Any transfer date that is still missing falls back to the raw 'dateUnformatted' value.
# NOTE(review): the earlier transfer_history cleaning cell already fills 'date', 'fee' and
# 'Transfer Fee', so this cell looks like a no-op when the notebook is run top to bottom.
date_fallback = transfer_history_filled['dateUnformatted']
transfer_history_filled['date'] = transfer_history_filled['date'].fillna(date_fallback)

# Remaining gaps in either fee column default to zero.
for fee_column in ('Transfer Fee', 'fee'):
    transfer_history_filled[fee_column] = transfer_history_filled[fee_column].fillna(0)

In [None]:
# Parse the three date-like market-value columns into datetime values.
for date_column in ('datum_mw', 'highest_date', 'last_change'):
    player_market_value_development[date_column] = pd.to_datetime(
        player_market_value_development[date_column]
    )

In [None]:
# Drop fully duplicated rows from each table. The in-place form is kept on purpose so that
# every existing reference to these frames sees the de-duplicated data.
for frame in (player_detail, transfer_history, player_market_value_development):
    frame.drop_duplicates(inplace=True)

In [None]:
# Give the columns whose names contain spaces or parentheses identifier-friendly names.
for frame, column_mapping in (
    (player_detail, {'Height (m)': 'Height_m'}),
    (transfer_history, {'Transfer Fee': 'Transfer_Fee'}),
    (player_market_value_development, {'Highest market value': 'Highest_market_value'}),
):
    frame.rename(columns=column_mapping, inplace=True)

In [None]:
# Persist the cleaned tables next to the other processed data on Drive.
# The DataFrame index is written as the first, unnamed CSV column (pandas default).
player_detail.to_csv('/content/drive/MyDrive/nlp data/Data/player_detail_cleaned.csv')
transfer_history.to_csv('/content/drive/MyDrive/nlp data/Data/transfer_history_cleaned.csv')
# Fixed: this path was missing the '.csv' extension used by the two files above.
# Any reader of the old extension-less file must be pointed at the new name.
player_market_value_development.to_csv('/content/drive/MyDrive/nlp data/Data/player_market_value_development_cleaned.csv')

In [None]:
# Summarise column dtypes and non-null counts of the market-value table.
# NOTE(review): the saved output below still shows object dtypes for the date columns and the
# un-renamed 'Highest market value' column, so it predates the conversion/rename cells above --
# re-run the notebook top to bottom to refresh it.
player_market_value_development.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 280334 entries, 0 to 280333
Data columns (total 8 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   mw                    280334 non-null  float64
 1   datum_mw              280334 non-null  object 
 2   verein                280334 non-null  object 
 3   age                   280334 non-null  int64  
 4   player_id             280334 non-null  int64  
 5   Highest market value  280334 non-null  float64
 6   highest_date          280334 non-null  object 
 7   last_change           280334 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 17.1+ MB
