# Cleaning the Premier League datasets from 2017-2020

To start with, we will focus on the Premier League seasons only; in later notebooks we will look at other leagues and players. In this notebook we focus on extracting the data from its raw JSON format into a more readable format. All source files are stored in the 'src_files/Football_Players' directory and were retrieved from the public dataset available on Kaggle: https://www.kaggle.com/datasets/diegobartoli/top5legauesplayers-statsandphys

In [3]:
import pandas as pd
import os
import matplotlib.pylab as plt
import seaborn as sns
import json 
import numpy


%matplotlib inline 

# Import Data from raw JSON

In [7]:
# Read one line-delimited JSON file per Premier League season (2017-2020).
# `lines=True` tells pandas to parse each line of the file as its own JSON
# object. The generator yields the frames in the same order as the names
# on the left-hand side, so each season lands in its own variable.
data_pl_2017, data_pl_2018, data_pl_2019, data_pl_2020 = (
    pd.read_json(season_path, lines=True)
    for season_path in (
        "../src_files/Football_Players/2017PremierLeague.json",
        "../src_files/Football_Players/2018PremierLeague.json",
        "../src_files/Football_Players/2019PremierLeague.json",
        "../src_files/Football_Players/2020PremierLeague.json",
    )
)

In [8]:
# Preview the first five rows of the 2017 season as loaded from the raw JSON.
data_pl_2017.head(n=5)

Unnamed: 0,_id,age,defensive_stats,general_stats,height,name,nationality,offensive_stats,passing_stats,position,team,weight
0,{'$oid': '61eda24e19e7e448017988d0'},33,"{'Tkl': 36.0, 'TklW': 23.0, 'Past': 8.0, 'Pres...","{'games': 32.0, 'time': 2817.0, 'red_cards': 0...",186,Wes Morgan,Jamaica,"{'goals': 0.0, 'xG': 1.9300000000000002, 'assi...","{'Cmp': 739.0, 'Cmp%': 84.1, '1/3': 35.0, 'PPA...",DF,Leicester,101
1,{'$oid': '61eda24e19e7e448017988d1'},25,"{'Tkl': 18.0, 'TklW': 16.0, 'Past': 14.0, 'Pre...","{'games': 20.0, 'time': 1179.0, 'red_cards': 0...",172,Jack Wilshere,England,"{'goals': 1.0, 'xG': 1.16, 'assists': 3.0, 'xA...","{'Cmp': 703.0, 'Cmp%': 86.6, '1/3': 64.0, 'PPA...",MF,Arsenal,68
2,{'$oid': '61eda24e19e7e448017988d2'},31,"{'Tkl': 18.0, 'TklW': 12.0, 'Past': 18.0, 'Pre...","{'games': 25.0, 'time': 1842.0, 'red_cards': 0...",178,Andrew Surman,England,"{'goals': 2.0, 'xG': 0.99, 'assists': 5.0, 'xA...","{'Cmp': 997.0, 'Cmp%': 84.4, '1/3': 97.0, 'PPA...",MF,Bournemouth,73
3,{'$oid': '61eda24e19e7e448017988d3'},34,"{'Tkl': 19.0, 'TklW': 13.0, 'Past': 18.0, 'Pre...","{'games': 35.0, 'time': 2204.0, 'red_cards': 0...",183,Glenn Murray,England,"{'goals': 12.0, 'xG': 12.38, 'assists': 0.0, '...","{'Cmp': 380.0, 'Cmp%': 58.0, '1/3': 18.0, 'PPA...",FW,Brighton,80
4,{'$oid': '61eda24e19e7e448017988d4'},36,"{'Tkl': 47.0, 'TklW': 27.0, 'Past': 28.0, 'Pre...","{'games': 25.0, 'time': 1935.0, 'red_cards': 0...",183,Gareth Barry,England,"{'goals': 1.0, 'xG': 1.3599999999999999, 'assi...","{'Cmp': 720.0, 'Cmp%': 75.9, '1/3': 99.0, 'PPA...",MF,West Bromwich Albion,78


In [9]:
# Preview the first five rows of the 2018 season as loaded from the raw JSON.
data_pl_2018.head(n=5)

Unnamed: 0,_id,age,defensive_stats,general_stats,height,name,nationality,offensive_stats,passing_stats,position,team,weight
0,{'$oid': '61eda25019e7e44801798eb5'},34,"{'Tkl': 32.0, 'TklW': 17.0, 'Past': 2.0, 'Pres...","{'games': 22.0, 'time': 1932.0, 'red_cards': 2...",186.0,Wes Morgan,Jamaica,"{'goals': 3.0, 'xG': 1.54, 'assists': 0.0, 'xA...","{'Cmp': 742.0, 'Cmp%': 83.5, '1/3': 30.0, 'PPA...",DF,Leicester,101.0
1,{'$oid': '61eda25019e7e44801798eb6'},33,"{'Tkl': 24.0, 'TklW': 15.0, 'Past': 15.0, 'Pre...","{'games': 17.0, 'time': 1220.0, 'red_cards': 0...",183.0,Simon Francis,England,"{'goals': 0.0, 'xG': 0.12, 'assists': 2.0, 'xA...","{'Cmp': 598.0, 'Cmp%': 74.0, '1/3': 56.0, 'PPA...",DF,Bournemouth,90.0
2,{'$oid': '61eda25019e7e44801798eb7'},32,"{'Tkl': 15.0, 'TklW': 10.0, 'Past': 12.0, 'Pre...","{'games': 18.0, 'time': 1440.0, 'red_cards': 0...",178.0,Andrew Surman,England,"{'goals': 0.0, 'xG': 0.27, 'assists': 0.0, 'xA...","{'Cmp': 725.0, 'Cmp%': 87.2, '1/3': 58.0, 'PPA...",MF,Bournemouth,73.0
3,{'$oid': '61eda25019e7e44801798eb8'},35,"{'Tkl': 14.0, 'TklW': 7.0, 'Past': 19.0, 'Pres...","{'games': 38.0, 'time': 2517.0, 'red_cards': 0...",183.0,Glenn Murray,England,"{'goals': 13.0, 'xG': 11.54, 'assists': 1.0, '...","{'Cmp': 378.0, 'Cmp%': 60.7, '1/3': 25.0, 'PPA...",FW,Brighton,80.0
4,{'$oid': '61eda25019e7e44801798eb9'},30,"{'Tkl': 11.0, 'TklW': 8.0, 'Past': 9.0, 'Press...","{'games': 15.0, 'time': 843.0, 'red_cards': 1....",188.0,Kevin McDonald,Scotland,"{'goals': 0.0, 'xG': 0.11, 'assists': 0.0, 'xA...","{'Cmp': 488.0, 'Cmp%': 85.8, '1/3': 42.0, 'PPA...",MF,Fulham,82.0


In [10]:
# Preview the first five rows of the 2019 season as loaded from the raw JSON.
data_pl_2019.head(n=5)

Unnamed: 0,_id,age,defensive_stats,general_stats,height,name,nationality,offensive_stats,passing_stats,position,team,weight
0,{'$oid': '61eda25119e7e448017994e0'},34,"{'Tkl': 17.0, 'TklW': 12.0, 'Past': 14.0, 'Pre...","{'games': 15.0, 'time': 956.0, 'red_cards': 1....",183.0,Simon Francis,England,"{'goals': 0.0, 'xG': 0.15, 'assists': 0.0, 'xA...","{'Cmp': 425.0, 'Cmp%': 78.1, '1/3': 26.0, 'PPA...",DF,Bournemouth,90.0
1,{'$oid': '61eda25119e7e448017994e1'},36,"{'Tkl': 3.0, 'TklW': 2.0, 'Past': 6.0, 'Press'...","{'games': 23.0, 'time': 761.0, 'red_cards': 0....",183.0,Glenn Murray,England,"{'goals': 1.0, 'xG': 2.61, 'assists': 1.0, 'xA...","{'Cmp': 116.0, 'Cmp%': 65.5, '1/3': 7.0, 'PPA'...",FW,Brighton,80.0
2,{'$oid': '61eda25119e7e448017994e2'},32,"{'Tkl': 25.0, 'TklW': 17.0, 'Past': 12.0, 'Pre...","{'games': 18.0, 'time': 1007.0, 'red_cards': 0...",183.0,Ahmed Elmohamady,Egypt,"{'goals': 1.0, 'xG': 0.2, 'assists': 1.0, 'xA'...","{'Cmp': 484.0, 'Cmp%': 76.5, '1/3': 44.0, 'PPA...",DF,Aston Villa,75.0
3,{'$oid': '61eda25119e7e448017994e3'},31,"{'Tkl': 7.0, 'TklW': 5.0, 'Past': 5.0, 'Press'...","{'games': 24.0, 'time': 1462.0, 'red_cards': 0...",173.0,Sergio Agüero,Argentina,"{'goals': 16.0, 'xG': 15.56, 'assists': 3.0, '...","{'Cmp': 284.0, 'Cmp%': 77.4, '1/3': 12.0, 'PPA...",FW,Manchester City,70.0
4,{'$oid': '61eda25119e7e448017994e4'},33,"{'Tkl': 25.0, 'TklW': 12.0, 'Past': 27.0, 'Pre...","{'games': 20.0, 'time': 1503.0, 'red_cards': 1...",180.0,Adrian Mariappa,Jamaica,"{'goals': 0.0, 'xG': 0.17, 'assists': 0.0, 'xA...","{'Cmp': 426.0, 'Cmp%': 72.8, '1/3': 32.0, 'PPA...",DF,Watford,78.0


In [11]:
# Preview the first five rows of the 2020 season as loaded from the raw JSON.
data_pl_2020.head(n=5)

Unnamed: 0,_id,age,defensive_stats,general_stats,height,name,nationality,offensive_stats,passing_stats,position,team,weight
0,{'$oid': '61eda25219e7e44801799abc'},31,"{'Tkl': 3.0, 'TklW': 3.0, 'Past': 0.0, 'Press'...","{'games': 19.0, 'time': 522.0, 'red_cards': 0....",183.0,Hal Robson-Kanu,Wales,"{'goals': 2.0, 'xG': 1.5, 'assists': 0.0, 'xA'...","{'Cmp': 64.0, 'Cmp%': 74.4, '1/3': 7.0, 'PPA':...",FW,West Bromwich Albion,83.0
1,{'$oid': '61eda25219e7e44801799abd'},27,"{'Tkl': 8.0, 'TklW': 5.0, 'Past': 10.0, 'Press...","{'games': 21.0, 'time': 512.0, 'red_cards': 0....",180.0,Alireza Jahanbakhsh,Iran,"{'goals': 0.0, 'xG': 1.87, 'assists': 1.0, 'xA...","{'Cmp': 196.0, 'Cmp%': 74.0, '1/3': 4.0, 'PPA'...",FW,Brighton,76.0
2,{'$oid': '61eda25219e7e44801799abe'},24,"{'Tkl': 41.0, 'TklW': 26.0, 'Past': 17.0, 'Pre...","{'games': 31.0, 'time': 2732.0, 'red_cards': 1...",192.0,Joachim Andersen,Denmark,"{'goals': 1.0, 'xG': 1.3599999999999999, 'assi...","{'Cmp': 1529.0, 'Cmp%': 83.4, '1/3': 121.0, 'P...",DF,Fulham,90.0
3,{'$oid': '61eda25219e7e44801799abf'},33,"{'Tkl': 0.0, 'TklW': 0.0, 'Past': 1.0, 'Press'...","{'games': 37.0, 'time': 3330.0, 'red_cards': 0...",190.0,Vicente Guaita,Spain,"{'goals': 0.0, 'xG': 0.0, 'assists': 0.0, 'xA'...","{'Cmp': 598.0, 'Cmp%': 55.4, '1/3': 7.0, 'PPA'...",GK,Crystal Palace,81.0
4,{'$oid': '61eda25219e7e44801799ac0'},28,"{'Tkl': 37.0, 'TklW': 20.0, 'Past': 16.0, 'Pre...","{'games': 27.0, 'time': 1424.0, 'red_cards': 0...",178.0,Jeffrey Schlupp,Ghana,"{'goals': 2.0, 'xG': 2.02, 'assists': 3.0, 'xA...","{'Cmp': 304.0, 'Cmp%': 70.9, '1/3': 31.0, 'PPA...",MF,Crystal Palace,72.0


For now we will focus only on Premier League players and the seasons from 2017-2020. We've imported the raw data from JSON, and the first 5 rows of each season we are working with are shown above. We can see that the data is hard to read, so we will need to do some data manipulation to convert the JSON into a readable format.