# Web Scraping Futbin.com

1. [Loading our Data and Libraries](#ld) <br>
2. [Scraping the Futbin-ID](#Id) <br>
3. [Scraping the Time-Series Data](#ts) <br>

***

## Loading/Preparing our Data and Libraries
<a id="ld"></a>

In [2]:
import pandas as pd
import numpy as np
import requests
import json
import requests
from datetime import datetime
from bs4 import BeautifulSoup

In [3]:
fifa = pd.read_csv('FIFA19 - Ultimate Team players.csv', low_memory=False)

In [112]:
fifa.head()

Unnamed: 0,player_ID,player_name,player_extended_name,quality,revision,origin,overall,club,league,nationality,...,rf,lf,rw,lw,st,price_ps4,price_xbox,price_pc,traits,specialties
0,2,Maradona,Diego Maradona,Gold - Rare,Icon,,97,Icons,Icons,Argentina,...,94,94,94,94.0,90,,,,"Avoids Using Weaker Foot, Finesse Shot, Flair,...","Speedster, Dribbler, Play Maker, Distance Shoo..."
1,3,Ronaldo,Nazário de Lima Ronaldo Luís,Gold - Rare,Icon,,96,Icons,Icons,Brazil,...,94,94,92,92.0,94,,15000000.0,,"Tries To Beat Defensive Line, Finesse Shot","Speedster, Dribbler, Distance Shooter, FK Spec..."
2,4,Pelé,Arantes Nascimento Edson,Gold - Rare,Icon,,95,Icons,Icons,Brazil,...,94,94,94,94.0,92,9250000.0,6200000.0,,Finesse Shot,"Speedster, Dribbler, Distance Shooter, Crosser..."
3,5,Maradona,Diego Maradona,Gold - Rare,Icon,,95,Icons,Icons,Argentina,...,92,92,92,92.0,88,5000000.0,2799000.0,,"Avoids Using Weaker Foot, Finesse Shot, Flair,...","Dribbler, Play Maker, Distance Shooter, Crosse..."
4,6,Maldini,Paolo Maldini,Gold - Rare,Icon,,94,Icons,Icons,Italy,...,68,68,69,69.0,70,,,,Team Player,"Aerial Threat, Tackler, Tactician, Complete De..."


In [5]:
fifa['player_ID'].duplicated().sum()

0

In [36]:
# Remove the rows of the two player_IDs that were flagged as incorrect entries
fifa = fifa[~fifa['player_ID'].isin([210, 16581])]

In [37]:
player_ids = fifa.player_ID.values

In [38]:
# Break the player IDs into ten batches so that the data can be retrieved
# one batch at a time instead of all at once
(ids1, ids2, ids3, ids4, ids5,
 ids6, ids7, ids8, ids9, ids10) = np.array_split(player_ids, 10)

***

## Scraping the Futbin-ID
<a id="Id"></a>

In [19]:
# Pieces of the player page URL: <domain>/<version>/<section>/<player_ID>
domain = 'https://www.futbin.com'
version = 19
section = 'player'

# Module-level accumulators: get_futbin_ID() appends to both lists in lockstep,
# so FUTBIN_IDs[i] is the Futbin-ID that was scraped for PLAYER_IDs[i].
FUTBIN_IDs=[]
PLAYER_IDs=[]


# We need to get the special Futbin-ID for each player in order to subsequently scrape the price data
def get_futbin_ID(player_ids):
    """Scrape the Futbin-ID of every player in ``player_ids``.

    For each ID the page ``<domain>/<version>/<section>/<ID>`` is requested and
    the ``data-player-resource`` attribute of the ``div`` whose id is
    ``page-info`` is read from the returned HTML. The value is appended to the
    module-level list ``FUTBIN_IDs`` and the ID itself to ``PLAYER_IDs``.
    The running total is printed every 200 collected IDs.

    Args:
        player_ids: iterable of player IDs to look up.

    Returns:
        None. Results are accumulated in ``FUTBIN_IDs`` and ``PLAYER_IDs``.

    Raises:
        ValueError: if a page does not contain the ``page-info`` div.
        KeyError: if that div has no ``data-player-resource`` attribute.
    """
    for ID in player_ids:
        url = "%s/%s/%s/%s" % (domain, version, section, ID)
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        page_info = soup.find("div", {"id": "page-info"})
        if page_info is None:
            # find() returns None when nothing matches; fail with a message that
            # names the player instead of an opaque "NoneType" subscript error.
            raise ValueError(
                "No 'page-info' element found for player %s (%s)" % (ID, url)
            )
        # Append to both lists together so they stay aligned index by index
        FUTBIN_IDs.append(page_info['data-player-resource'])
        PLAYER_IDs.append(ID)
        # Printing the Progress
        if len(FUTBIN_IDs) % 200 == 0:
            # Call form of print: same output on Python 2, valid syntax on Python 3
            print(len(FUTBIN_IDs))

In [21]:
get_futbin_ID(ids1)

200
400
600
800
1000
1200
1400
1600


In [22]:
get_futbin_ID(ids2)

1800
2000
2200
2400
2600
2800
3000
3200


In [23]:
get_futbin_ID(ids3)

3400
3600
3800
4000
4200
4400
4600
4800


In [24]:
get_futbin_ID(ids4)

5000
5200
5400
5600
5800
6000
6200
6400


In [25]:
get_futbin_ID(ids5)

6600
6800
7000
7200
7400
7600
7800
8000
8200


In [26]:
get_futbin_ID(ids6)

8400
8600
8800
9000
9200
9400
9600
9800


In [27]:
get_futbin_ID(ids7)

10000
10200
10400
10600
10800
11000
11200
11400


In [28]:
get_futbin_ID(ids8)

11600
11800
12000
12200
12400
12600
12800
13000


In [30]:
get_futbin_ID(ids9)

14800
15000
15200
15400
15600
15800
16000
16200


In [39]:
get_futbin_ID(ids10)

17800
18000
18200
18400
18600
18800
19000
19200


In [40]:
len(FUTBIN_IDs)

19308

In [41]:
len(PLAYER_IDs)

19308

In [42]:
# Creating a Dataframe for the IDs
ID_df = pd.DataFrame({'Futbin_ID': FUTBIN_IDs,
                      'player_ID': PLAYER_IDs})

In [49]:
ID_df.duplicated().sum()

0

In [48]:
ID_df = ID_df.drop_duplicates()

In [45]:
fifa.shape

(16462, 83)

In [50]:
ID_df.shape

(16462, 2)

In [51]:
# Saving the Player- and Futbin-ID as CSV
ID_df.to_csv('Player_Futbin_ID.csv')

In [52]:
# Appending the original Fifa data with the Futbin-ID
fifa_w_futbin_ID = pd.merge(fifa, ID_df, on='player_ID')

In [113]:
fifa_w_futbin_ID.head()

Unnamed: 0,player_ID,player_name,player_extended_name,quality,revision,origin,overall,club,league,nationality,...,lf,rw,lw,st,price_ps4,price_xbox,price_pc,traits,specialties,Futbin_ID
0,2,Maradona,Diego Maradona,Gold - Rare,Icon,,97,Icons,Icons,Argentina,...,94,94,94.0,90,,,,"Avoids Using Weaker Foot, Finesse Shot, Flair,...","Speedster, Dribbler, Play Maker, Distance Shoo...",190042
1,3,Ronaldo,Nazário de Lima Ronaldo Luís,Gold - Rare,Icon,,96,Icons,Icons,Brazil,...,94,92,92.0,94,,15000000.0,,"Tries To Beat Defensive Line, Finesse Shot","Speedster, Dribbler, Distance Shooter, FK Spec...",37576
2,4,Pelé,Arantes Nascimento Edson,Gold - Rare,Icon,,95,Icons,Icons,Brazil,...,94,94,94.0,92,9250000.0,6200000.0,,Finesse Shot,"Speedster, Dribbler, Distance Shooter, Crosser...",237068
3,5,Maradona,Diego Maradona,Gold - Rare,Icon,,95,Icons,Icons,Argentina,...,92,92,92.0,88,5000000.0,2799000.0,,"Avoids Using Weaker Foot, Finesse Shot, Flair,...","Dribbler, Play Maker, Distance Shooter, Crosse...",237074
4,6,Maldini,Paolo Maldini,Gold - Rare,Icon,,94,Icons,Icons,Italy,...,68,69,69.0,70,,,,Team Player,"Aerial Threat, Tackler, Tactician, Complete De...",238439


In [55]:
# Saving the updated Fifa data as CSV
fifa_w_futbin_ID.to_csv('Fifa_w_Futbin_ID.csv')

***

## Scraping the Time-Series Data
<a id="ts"></a>

In [56]:
fifa_w_futbin_ID['Futbin_ID'].duplicated().sum()

0

In [58]:
# We make a dictionary that maps each player_name to its Futbin_ID
# NOTE(review): player_name is not unique in this data -- e.g. 'Maradona' appears
# for player_ID 2 and player_ID 5 in the head() output above, each with a different
# Futbin_ID. dict() keeps only the last value seen per key, so players who share a
# name collapse into a single entry here and the others are never scraped.
# Confirm whether that is intended before relying on the resulting time series.
names_ids= dict(zip(fifa_w_futbin_ID['player_name'], fifa_w_futbin_ID['Futbin_ID']))

In [59]:
# Source: https://gist.github.com/miloir/2196917
def split_dict_equally(input_dict, chunks=2):
    """Split a dict into ``chunks`` dicts of near-equal size.

    The items are dealt out round-robin in the dict's iteration order: the
    first item goes to the first dict, the second item to the second dict,
    and so on, wrapping around after the last dict. The sizes of the
    resulting dicts therefore differ by at most one.

    Args:
        input_dict: the dict to split. It is not modified.
        chunks: number of dicts to produce (default 2).

    Returns:
        A list of ``chunks`` dicts. Some of them are empty when
        ``input_dict`` has fewer than ``chunks`` items.

    Raises:
        ValueError: if ``chunks`` is smaller than 1.
    """
    if chunks < 1:
        raise ValueError("chunks must be at least 1, got %r" % (chunks,))
    # range() and items() exist on both Python 2 and Python 3
    # (xrange() and iteritems() are Python 2 only).
    return_list = [dict() for _ in range(chunks)]
    for position, (key, value) in enumerate(input_dict.items()):
        # position % chunks cycles 0, 1, ..., chunks-1, 0, 1, ...
        return_list[position % chunks][key] = value
    return return_list

In [61]:
# Splitting the dictionary into multiple parts so we don't have to retrieve all of the data at once
dict1, dict2, dict3, dict4, dict5, dict6, dict7, dict8, dict9, dict10 = split_dict_equally(names_ids, chunks=10)

In [65]:
# The Futbin-ID is used to scrape the time-series data of each player.
# get_prices() fills these four module-level lists in lockstep, one entry per price point.
dates=[]
prices=[]
names=[]
IDs=[]

def get_prices(name_ID):
    """Collect the daily price history of every player in ``name_ID``.

    ``name_ID`` maps a player name to a Futbin-ID. For each pair the daily
    graph endpoint is requested and every entry of the ``'ps'`` series in the
    JSON response is appended to the module-level lists ``dates``, ``prices``,
    ``names`` and ``IDs``. The running total is printed every 1000 entries.
    Nothing is returned.
    """
    graph_url = 'https://www.futbin.com/19/playerGraph?type=daily_graph&year=19&player={0}'
    for name, ID in name_ID.items():
        response = requests.get(graph_url.format(ID))
        payload = response.json()
        # Swap 'ps' for 'xbox' or 'pc' to collect the prices of another platform
        for point in payload['ps']:
            # point[0] is the timestamp; it carries three extra zeroes, so it is
            # divided by 1000 before being converted to a calendar date
            day = datetime.utcfromtimestamp(point[0] / 1000).strftime('%Y-%m-%d')
            dates.append(day)
            # point[1] is the price recorded for that day
            prices.append(point[1])
            names.append(name)
            IDs.append(ID)
            # Report the progress
            if not len(names) % 1000:
                print(len(names))

In [66]:
get_prices(dict1)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000


In [80]:
get_prices(dict2)

43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000


In [81]:
get_prices(dict3)

64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000


In [82]:
get_prices(dict4)

85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000


In [85]:
get_prices(dict5)

106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000


In [86]:
get_prices(dict6)

127000
128000
129000
130000
131000
132000
133000
134000
135000
136000
137000
138000
139000
140000
141000
142000
143000
144000
145000
146000
147000


In [89]:
get_prices(dict7)

148000
149000
150000
151000
152000
153000
154000
155000
156000
157000
158000
159000
160000
161000
162000
163000
164000
165000
166000
167000
168000


In [94]:
get_prices(dict8)

169000
170000
171000
172000
173000
174000
175000
176000
177000
178000
179000
180000
181000
182000
183000
184000
185000
186000
187000
188000
189000


In [95]:
get_prices(dict9)

190000
191000
192000
193000
194000
195000
196000
197000
198000
199000
200000
201000
202000
203000
204000
205000
206000
207000
208000
209000
210000


In [97]:
get_prices(dict10)

228000
229000
230000
231000
232000
233000
234000
235000
236000
237000
238000
239000
240000
241000
242000
243000
244000
245000
246000
247000
248000


In [98]:
# We create a dataframe with the time-series price data for each player
time_series_df = pd.DataFrame({'Futbin_ID': IDs,
                               'Name': names,
                               'price': prices,
                               'date': dates})

In [101]:
time_series_df.duplicated().sum()

0

In [100]:
time_series_df = time_series_df.drop_duplicates()

In [102]:
# We save the time-series price data for each player
time_series_df.to_csv('fifa_price_time_series.csv')

In [103]:
time_series_df.shape

(210715, 4)

In [109]:
# We create a dataframe with the time-series data and all other information available for the player
time_series_full = pd.merge(time_series_df, fifa_w_futbin_ID, how='left', on=['Futbin_ID'])

In [105]:
# Shape of the merged frame; `time_series_full` is the name the merge result is
# assigned to (`df_try` is not defined anywhere in this notebook)
time_series_full.shape

(210715, 87)

In [114]:
# Preview of the merged frame; `time_series_full` is the name the merge result is
# assigned to (`df_try` is not defined anywhere in this notebook)
time_series_full.head()

Unnamed: 0,Futbin_ID,Name,date,price,player_ID,player_name,player_extended_name,quality,revision,origin,...,rf,lf,rw,lw,st,price_ps4,price_xbox,price_pc,traits,specialties
0,242147,Piñones,2018-09-21,200,12833,Piñones,Wilson Piñones,Bronze,Normal,,...,53,53,54,54.0,52,200.0,200.0,,,
1,242147,Piñones,2018-09-22,200,12833,Piñones,Wilson Piñones,Bronze,Normal,,...,53,53,54,54.0,52,200.0,200.0,,,
2,242147,Piñones,2018-09-23,200,12833,Piñones,Wilson Piñones,Bronze,Normal,,...,53,53,54,54.0,52,200.0,200.0,,,
3,242147,Piñones,2018-09-24,200,12833,Piñones,Wilson Piñones,Bronze,Normal,,...,53,53,54,54.0,52,200.0,200.0,,,
4,242147,Piñones,2018-09-25,200,12833,Piñones,Wilson Piñones,Bronze,Normal,,...,53,53,54,54.0,52,200.0,200.0,,,


In [110]:
# We save the full time-series data as CSV
time_series_full.to_csv('time_series_full.csv')