# Crawling and Scraping using Beautiful Soup
The Beautiful Soup library was used to crawl the website and to scrape, clean, and organize the data into a pandas DataFrame, which was then written to a CSV file.

Beautiful Soup lets us create objects out of web pages. To extract the attributes of each player, the player's URL was used to crawl from the listing page to that player's own page. Beautiful Soup provides functions such as find and find_all, which extract data from the HTML according to the classes under which it is organized in the page markup. The extracted data was organized into a dictionary. Attributes such as height, weight, player skills, etc. were then extracted.




In [19]:
from bs4 import BeautifulSoup as bs
import requests
import re


def soup_maker(url, timeout=30):
    '''
    Downloads a webpage and returns it parsed as a Beautiful Soup object.
    The rest of the functions scrape their data from the object returned here.

    url     : address of the page to download; must be a str.
    timeout : seconds handed to requests.get as its timeout. Without a
              timeout requests waits indefinitely, so a single stalled
              connection would hang the whole crawl.

    Returns the Beautiful Soup object built from the response body with the
    'lxml' parser.
    Raises AssertionError when url is not a str; errors raised by
    requests.get (including timeouts) propagate to the caller.
    '''
    assert isinstance(url, str)
    response = requests.get(url, timeout=timeout)
    return bs(response.content, 'lxml')


def playerall(soup):
    '''
    Collects the details of every player linked from a listing page.

    Finds the table with class 'table table-hover persist-area', walks the
    anchors inside its tbody and, for each anchor whose href starts with
    '/player/', passes the player's URL to player_all_details.

    soup : Beautiful Soup object of the listing page.

    Returns a list with one dictionary per player link. Each dictionary holds
    'short_name' (the anchor text) followed by everything that
    player_all_details returned for that player.
    '''
    assert isinstance(soup, bs)

    table = soup.find('table', {'class': 'table table-hover persist-area'})
    result = []
    for anchor in table.find('tbody').find_all('a'):
        # An anchor without an href cannot be a player link; .get avoids a
        # KeyError on such anchors.
        if not anchor.get('href', '').startswith('/player/'):
            continue
        # Build a fresh dictionary for every player so that keys scraped for
        # an earlier player can never leak into a later player's record.
        details = {'short_name': anchor.text}
        details.update(player_all_details('http://sofifa.com' + anchor['href']))
        result.append(details)
    return result

def player_all_details(url):
    '''
    Takes the URL of a player's page and returns one dictionary with all of
    the player's details.

    The page is downloaded with soup_maker and three of its sections are
    handed to the specialised scrapers: the 'player' div to player_basic, the
    'stats' div to player_primary and the 'teams' div to player_secondary.
    The skills are added last by playerskills, which receives the URL itself.
    '''
    assert isinstance(url, str)
    page = soup_maker(url)
    details = {}
    details.update(player_basic(page.find('div', {'class': 'player'})))
    details.update(player_primary(page.find('div', {'class': 'stats'})))
    details.update(player_secondary(page.find('div', {'class': 'teams'})))
    details.update(playerskills(url))
    return details


def player_basic(soup):
    '''
    Returns the basic details of a player as a dictionary with the keys
    'image', 'full_name', 'dob', 'pref_pos', 'age', 'height' and 'weight'.

    soup : the element holding the player header; it must contain an img
           with a 'data-src' attribute, an h1 and a class-less span.

    The span text is parsed positionally. The token handling below implies a
    layout such as 'RW CFAge 30 (Jun 24, 1987) 170cm 72kg', where the last
    position is fused with the word 'Age' -- inferred from the code, confirm
    against the live page.

    Raises AttributeError when an expected element or the parenthesised date
    is missing, and IndexError/ValueError when the span text does not have
    the expected shape.
    '''
    player_data = {}
    player_data['image'] = soup.find('img')['data-src']
    player_data['full_name'] = soup.find('h1').text.split(' (')[0]
    span = soup.find('span', attrs={'class': None}).text.strip()
    # Raw string: '\(' is an invalid escape sequence in a normal literal and
    # triggers a SyntaxWarning on recent Python versions.
    dob = re.search(r'(\(.*)\)', span).group(0)
    player_data['dob'] = dob.replace('(', '').replace(')', '')
    infos = span.replace(dob + ' ', '').split(' ')
    # The fourth token from the end carries the last position glued to 'Age'.
    # Split off its final three characters so that 'Age' becomes a token of
    # its own and can be used as the anchor for the fields that follow.
    fused = infos[-4]
    infos[-4:-3] = [fused[:-3], fused[-3:]]
    age_index = infos.index('Age')
    player_data['pref_pos'] = infos[:age_index]
    player_data['age'] = int(infos[age_index + 1])
    player_data['height'] = int(infos[age_index + 2].replace('cm', ''))
    player_data['weight'] = int(infos[age_index + 3].replace('kg', ''))
    return player_data


def player_primary(soup):
    '''
    Returns the player's rating, potential, value and wage.

    soup : the element holding the player's stats; only its .text is used.

    Every run of digits in the text is collected in order. The first three
    become 'rating', 'potential' and 'value'; the wage is the fifth number
    when exactly five were found and the fourth otherwise, multiplied by 1000.

    Raises IndexError when the text holds fewer than four numbers.
    '''
    player_data = {}
    # Raw string: '\d' is an invalid escape sequence in a normal literal and
    # triggers a SyntaxWarning on recent Python versions.
    info = re.findall(r'\d+', soup.text)
    player_data['rating'] = int(info[0])
    player_data['potential'] = int(info[1])
    # NOTE(review): only the leading digits are kept, so a decimal part and
    # any unit suffix of the value are dropped -- confirm this is intended.
    player_data['value'] = int(info[2])
    # Presumably a fifth number appears when the value has a decimal part
    # (e.g. '95.5' yields '95' and '5'), which moves the wage one slot to the
    # right -- verify against the page.
    wage_index = 4 if len(info) == 5 else 3
    player_data['wage'] = int(info[wage_index]) * 1000
    return player_data

def player_secondary(soup):
    '''
    Returns the player's preferred foot and country.

    'preff_foot' is the third child of the element that contains the
    'Preferred Foot' label, stripped of newlines and spaces.
    'country' is a list: the markup is cut at every '>' and, for each piece
    that begins with a link to '/teams?', the text starting six characters
    after the word 'title' is kept.
    '''
    team_link_prefix = '\n<a href="/teams?'
    foot_label = soup.find('label', text='Preferred Foot')
    preferred_foot = foot_label.parent.contents[2].strip('\n ')
    countries = []
    for fragment in str(soup).split('>'):
        if fragment.startswith(team_link_prefix):
            countries.append(fragment[fragment.find('title') + 6:])
    return {'preff_foot': preferred_foot, 'country': countries}


def playerskills(url):
    '''
    Returns the player's skills (tackling, finishing, shot power, crossing,
    volleys etc.) as a dictionary mapping skill name to an integer score.

    The player's page is downloaded and the text of every li inside a div of
    class 'mb-20' is gathered; only the first 34 entries are used. Each entry
    is split on whitespace: the first token is the score, and the name is the
    second token for two-token entries or the last two tokens joined together
    otherwise.
    '''
    assert isinstance(url, str)
    page = soup_maker(url)
    entries = []
    for section in page.find_all('div', {'class': 'mb-20'}):
        entries.extend(item.text for item in section.find_all('li'))

    skills = {}
    for entry in entries[:34]:
        tokens = entry.split()
        if len(tokens) == 2:
            name = str(tokens[1])
        else:
            name = str(tokens[-2]) + str(tokens[-1])
        skills[name] = int(tokens[0])
    return skills




In [21]:
### Finds all the player details from the main webpage of sofifa
### Stores all the player information in a dictionary keyed by the player's full name

# NOTE(review): the page count is 0 (the loop was written as `range(0)`), so
# as it stands this cell fetches nothing and `players` stays empty. Set
# NUM_PAGES to the number of listing pages to crawl before running it.
NUM_PAGES = 0
# Step added to the `offset` query parameter for each successive page;
# presumably the number of players shown per listing page -- confirm.
OFFSET_STEP = 80

result_final=[]
players={}
for page in range(NUM_PAGES):
    print(page)
    url = 'http://sofifa.com/players?offset=' + str(OFFSET_STEP * page)
    soup = soup_maker(url)
    result = playerall(soup)
    # A separate loop variable keeps the page counter intact; players that
    # share a full name overwrite one another in the dictionary.
    for details in result:
        players[details['full_name']] = details
    print("Length of players is", len(players))

In [221]:
## Gathers the per-player dictionaries into a list and builds a pandas dataframe from it
import pandas as pd
playerlist = list(players.values())
my_df = pd.DataFrame(playerlist)

In [222]:
# Write the dataframe to 'players1.csv' in the current working directory.
# to_csv is called with its defaults, so the dataframe's row index is written
# as well (pandas default index=True).
my_df.to_csv('players1.csv')