In [9]:
import pandas as pd
import numpy as np

In [10]:
df = pd.read_csv('main_task.csv')

In [17]:
# Make function to obtain the link (return string)
def get_url(df, idx):
    """ Returns full link 
        df - dataframe
        isx - index of row to extract the link"""
    
    return 'https://www.tripadvisor.com/' + df['URL_TA'][idx]

link = get_url(df, 320)
link

'https://www.tripadvisor.com//Restaurant_Review-g187849-d7734583-Reviews-Gioby_Pizzeria-Milan_Lombardy.html'

python modules need to be installed by pip or conda  

- conda install requests  
- conda install beautifulsoup4

In [12]:
import time        # measure finish and start  execution time
import requests    # get link
import bs4         # access HTML code

In [13]:
def get_evaluation(df):
    """ Get dataframe and obtain new features based on the evaluation """
    
    # Make new column with URL
    df['URL'] = df["URL_TA"].apply(lambda x: 'https://www.tripadvisor.com/' + x)
    
    # Create new columns
    df['nr_reviews'] = 0
    df['excellent'] = 0
    df['v_good'] = 0
    df['average'] = 0
    df['poor'] = 0
    df['terrible'] = 0
    
    #listik = []
    
    for item in df.index:

        # get the link and access the html page
        res = requests.get(df["URL"][item])
        test_read = bs4.BeautifulSoup(res.text, 'html.parser')
        
        
        # get info from the internet
        review = test_read.select('#REVIEWS > div.prw_rup.prw_common_location_content_header_resp > div > div.title_text')
        excellent = test_read.select('#taplc_detail_filters_rr_resp_0 > div > div.noncollapsible > div.prw_rup.prw_filters_detail_checkbox > div > div.content > div > div:nth-child(1) > span.row_num')
        vg = test_read.select('#taplc_detail_filters_rr_resp_0 > div > div.noncollapsible > div.prw_rup.prw_filters_detail_checkbox > div > div.content > div > div:nth-child(2) > span.row_num')
        av = test_read.select('#taplc_detail_filters_rr_resp_0 > div > div.noncollapsible > div.prw_rup.prw_filters_detail_checkbox > div > div.content > div > div:nth-child(3) > span.row_num')
        poor = test_read.select('#taplc_detail_filters_rr_resp_0 > div > div.noncollapsible > div.prw_rup.prw_filters_detail_checkbox > div > div.content > div > div:nth-child(4) > span.row_num')
        ter = test_read.select('#taplc_detail_filters_rr_resp_0 > div > div.noncollapsible > div.prw_rup.prw_filters_detail_checkbox > div > div.content > div > div:nth-child(5) > span.row_num')
        
        # in case of bad link or incorrect paper or 0 reviews
        if review == [] or review[0].getText() == 'Reviews  (0)' or review[0].getText() == 'Be the first to write a review ':
            # Set value for each row
            df.at[item,'nr_reviews'] = 0
            df.at[item, 'excellent'] = 0
            df.at[item, 'v_good'] = 0
            df.at[item, 'average'] = 0
            df.at[item, 'poor'] = 0
            df.at[item, 'terrible'] = 0
        
        # in case of a good link
        else:
            # Set value for each row
            df.at[item,'nr_reviews'] = int(review[0].getText().replace('Reviews ', '').replace(' (', '').replace(')', '').replace(',',''))
            df.at[item, 'excellent'] = int(excellent[0].getText().replace(',',''))
            df.at[item, 'v_good'] = int(vg[0].getText().replace(',',''))
            df.at[item, 'average'] = int(av[0].getText().replace(',',''))
            df.at[item, 'poor'] = int(poor[0].getText().replace(',',''))
            df.at[item, 'terrible'] = int(ter[0].getText().replace(',',''))
            
    return df

In [14]:
# MAke a new df for test
new_df = df[0:10].copy()

start = time.time()
a = get_evaluation(new_df)
finish = time.time()

print(f'Total time to execute the code is {finish - start} seconds')

Total time to execute the code is 145.71943497657776 seconds


### Multiprocess

In [15]:
import multiprocessing as mp
from multiprocessing import  Pool

print("Number of processors: ", mp.cpu_count())

Number of processors:  4


In [16]:
# Parrallel computing with pandas. Split dataset into core count
# Simultaniously takse data from the internet
# joins back the dataset
def parallelize_dataframe(df, func, n_cores=20):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [None]:
# MAke a new df for test
new_df = df[0:10].copy()

start = time.time()
train = parallelize_dataframe(new_df, get_evaluation)
finish = time.time()

print(f'Total time to execute the code is {finish - start} seconds')
train