In [1]:
import pandas as pd
import requests
import wykop
import collections
from bs4 import BeautifulSoup
from datetime import datetime
from cfg import appkey, secretkey, acckey

### Scrape comments from a single entry (no API keys required)

In [2]:
def parse_color(color):
    """Extract the numeric suffix from a ``color-<n>`` CSS class name.

    Args:
        color: CSS class name (string).

    Returns:
        The text after the last ``-`` converted to ``int`` when ``color``
        starts with ``'color'``; ``None`` for any other class name.
    """
    if not color.startswith('color'):
        return None
    # rpartition yields the text after the last '-' (or the whole string when
    # there is no '-'), i.e. the same piece as split('-')[-1].
    suffix = color.rpartition('-')[2]
    return int(suffix)

def get_comments_from_entry(entry_id, topic=None, timeout=10):
    """Scrape the comments shown on the public page of one Wykop entry.

    The page ``https://www.wykop.pl/wpis/<entry_id>`` is downloaded with
    ``requests`` and parsed with BeautifulSoup; no API keys are involved.

    Args:
        entry_id: Identifier of the entry; interpolated into the page URL.
        topic: Optional label stored under the ``'topic'`` key of every
            returned record. The key is omitted when ``topic`` is ``None``.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        A list with one dict per comment. Keys: ``id`` (the parent
        ``entry_id``), ``type`` (always ``'comment'``), ``content``, ``date``
        (``datetime``), ``author``, ``author_color``, ``author_sex``,
        ``blocked`` (always ``False``), ``vote_count`` (raw ``data-vc``
        attribute value), ``comments_count`` (always ``None``),
        ``with_image``, ``image_url`` and, when given, ``topic``.
        An empty list is returned when no comment list is found on the page
        or when fetching/parsing fails; in the latter case the entry id and
        the error are printed and every comment of the entry is dropped.
    """
    try:
        URL = f'https://www.wykop.pl/wpis/{entry_id}'
        # Without a timeout a stalled connection would block the scrape forever.
        page = requests.get(URL, timeout=timeout)

        soup = BeautifulSoup(page.content, "html.parser")
        body = soup.find('body')
        if body is None or body.find('ul', {'class': 'comments-stream'}) is None:
            return []

        # Comment blocks are looked up inside the first <ul class="sub">.
        # Presumably it is absent when the entry has no replies, so a missing
        # list is treated as "no comments" instead of an error.
        replies = body.find('ul', {'class': 'sub'})
        if replies is None:
            return []

        comment_blocks = replies.find_all(
            'div',
            {'class': ['wblock lcontrast dC authorComment', 'wblock lcontrast dC']},
        )

        comments_list = []
        for block in comment_blocks:
            author = block.find('div', {'class': 'author ellipsis'})
            author_login = author.find('b').get_text()

            # The colour class is read from the <a> tag, falling back to the
            # <b> tag when <a> carries no class attribute.
            color_classes = author.a.attrs.get("class")
            if color_classes is None:
                color_classes = author.b.attrs.get("class")
            author_color = parse_color(color_classes[0])

            # The second class of the first <img> is used as the sex marker;
            # fall back to None when the image has fewer than two classes
            # rather than failing the whole entry.
            avatar_classes = block.img.attrs.get('class') or []
            author_sex = avatar_classes[1] if len(avatar_classes) > 1 else None

            blocked = False
            vote_count = author.find('p', {'class': 'vC'}).attrs.get('data-vc')
            comments_count = None
            content = block.find('div', {'class': 'text'}).find('p').get_text().strip()
            date = datetime.strptime(author.find('time').attrs.get('title'), "%Y-%m-%d %H:%M:%S")

            media_content = block.find('div', {'class': 'media-content'})
            with_image = media_content is not None
            image_url = media_content.a.attrs.get('href') if with_image else None

            entry_data = {
                'id': entry_id,
                'type': 'comment',
                'content': content,
                'date': date,
                'author': author_login,
                'author_color': author_color,
                'author_sex': author_sex,
                'blocked': blocked,
                'vote_count': vote_count,
                'comments_count': comments_count,
                'with_image': with_image,
                'image_url': image_url
            }

            if topic is not None:
                entry_data['topic'] = topic

            comments_list.append(entry_data)
        return comments_list
    except Exception as e:
        # Best-effort scraping: report the failing entry and carry on with
        # the rest of the run instead of aborting it.
        print(entry_id, e)
        return []


### Scrape entry content (without comments) from the search engine using Wykop API v2

In [3]:
# Build the Wykop API client from the application key/secret imported from
# cfg, then authenticate it with the account key. The module-level `api`
# object is what get_dataset() uses to query the search endpoint.
api = wykop.WykopAPI(appkey, secretkey)
api.authenticate(acckey)

In [4]:
def parse_data_from_entry(entry, topic=None):
    """Flatten one entry returned by the search API into a record dict.

    Args:
        entry: Mapping describing the entry. The keys ``id``, ``body``,
            ``date``, ``vote_count``, ``blocked`` and ``author`` (itself a
            mapping with ``login`` and ``color``) are required;
            ``comments_count``, ``embed`` and ``author['sex']`` are optional.
        topic: Optional label stored under the ``'topic'`` key of the
            record. The key is omitted when ``topic`` is ``None``.

    Returns:
        dict with the keys ``id``, ``type`` (always ``'entry'``), ``content``,
        ``date`` (as found in ``entry``), ``author``, ``author_color``,
        ``author_sex``, ``blocked``, ``vote_count``, ``comments_count``
        (``None`` when absent), ``with_image``, ``image_url`` and, when
        given, ``topic``.

    Raises:
        KeyError: if one of the required keys is missing.
    """
    author = entry['author']

    # `embed` may be missing or explicitly null; both mean "no attachment".
    embed = entry.get('embed') or {}
    with_image = embed.get('type') == 'image'
    image_url = embed.get('url') if with_image else None

    entry_data = {
        'id': entry['id'],
        'type': 'entry',
        'content': entry['body'],
        'date': entry['date'],
        'author': author['login'],
        'author_color': author['color'],
        'author_sex': author.get('sex'),
        'blocked': entry['blocked'],
        'vote_count': entry['vote_count'],
        'comments_count': entry.get('comments_count'),
        'with_image': with_image,
        'image_url': image_url,
    }

    if topic is not None:
        entry_data['topic'] = topic

    return entry_data

def get_dataset(keyword_group, min_page, max_page, max_date, users_activity_counts, with_comments):
    """Collect entries (and optionally their comments) for one keyword group.

    Search-result pages ``min_page``..``max_page`` (inclusive) are requested
    through the module-level ``api`` client. Every entry is converted with
    ``parse_data_from_entry``; when ``with_comments`` is true, comments of
    entries reporting at least one comment are scraped with
    ``get_comments_from_entry``.

    Args:
        keyword_group: ``(keyword, search_query)`` pair. ``keyword`` is used
            as the topic label and as the activity-counter key;
            ``search_query`` is sent to the search endpoint.
        min_page: First result page to request.
        max_page: Last result page to request (inclusive).
        max_date: ``datetime`` cut-off. Despite its name it acts as a lower
            bound: scanning stops at the first entry dated before it.
        users_activity_counts: Nested mapping ``author -> keyword -> count``
            incremented in place for every collected entry and comment. It
            must create missing keys on access (e.g. nested ``defaultdict``).
        with_comments: Whether to scrape comments as well.

    Returns:
        ``pandas.DataFrame`` with one row per collected entry or comment.
    """
    transformed_response = []
    keyword, search_query = keyword_group

    for page in range(min_page, max_page + 1):
        response = api.search_entries(page=page, query=search_query)
        if not response:
            # Nothing came back for this page; presumably the results are
            # exhausted, so do not spend further requests on later pages.
            break

        reached_cutoff = False
        for post in response:
            post_date = datetime.strptime(post['date'], "%Y-%m-%d %H:%M:%S")
            if post_date < max_date:
                print(f'Stopping {keyword} on page {page}, last text date: {post_date}, max date: {max_date}')
                reached_cutoff = True
                break

            entry_data = parse_data_from_entry(post, topic=keyword)
            transformed_response.append(entry_data)
            users_activity_counts[entry_data['author']][keyword] += 1

            # comments_count is None when the API response lacked the field;
            # comparing None with 0 would raise, so treat it as "no comments".
            if with_comments and (entry_data['comments_count'] or 0) > 0:
                for comm in get_comments_from_entry(entry_data['id'], topic=keyword):
                    transformed_response.append(comm)
                    users_activity_counts[comm['author']][keyword] += 1

        if reached_cutoff:
            break

    return pd.DataFrame(transformed_response)

### Users selection

In [5]:
def get_most_active_users(users_activity_counts, keywords, min_activity_per_keyword, min_activity_sum, verbose=False):
    """Select users that are active enough in every keyword.

    A user is kept when the activity count for each keyword is at least
    ``min_activity_per_keyword`` and the total over all keywords is strictly
    greater than ``min_activity_sum``.

    Args:
        users_activity_counts: Mapping ``user -> {keyword: count}``. It is
            only read; keywords a user has no count for are treated as 0.
        keywords: Keywords every selected user must satisfy.
        min_activity_per_keyword: Inclusive lower bound for each keyword.
        min_activity_sum: Exclusive lower bound for the summed activity.
        verbose: When true, print the number of users before and after
            filtering.

    Returns:
        List of selected users, in the iteration order of
        ``users_activity_counts``.
    """
    most_active_users = []
    for user, activity in users_activity_counts.items():
        # .get() avoids inserting zero entries into a defaultdict (which
        # would later leak into the saved JSON) and also supports plain dicts.
        counts = [activity.get(key, 0) for key in keywords]
        enough_per_keyword = all(count >= min_activity_per_keyword for count in counts)
        if enough_per_keyword and sum(counts) > min_activity_sum:
            most_active_users.append(user)

    if verbose:
        print('Before: ', len(users_activity_counts))
        print('After: ', len(most_active_users))
    return most_active_users

### Save results

In [6]:
import json
import time
from pathlib import Path 

def save_results(users_activity_counts, dset):
    """Write the activity counters and the scraped dataset to ``results/``.

    Both files share one timestamp (``YYYYmmdd-HHMMSS``) in their names: the
    counters go to ``users_activity_<timestamp>.json`` and the dataset to
    ``<timestamp>.csv`` (``;``-separated, without the index column).

    Args:
        users_activity_counts: JSON-serialisable mapping of activity counters.
        dset: Object exposing ``to_csv`` (a pandas DataFrame in this notebook).
    """
    # Make sure the output directory exists before any file is opened.
    Path("results").mkdir(parents=True, exist_ok=True)
    current_date = time.strftime("%Y%m%d-%H%M%S")

    serialized_counts = json.dumps(users_activity_counts, indent=4)
    with open(f'results/users_activity_{current_date}.json', 'w') as out_file:
        out_file.write(serialized_counts)

    dset.to_csv(f'results/{current_date}.csv', sep=';', index=False)

### Example

Running the example below may take a while: it issues one extra HTTP request for every entry that has comments.

In [7]:
def do_scrapping(keyword_groups, keywords, min_page, max_page, max_date=datetime(2022, 1, 30), with_comments=True):
    """Scrape every keyword group and gather per-user activity counters.

    Args:
        keyword_groups: Iterable of ``(keyword, search_query)`` pairs, each
            forwarded to ``get_dataset``.
        keywords: Not used by this function; kept so existing calls keep
            working.
        min_page: First search-result page to request for each group.
        max_page: Last search-result page to request (inclusive).
        max_date: Cut-off ``datetime`` forwarded to ``get_dataset``.
        with_comments: Whether comments are scraped as well.

    Returns:
        Tuple ``(datasets, users_activity_counts)``: a list with one
        DataFrame per keyword group, and the nested
        ``user -> keyword -> count`` defaultdict filled while scraping.
    """
    # Nested counters default to 0 so callers can increment without set-up.
    users_activity_counts = collections.defaultdict(lambda: collections.defaultdict(int))
    datasets = []

    for keyword_group in keyword_groups:
        # Use the arguments of this call rather than the notebook-level
        # MIN_PAGE / MAX_PAGE constants, so the function honours its inputs.
        dataset = get_dataset(
            keyword_group,
            min_page=min_page,
            max_page=max_page,
            max_date=max_date,
            users_activity_counts=users_activity_counts,
            with_comments=with_comments,
        )
        datasets.append(dataset)

    return datasets, users_activity_counts

# --- Configuration -----------------------------------------------------
# Each group is a (keyword, search query) pair; the keyword labels the rows
# and the activity counters, the query is what gets sent to the search.
keyword_groups = [
    ('lewandowski', '#mecz lewandowski lewandowskiego lewy lewego robercik robercika robert'),
]
keywords = ['lewandowski']
MIN_PAGE = 1
MAX_PAGE = 5
MIN_ACTIVITY_PER_KEYWORD = 0
MIN_ACTIVITY_SUM = 5

# --- Scrape, then keep only sufficiently active users ------------------
results, users_activity_counts = do_scrapping(keyword_groups, keywords, MIN_PAGE, MAX_PAGE)
accepted_users = get_most_active_users(
    users_activity_counts,
    keywords,
    min_activity_per_keyword=MIN_ACTIVITY_PER_KEYWORD,
    min_activity_sum=MIN_ACTIVITY_SUM,
)

final_results = []
for result in results:
    # Drop rows written by users that did not pass the activity filter.
    result = result.loc[result.author.isin(accepted_users)]
    final_results.append(result)

# Merge the per-keyword frames, remove exact duplicate rows and persist.
result_dset = pd.concat(final_results).drop_duplicates()
save_results(users_activity_counts, result_dset)

  dataset = pd.DataFrame(transformed_response)
