# Parsing мемов

## Загрузка

In [1]:
import requests
import re
import pandas as pd
import time
import sys
import tqdm.notebook

import socks
import socket

from bs4 import BeautifulSoup
from fake_useragent import UserAgent

In [2]:
page_link = 'http://knowyourmeme.com/memes/all/page/1'

In [3]:
response = requests.get(page_link)
response

<Response [403]>

In [4]:
for key, value in response.request.headers.items():
    print(key+": "+value)

User-Agent: python-requests/2.28.1
Accept-Encoding: gzip, deflate, br
Accept: */*
Connection: keep-alive


In [5]:
response = requests.get(page_link, headers={'User-Agent': UserAgent().chrome})
response

<Response [200]>

In [6]:
html = response.content

## Обработка

Передадим функции `BeautifulSoup` текст веб-страницы.

In [7]:
soup = BeautifulSoup(html,'html.parser') 

In [8]:
soup.html.head.title.text

'All Entries | Know Your Meme'

In [9]:
# тег а с атрибутом фото
obj = soup.find('a', attrs = {'class':'photo'}) 
obj

<a class="photo left" href="/memes/padoru" target="_self"><img alt="'Tis the Season for &quot;Padoru&quot;" fetchpriority="low" height="112" src="https://i.kym-cdn.com/featured_items/icons/wide/000/017/730/maxresdefault.jpg" title="You Cannot Escape The Holiday Spirit: Here's A Classic Christmas Meme" width="198"/> <div class="info abs"> <div class="c"> You Cannot Escape The Holiday Spirit: Here's A Classic Christmas Meme </div> </div> </a>

In [10]:
obj = soup.find(lambda tag: tag.name == 'a' and tag.get('class') == ['photo'])
obj

<a class="photo" href="/memes/adarsh-balak-%E0%A4%86%E0%A4%A6%E0%A4%B0%E0%A5%8D%E0%A4%B6-%E0%A4%AC%E0%A4%BE%E0%A4%B2%E0%A4%95"><img alt="Adarsh Balak / आदर्श बालक" data-src="https://i.kym-cdn.com/entries/icons/medium/000/043/020/adarsh.jpg" src="https://s.kym-cdn.com/assets/blank-b3f96f160b75b1b49b426754ba188fe8.gif" title="Adarsh Balak / आदर्श बालक"/> <div class="entry-labels"> <span class="label label-submission"> Submission </span> </div> </a>

In [11]:
obj.attrs['href']

'/memes/adarsh-balak-%E0%A4%86%E0%A4%A6%E0%A4%B0%E0%A5%8D%E0%A4%B6-%E0%A4%AC%E0%A4%BE%E0%A4%B2%E0%A4%95'

In [12]:
print("Тип данных до вытаскивания ссылки:", type(obj))
print("Тип данных после вытаскивания ссылки:", type(obj.attrs['href']))

Тип данных до вытаскивания ссылки: <class 'bs4.element.Tag'>
Тип данных после вытаскивания ссылки: <class 'str'>


In [13]:
meme_links = soup.findAll(lambda tag: tag.name == 'a' and tag.get('class') == ['photo'])
meme_links = [link.attrs['href'] for link in meme_links]
meme_links

['/memes/adarsh-balak-%E0%A4%86%E0%A4%A6%E0%A4%B0%E0%A5%8D%E0%A4%B6-%E0%A4%AC%E0%A4%BE%E0%A4%B2%E0%A4%95',
 '/memes/the-shoe-theory',
 '/memes/she-was-a-x-girl-he-was-a-x-boy',
 '/memes/que-mira-bobo-anda-palla-bobo',
 '/memes/poor-ronaldo-girl',
 '/memes/captain-tsubasa-album-cover-art-parodies',
 '/memes/moore-public-school-teacher-mr-garrison-snapchat-messages',
 '/memes/events/medical-assistance-in-dying-maid',
 '/memes/people/cellbit',
 '/memes/hansel-getting-photographed-riding-scooter-zoolander',
 '/memes/events/emory-nurses-ick-challenge-tiktok-controversy',
 '/memes/subcultures/bluey-tv-series',
 '/memes/events/ludwig-chessboxing-2022',
 '/memes/people/temp6t',
 '/memes/sites/memechat',
 '/memes/touch-of-the-tism']

## Выгрузка

Создадим функцию.

In [14]:
def getPageLinks(page_number):
    """
        Возвращает список ссылок на мемы, полученный с текущей страницы
        
        page_number: int/string
        номер страницы для парсинга.
            
    """
    # составляем ссылку на страницу поиска
    page_link = 'http://knowyourmeme.com/memes/all/page/{}'.format(page_number)
    
    # запрашиваем данные
    response = requests.get(page_link, headers={'User-Agent': UserAgent().chrome})
    
    if not response.ok:
        # если сервер нам отказал, вернем пустой лист для текущей страницы
        return [] 
    
    # получаем содержимое страницы и переводим в суп
    html = response.content
    soup = BeautifulSoup(html,'html.parser')
    
    # ищем ссылки на мемы и очищаем их от ненужных тэгов
    meme_links = soup.findAll(lambda tag: tag.name == 'a' and tag.get('class') == ['photo'])
    meme_links = ['http://knowyourmeme.com' + link.attrs['href'] for link in meme_links]
    
    return meme_links

Протестируем функцию.

In [15]:
meme_links = getPageLinks(1)
meme_links[:2]

['http://knowyourmeme.com/memes/adarsh-balak-%E0%A4%86%E0%A4%A6%E0%A4%B0%E0%A5%8D%E0%A4%B6-%E0%A4%AC%E0%A4%BE%E0%A4%B2%E0%A4%95',
 'http://knowyourmeme.com/memes/the-shoe-theory']

Выгрузка дополнительной информации на примере мема `doge`.

In [16]:
meme_page = 'http://knowyourmeme.com/memes/doge'

response = requests.get(meme_page, headers={'User-Agent': UserAgent().chrome})

html = response.content
soup = BeautifulSoup(html,'html.parser')

Посмотрим, как можно вытащить статистику просмотров, комментариев, а также числа загруженных видео и фото, связанных с нашим мемом.

In [17]:
views = soup.find('dd', attrs={'class':'views'})
views

<dd class="views" title="14,000,576 Views">
<a href="/memes/doge" rel="nofollow">14,000,576</a>
</dd>

In [18]:
views = views.find('a').text
views

'14,000,576'

In [19]:
views = int(views.replace(',', ''))
views

14000576

In [20]:
def getStats(soup, stats):
    """
        Возвращает очищенное число просмотров/коментариев/...
        
        soup: объект bs4.BeautifulSoup 
            суп текущей страницы
            
        stats: string
            views/videos/photos/comments
            
    """
    try:
        obj = soup.find('dd', attrs={'class':stats})
        obj = obj.find('a').text
        obj = int(obj.replace(',', ''))
    except:
        obj=None
    
    return obj

In [21]:
views = getStats(soup, stats='views')
videos = getStats(soup, stats='videos')
photos = getStats(soup, stats='photos')
comments = getStats(soup, stats='comments')

print("Просмотры: {}\nВидео: {}\nФото: {}\nКомментарии: {}".format(views, videos, photos, comments))

Просмотры: 14000576
Видео: 104
Фото: 1792
Комментарии: 924


Достанем дату и время обновления и создания мема.

In [22]:
date = soup.find('abbr', attrs={'class':'timeago'}).attrs['title']
date

'2021-05-25T15:42:58-04:00'

In [23]:
date = soup.find(text='\nAdded\n').parent.find_next().attrs['title']
date

'2013-07-24T16:29:55-04:00'

Общая функция.

In [24]:
def getProperties(soup):
    """
        Возвращает список (tuple) с названием, статусом, типом, 
        годом и местом происхождения и тэгами
        
        soup: объект bs4.BeautifulSoup 
        суп текущей страницы.
    
    """
    # название - идёт с самым большим заголовком h1
    meme_name = soup.find('section', attrs={'class':'info'}).find('h1').text.strip()
    
    # достаём все данные справа от картинки 
    properties = soup.find('aside', attrs={'class':'left'})
    
    # статус 
    meme_status = properties.find("dd")
    meme_status = "" if not meme_status else meme_status.text.strip()
    

    meme_type = properties.find('a', attrs={'class':'entry-type-link'})
    meme_type = "" if not meme_type else meme_type.text 
    

    meme_origin_year = properties.find(text='\nYear\n')
    meme_origin_year = "" if not meme_origin_year else meme_origin_year.parent.find_next()
    meme_origin_year = meme_origin_year.text.strip()
    
    # первоисточник
    meme_origin_place = properties.find('dd', attrs={'class':'entry_origin_link'})
    meme_origin_place = "" if not meme_origin_place else meme_origin_place.text.strip()
    
    # тэги, связанные с мемом
    meme_tags = properties.find('dl', attrs={'id':'entry_tags'}).find('dd')
    meme_tags = "" if not meme_tags else meme_tags.text.strip()
    
    return meme_name, meme_status, meme_type, meme_origin_year, meme_origin_place, meme_tags

In [25]:
getProperties(soup)

('Doge',
 'Confirmed',
 'Animal',
 '2010',
 'Tumblr',
 'animal, dog, shiba inu, shibe, such doge, super shibe, japanese, super, tumblr, much, very, many, comic sans, photoshop meme, such, shiba, shibe doge, doges, dogges, reddit, comic sans ms, tumblr meme, hacked, bitcoin, dogecoin, shitposting, stare, canine')

Текстовое описание мема.

In [26]:
def getText(soup):
    """
        Возвращает текстовые описания мема
        
        soup: объект bs4.BeautifulSoup 
        суп текущей страницы.
            
    """
    
    # достаём все тексты под картинкой
    body = soup.find('section', attrs={'class':'bodycopy'})
    
    # раздел about (если он есть), должен идти первым
    meme_about = body.find('p')
    meme_about = "" if not meme_about else meme_about.text
    
    # раздел origin можно найти после заголовка Origin или History, 

    meme_origin = body.find(text='Origin') or body.find(text='History')
    meme_origin = "" if not meme_origin else meme_origin.parent.find_next().text
    
    # весь остальной текст (если он есть) можно добавить в одно текстовое поле
    if body.text:
        other_text = body.text.strip().split('\n')[4:]
        other_text = " ".join(other_text).strip()
    else:
        other_text = ""
        
    return meme_about, meme_origin, other_text

In [27]:
meme_about, meme_origin, other_text = getText(soup)

print("О чем мем:\n{}\n\nПроисхождение:\n{}\n\nОстальной текст:\n{}...\n"\
      .format(meme_about, meme_origin, other_text[:200]))

О чем мем:
Doge (pronounced /ˈdoʊdʒ/ DOHJ) is a slang term for "dog" that is primarily associated with pictures of Shiba Inus (nicknamed "Shibe") and internal monologue captions on Tumblr. These photos may be photoshopped to change the dog's face or captioned with interior monologues in Comic Sans font. Starting in 2017, Ironic Doge formats gained prevalence over the original wholesome version.

Происхождение:
The use of the misspelled word "doge" to refer to a dog dates back to June 24th, 2005, when it was mentioned in an episode of Homestar Runner's puppet show. In the episode titled "Biz Cas Fri 1"[2], Homestar calls Strong Bad his "d-o-g-e" while trying to distract him from his work.

Остальной текст:
The use of the misspelled word "doge" to refer to a dog dates back to June 24th, 2005, when it was mentioned in an episode of Homestar Runner's puppet show. In the episode titled "Biz Cas Fri 1"[2], H...



Вся информация по текущему мему.

In [28]:
def getMemeData(meme_page):
    """
        Запрашивает данные по странице, возвращает обработанный словарь с данными
        
        meme_page: string
        ссылка на страницу с мемом.
    
    """
    
    # запрашиваем данные по ссылке
    response = requests.get(meme_page, headers={'User-Agent': UserAgent().chrome})
    
    if not response.ok:
        # если сервер нам отказал, вернем статус ошибки 
        return response.status_code
    
    # получаем содержимое страницы и переводим в суп
    html = response.content
    soup = BeautifulSoup(html,'html.parser')

    # используя ранее написанные функции парсим информацию
    views = getStats(soup=soup, stats='views')
    videos = getStats(soup=soup, stats='videos')
    photos = getStats(soup=soup, stats='photos')
    comments = getStats(soup=soup, stats='comments')

    # дата создания
    date = soup.find(text='\nAdded\n').parent.find_next().attrs['title']

    # имя, статус, и т.д.
    meme_name, meme_status, meme_type, meme_origin_year, meme_origin_place, meme_tags =\
    getProperties(soup=soup)

    # текстовые поля
    meme_about, meme_origin, other_text = getText(soup=soup)

    # составляем словарь, в котором будут хранится все полученные и обработанные данные
    data_row = {"name":meme_name, "status":meme_status, 
                "type":meme_type, "origin_year":meme_origin_year, 
                "origin_place":meme_origin_place,
                "date_added":date, "views":views, 
                "videos":videos, "photos":photos, "comments":comments, "tags":meme_tags,
                "about":meme_about, "origin":meme_origin, "other_text":other_text}

    return data_row

In [29]:
data_row = getMemeData('http://knowyourmeme.com/memes/doge')

Подготовка таблицы.

In [30]:
data_row
final_df = pd.DataFrame([data_row])

In [31]:
final_df

Unnamed: 0,name,status,type,origin_year,origin_place,date_added,views,videos,photos,comments,tags,about,origin,other_text
0,Doge,Confirmed,Animal,2010,Tumblr,2013-07-24T16:29:55-04:00,14000576,104,1792,924,"animal, dog, shiba inu, shibe, such doge, supe...",Doge (pronounced /ˈdoʊdʒ/ DOHJ) is a slang ter...,"The use of the misspelled word ""doge"" to refer...","The use of the misspelled word ""doge"" to refer..."


In [32]:
data = pd.DataFrame()
for meme_link in tqdm.notebook.tqdm(meme_links):
    data_row = getMemeData(meme_link)
    final_df = pd.DataFrame([data_row])
    data = pd.concat([data, final_df], ignore_index=True)       
data.head()

  0%|          | 0/16 [00:00<?, ?it/s]

Unnamed: 0,name,status,type,origin_year,origin_place,date_added,views,videos,photos,comments,tags,about,origin,other_text
0,Adarsh Balak / आदर्श बालक,Submission,Parody,2014,Facebook,2022-12-13T12:10:32-05:00,11,0,6,0,"adarsh balak, comic","Adarsh Balak, also known by its Hindi transcri...",,"On May 3rd, 2014, Adarsh Balak's official Face..."
1,The Shoe Theory,Submission,Lip Dub,2022,TikTok,2022-12-13T12:06:30-05:00,9,6,0,0,"shoes bad luck, shoe gift bad luck, bad omen s...",The Shoe Theory is a superstition that claims ...,The idea of shoes being bad luck as gifts has ...,Shoes are considered to be evil in Chinese cul...
2,She Was A X Girl He Was A X Boy,Submission,Exploitable,2022,Twitter,2022-12-13T11:07:04-05:00,98,0,11,0,"she was a x girl, he was a x boy, she was a sp...","She Was A X Girl, He Was A X Boy is an exploit...","On November 22nd, 2022, Twitter[1] account @YT...","Spread On November 28th, 2022, Instagram[3] pa..."
3,"Qué Mira, Bobo? / Anda Palla Bobo",Submission,Viral Video,2022,Twitter,2022-12-13T09:47:55-05:00,173,0,7,0,"twitter, messi, argentina, qatar world cup, ne...","Qué Mira, Bobo? or Anda Palla Bobo refers to a...","On December 10th, 2022, the Argentine National...",“Que miras bobo”DOBLAJE LATINO JAJAJA pic.twit...
4,Poor Ronaldo Girl,Submission,Catchphrase,Unknown,Unknown,2022-12-13T08:59:38-05:00,316,18,0,0,"poor ronaldo, ronaldo, cristiano ronaldo, girl...",The Poor Ronaldo Girl refers to a viral video ...,"On December 11th, 2022, the TikTok[1] account ...","""Portugal airport is that way. And where's Ron..."


## Изменение ip через TOR

Дальнейший код требует работы Tor Browser.

In [33]:
def checkIP():
    ip = requests.get('http://checkip.dyndns.org').content
    soup = BeautifulSoup(ip, 'html.parser')
    print(soup.find('body').text)

In [34]:
socks.set_default_proxy(socks.SOCKS5, "localhost", 9150)
socket.socket = socks.socksocket

In [35]:
# файл torrc
# настройки
# CircuitBuildTimeout 10
# LearnCircuitBuildTimeout 0
# MaxCircuitDirtiness 10

for i in range(5):
    checkIP()
    time.sleep(10)

Current IP Address: 93.95.226.212
Current IP Address: 75.63.67.35
Current IP Address: 185.14.97.176
Current IP Address: 185.220.101.44
Current IP Address: 185.220.102.248


Финальная выгрузка.

In [36]:
data = pd.DataFrame()
PAGE_START = 1
PAGE_END = 10

for page_number in range(PAGE_START, PAGE_END+1):
    print('номер страницы: ', page_number)    
    for i in range(5):
            meme_links = getPageLinks(page_number)  
            if meme_links:
                break
            else:
                time.sleep(20)
        
    for meme_link in tqdm.notebook.tqdm(meme_links):
        for i in range(5):
            try:
                data_row = getMemeData(meme_link)           
                final_df = pd.DataFrame([data_row])
                data = pd.concat([data, final_df], ignore_index=True)  
                break
            except:
                time.sleep(20)
                continue 
    
    data.to_csv('MEMES_{}_{}.csv'.format(PAGE_START, PAGE_END))

номер страницы:  1


  0%|          | 0/16 [00:00<?, ?it/s]

номер страницы:  2


  0%|          | 0/16 [00:00<?, ?it/s]

номер страницы:  3


  0%|          | 0/16 [00:00<?, ?it/s]

номер страницы:  4


  0%|          | 0/16 [00:00<?, ?it/s]

номер страницы:  5


  0%|          | 0/16 [00:00<?, ?it/s]

номер страницы:  6


  0%|          | 0/16 [00:00<?, ?it/s]

номер страницы:  7


  0%|          | 0/16 [00:00<?, ?it/s]

номер страницы:  8


  0%|          | 0/16 [00:00<?, ?it/s]

номер страницы:  9


  0%|          | 0/16 [00:00<?, ?it/s]

номер страницы:  10


  0%|          | 0/16 [00:00<?, ?it/s]

In [37]:
data

Unnamed: 0,name,status,type,origin_year,origin_place,date_added,views,videos,photos,comments,tags,about,origin,other_text,0
0,Adarsh Balak / आदर्श बालक,Submission,Parody,2014,Facebook,2022-12-13T12:10:32-05:00,13.0,0.0,8.0,0.0,"adarsh balak, comic","Adarsh Balak, also known by its Hindi transcri...",,"On May 3rd, 2014, Adarsh Balak's official Face...",
1,The Shoe Theory,Submission,Lip Dub,2022,TikTok,2022-12-13T12:06:30-05:00,9.0,6.0,0.0,0.0,"shoes bad luck, shoe gift bad luck, bad omen s...",The Shoe Theory is a superstition that claims ...,The idea of shoes being bad luck as gifts has ...,Shoes are considered to be evil in Chinese cul...,
2,She Was A X Girl He Was A X Boy,Submission,Exploitable,2022,Twitter,2022-12-13T11:07:04-05:00,98.0,0.0,11.0,0.0,"she was a x girl, he was a x boy, she was a sp...","She Was A X Girl, He Was A X Boy is an exploit...","On November 22nd, 2022, Twitter[1] account @YT...","Spread On November 28th, 2022, Instagram[3] pa...",
3,"Qué Mira, Bobo? / Anda Palla Bobo",Submission,Viral Video,2022,Twitter,2022-12-13T09:47:55-05:00,173.0,0.0,7.0,0.0,"twitter, messi, argentina, qatar world cup, ne...","Qué Mira, Bobo? or Anda Palla Bobo refers to a...","On December 10th, 2022, the Argentine National...",“Que miras bobo”DOBLAJE LATINO JAJAJA pic.twit...,
4,,,,,,,,,,,,,,,403.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,"Gotō-san, You Have A Huge Cock!",Submission,Catchphrase,2022,Twitter,2022-11-30T06:46:29-05:00,633.0,9.0,0.0,1.0,"twiter, japan, vocal impersonation, bocchi the...","""Gotō-san, You Have A Huge Cock!"" (Japanese: 後...","On November 10th, 2022, a female Twitter user,...",ぼっち・ざ・ろっく！のモノマネ pic.twitter.com/pxRMpjcInF — マ...,
156,Tummy Ache,Submission,Catchphrase,2019,Reddit,2022-11-30T06:17:52-05:00,664.0,0.0,33.0,1.0,"tummy ache, tummy hurts, being brave, i'm gonn...",Tummy Ache or Tummy Hurts refers to a series o...,"Prior to December 22nd, 2019, an unknown Reddi...","On April 27th, 2020, Twitter[2] user @Caucasia...",
157,Cristiano Ronaldo's Goal Celebration (Portugal...,Submission,Competition,2022,Twitter,2022-11-29T21:16:32-05:00,1062.0,1.0,3.0,0.0,"cristiano ronaldo, fifa, qatar world cup, brun...",Cristiano Ronaldo Goal Celebration refers to a...,"On November 28nd, 2022, Portugal beat Uruguay ...",Spread Shortly afterward Ronaldo's celebration...,
158,The Character / The Voice,Submission,Catchphrase,2021,Twitter,2022-11-29T16:51:40-05:00,2941.0,0.0,10.0,4.0,"twitter trend, voice actor challenge, characte...",The Character / The Voice is a quote retweet t...,"On December 30th, 2021, Twitter[1] user @Patri...","Spread On November 28th, 2022, Twitter[4] user...",
