# Imports

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from helium import *

import multiprocessing
from multiprocessing import Process
from pprint import pprint
import regex as re
import pickle
import os
import json
import time
import html

# Global Variables & Functions

In [3]:
os.listdir('dataset/')

['0. normal',
 '1. fmemes',
 '2. ememes',
 '3. eSocialMedia',
 '4. fFbPosts',
 '5. fTwtrPosts',
 '6. fTxtMssgs',
 '7. eGreetingAndMisc',
 'allFbPagesEmemes.pickle',
 'ememesLinksfbPage0.pickle',
 'ememesLinksfbPage1.pickle',
 'ememesLinksfbPage10.pickle',
 'ememesLinksfbPage11.pickle',
 'ememesLinksfbPage2.pickle',
 'ememesLinksfbPage3.pickle',
 'ememesLinksfbPage4.pickle',
 'ememesLinksfbPage5.pickle',
 'ememesLinksfbPage6.pickle',
 'ememesLinksfbPage7.pickle',
 'ememesLinksfbPage8.pickle',
 'ememesLinksfbPage9.pickle',
 'ememesLinksList.pickle',
 'fbPagesUsed.pickle',
 'imgTypeToNum.pickle',
 'memegenerator README.md',
 'memegenerator.csv']

In [9]:
print(re.match('a', 'a'))

<regex.Match object; span=(0, 1), match='a'>


In [6]:
dataDir = 'dataset/'
imgTypes = list(filter(lambda v: re.match('\d{1,3}\. ', v), os.listdir(dataDir)))
imgTypes

['0. normal',
 '1. fmemes',
 '2. ememes',
 '3. eSocialMedia',
 '4. fFbPosts',
 '5. fTwtrPosts',
 '6. fTxtMssgs',
 '7. eGreetingAndMisc']

In [3]:
client_id = 'your-data-here'
client_secret = 'your-data-here'
#to-do: DELETE THESE STRINGS

In [11]:
def pklSave(contentToBeSaved, fullPath):
    with open(fullPath, 'wb') as f:
        pickle.dump(contentToBeSaved, f)

def pklLoad(fullPath):
    with open(fullPath, 'rb') as f:
        contentToBeLoaded = pickle.load(f)
    return contentToBeLoaded

## Setting up images and Labels' dictionaries

In [10]:
imgTypeToNum = {}
for  i, imgType in enumerate(imgTypes):
    imgTypeToNum[imgType] = i
imgTypeToNum

{'0. normal': 0,
 '1. fmemes': 1,
 '2. ememes': 2,
 '3. eSocialMedia': 3,
 '4. fFbPosts': 4,
 '5. fTwtrPosts': 5,
 '6. fTxtMssgs': 6,
 '7. eGreetingAndMisc': 7}

In [7]:
pklSave(imgTypeToNum, dataDir+"imgTypeToNum.pickle") # to be later used in another .ipynb file

# Scraping Foreign Memes

## Scraping from [Kaggle's Meme Generator Dataset](https://www.kaggle.com/datasets/electron0zero/memegenerator-dataset)

### Getting memes' URLs

In [8]:
fmemesCsvLinks = pd.read_csv("dataset/memegenerator.csv", usecols=["Meme ID", "Base Meme Name"]).squeeze() # convert to Series
fmemesCsvLinks.head()

Unnamed: 0,Meme ID,Base Meme Name
0,10509464,Spiderman Approves
1,12285257,Alright Then Business Kid
2,20612245,Archer
3,20614628,Futurama Fry
4,24194267,One Does Not Simply


Upon inspection, we want to get the original template of memes, not its varientswe'll find that to fetch a page of only the meme image, the URL has to be in this format <br>
`https://memegenerator.net/img/instances/XX.jpg` <br>
Where `XX` is the unique id of the image on [Meme Generator](https://memegenerator.net/) <br>
Thus, we will use the `Meme ID` column

In [9]:
fmemesIds = fmemesCsvLinks['Meme ID']
fmemesIds.head()

0    10509464
1    12285257
2    20612245
3    20614628
4    24194267
Name: Meme ID, dtype: object

In [10]:
pd.set_option('max_colwidth', 70)
fmemesLinks = 'https://memegenerator.net/img/instances/' + fmemesIds + '.jpg'
fmemesLinks.head()

0    https://memegenerator.net/img/instances/10509464.jpg
1    https://memegenerator.net/img/instances/12285257.jpg
2    https://memegenerator.net/img/instances/20612245.jpg
3    https://memegenerator.net/img/instances/20614628.jpg
4    https://memegenerator.net/img/instances/24194267.jpg
Name: Meme ID, dtype: object

In [11]:
pd.set_option('max_colwidth', 40)

### Downloading images

In [12]:
import download_imgs
def getAsyncImgFunctions(imgsLinks, imgType, interval, offset=0):
    fns = []
    i = 0
    check = True
    while i < len(imgsLinks):
        fns.append((download_imgs.downloadImgs, (imgType, imgsLinks, i, i+interval, offset)))
        i += interval
    return fns

In [13]:
fns = getAsyncImgFunctions(fmemesLinks, imgTypes[1], 2000)

In [14]:
# Using runInParallel() below, we will run xx functions parallely to fetch data
print(len(fns))

29


In [15]:
def runInParallel(*fns):
  proc = []
  for fn in fns:
    p = Process(target=fn[0], args=(fn[1]))
    p.start()
    proc.append(p)
  for p in proc:
    p.join()

In [16]:
#if __name__ == '__main__':
    #runInParallel(*fns)

## Scraping Egyptian Memes

### From Imgur using this [Facebook post]()

In [17]:
fbPostContent = "Templates HQ - المنتدى ( ألبومات تمبلتس _ مستلزمات الكوميك _ شروحات _ افضل المواقع _ مستلزمات التصميم _ ايموشنز _ وشوش ) : plus.google.com/u/0/communities/108675057016512986137 ____ ألبومات تمبلتس :(تمبلتس | مقصوصه ، افلام ، مسلسلات ، مشاهد ، برامج ) ___________ .......... محمد هنيدى .......... رمضان مبروك ابو العالمين حمودة | imgur.com/a/HxfPF اسماعيلية رايح جى | imgur.com/a/i5Msi فول الصين العظيم | imgur.com/a/C8OLI جائنا البيان التالى | imgur.com/a/r4X66 صاحب صاحبه | imgur.com/a/ilzyz يا انا يا خالتى | imgur.com/a/sTx1q امير البحار | imgur.com/a/mRYs0 وش اجرام | imgur.com/a/lhDIg .......... محمد سعد .......... اللي بالي بالك | imgur.com/a/mMJkA اللمبى‏ | imgur.com/a/kNwD8 بوحه | imgur.com/a/nqdmP .......... عادل امام .......... عريس من جهة امنيه | imgur.com/a/h4pA7 السفارة فى العمارة | imgur.com/a/Y519O سلام يا صاحبى | imgur.com/a/XsucN زهايمر | imgur.com/a/iLUE0 .......... احمد مكى .......... لا تراجع ولا استسلام | imgur.com/a/Ri6QP الكبير اوى | imgur.com/a/fUQNf طير انت | imgur.com/a/NPUrK .......... شيكو و هشام ماجد و أحمد فهمي .......... سمير وشهير وبهير | imgur.com/a/UeRnH حملة فريزر | imgur.com/a/bKB5D بنات العم | imgur.com/a/wMQTL .......... كريم عبد العزيز .......... حرامية في تايلاند‏ | imgur.com/a/MxxS6 فى محطة مصر | imgur.com/a/xxLOL الباشا تلميذ | imgur.com/a/VS6Ui ابو على | imgur.com/a/GdBOz .......... احمد رزق و احمد عيد .......... اوعى وشك | imgur.com/a/tw0Zk فيلم ثقافى‏ | imgur.com/a/AIgjT .......... احمد ادم.......... الرجل الابيض المتوسط | imgur.com/a/OKOME معلش إحنا بنتبهدل | imgur.com/a/VFUyL .......... احمد حلمى .......... عسل اسود | imgur.com/a/mXJkz على جثتى | imgur.com/a/1xzZT كده رضا | imgur.com/a/dxfdi .......... حماده هلال .......... الهرم الرابع | imgur.com/a/wAhJp عيال حبيبة |imgur.com/a/QpP7R غبي منه فيه | imgur.com/a/hYvug مقلب حرامية | imgur.com/a/bbTvu هى فوضى | imgur.com/a/PMlpR تيمور وشفيقه | imgur.com/a/qjcBf طباخ الريس | imgur.com/a/vIzBa كابتن هيما | imgur.com/a/X3abp حلم العمر | imgur.com/a/h1w4F الناظر | imgur.com/a/UQWH7 ايظن | imgur.com/a/G4q1vA2 Euc | imgur.com/a/wM1rrTE ..... تمبلتس باسم يوسف | imgur.com/a/QoM3E تمبلتس افلام اجنبى | imgur.com/a/m2anE ـــــــــــــــــــــــــــــــــــــــ البوم الكيف (مشهد العزاء) | imgur.com/a/ysymZ البوم عايز حقى (مشهد هواء هواء) | imgur.com/a/rBvFC البوم مرتضى (انا معايا دورى وصوير و 4 كاس) | imgur.com/a/Mf9OK البوم لخمة راس (هو انت ياد مش بتتكيف غير لم تضرب)| imgur.com/a/5v0GK البوم ابو العربى ( مشهد الكباريه ) | imgur.com/a/Ga7kGAY البوم الناظر (اوعى يغرك جسمك) | imgur.com/a/hwjFu ألبوم لن اعيش فى جلباب ابى | imgur.com/a/K2bosbc ـــــــــــــــــــــــــــــــــــــــ تمبلتس اعلانات ........ اتصالات محمد رمضان 2017 | imgur.com/a/Pxuv9 فودافون 2017 | imgur.com/a/BaCF8 حديد الجارحى | imgur.com/a/4uLwR كارير | imgur.com/a/pxapm بوادى | imgur.com/a/QdWKQ ـــــــــــــــــــــــــــــــــــــــ تمبلتس مقصوصة | imgur.com/a/y1bDn ــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــ ⛔ كل الألبومات بنعدلها كل يوم وبنضيف عليها كل التمبلتس الجديده | Facebook"
ememesHashes = re.findall("imgur.com/a/([A-Za-z]{5})", fbPostContent)
print(ememesHashes)

['HxfPF', 'ilzyz', 'lhDIg', 'mMJkA', 'nqdmP', 'XsucN', 'fUQNf', 'NPUrK', 'UeRnH', 'wMQTL', 'xxLOL', 'GdBOz', 'AIgjT', 'OKOME', 'VFUyL', 'mXJkz', 'dxfdi', 'wAhJp', 'hYvug', 'bbTvu', 'PMlpR', 'qjcBf', 'vIzBa', 'ysymZ', 'rBvFC', 'hwjFu', 'pxapm', 'QdWKQ']


In [18]:
headers = {
    "Content-Type": "text",
    'Authorization' : "Client-ID " + client_id
}

In [19]:
jsn = json.loads(requests.get('https://api.imgur.com/3/album/i5Msi/images', headers=headers).text)
pprint(jsn)

{'data': [{'account_id': None,
           'account_url': None,
           'ad_type': 0,
           'ad_url': '',
           'animated': False,
           'bandwidth': 149073419,
           'datetime': 1501922268,
           'description': None,
           'edited': '0',
           'favorite': False,
           'has_sound': False,
           'height': 652,
           'id': 'xnRrDTu',
           'in_gallery': False,
           'in_most_viral': False,
           'is_ad': False,
           'link': 'https://i.imgur.com/xnRrDTu.png',
           'nsfw': None,
           'section': None,
           'size': 508783,
           'tags': [],
           'title': None,
           'type': 'image/png',
           'views': 293,
           'vote': None,
           'width': 580},
          {'account_id': None,
           'account_url': None,
           'ad_type': 0,
           'ad_url': '',
           'animated': False,
           'bandwidth': 139489860,
           'datetime': 1501922271,
           'desc

In [20]:
ememesLinks = []
def getImgurLinks():
    for hash in ememesHashes:
        imgsDetailsJson = json.loads(requests.get(f'https://api.imgur.com/3/album/{hash}/images', headers=headers).text)
        for img in imgsDetailsJson['data']:
            ememesLinks.append(img['link'])

In [21]:
#getImgurLinks()

In [22]:
#pklSave(ememesLinks, dataDir+"ememesLinksList.pickle") # store the links, in order not to keep accessing imgur's API each time we run the cells

In [23]:
ememesLinks = pklLoad(dataDir+"ememesLinksList.pickle")

In [24]:
#fns = getAsyncFunctions(ememesLinks, imgTypes[2], len(ememesLinks)//50) #last argument means this: if #links to download are 10,000 then there will be 50 functions, each will take from link i to link i + 10,000/50 (which is 200)  
#if __name__ == '__main__':
    #runInParallel(*fns)

### From Facebook's public meme pages

In [25]:
# browser = start_chrome(headless=False)
# browser.get("https://www.facebook.com/%D9%85%D9%8A%D9%85%D8%B2-%D9%85%D8%B4-%D9%87%D9%8A%D9%81%D9%87%D9%85%D9%87%D8%A7-%D8%A7%D9%84%D9%86%D9%88%D8%B1%D9%85%D9%8A%D8%B2-%D8%B9%D8%B4%D8%A7%D9%86-%D9%86%D9%88%D8%B1%D9%85%D9%8A%D8%B2-833501407023909/")
# time.sleep(1)
# press(PAGE_DOWN)
# press(PAGE_DOWN)
# time.sleep(1)
# try:
#     click('Close')
# except Exception as e:
#     print(e)

In [61]:
def scrollTillEnd(heliumBrowser):
    heliumBrowser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    lenOfPage = heliumBrowser.execute_script("var lenOfPage=document.body.scrollHeight; return lenOfPage;")
    match=False
    while(match==False):
        lastCount = lenOfPage
        time.sleep(3)
        heliumBrowser.execute_script("window.scrollTo(0, document.body.scrollHeight); window.scrollBy(0,-200);")
        lenOfPage = heliumBrowser.execute_script("var lenOfPage=document.body.scrollHeight; return lenOfPage;")
        if lastCount==lenOfPage:
            match=True

In [62]:
def fbPageName(htmlText):
    headerOfPageTitle = re.search('<h[12] .+?(?=/h[12]>)', htmlText)
    if (headerOfPageTitle is None):
        return ""
    headerOfPageTitle = headerOfPageTitle.group() #group() to get the entired matched string out of the regex "match" object
    spanText = re.search('<span>(.*)</span>', headerOfPageTitle)
    if (spanText is not None):
        pageName = spanText.group(1).strip() # group(1) to return what is between parentheses
    else:
        pageName = re.search('>(.*)<', headerOfPageTitle).group(1).strip()
    return pageName

In [63]:
def fbExtractImgsLinks(htmlText):
    try:
        htmlStartingFromAllPhotos = re.findall(r'>(?:All )?[pP]hotos.*', htmlText)[0] # Regex to get pages starting from "All photos" or "photos", but it has to be preceded by ">", in order not to match "photos" in random URLs
    except Exception as e:
        print(e)
        return []
    imgLinks = re.findall(r'https://scontent[^"]+', htmlStartingFromAllPhotos)
    imgLinks = [html.unescape(link) for link in imgLinks][1:] #avoiding first image, as it might be the page's logo
    return imgLinks

def fbExtractImgsLinksOld(heliumBrowser): # didn't use this, as regex is faster than selenium methods 
    imgTags = heliumBrowser.find_elements_by_tag_name('img')
    imgLinks = []
    for link in imgTags:
        if link.get_attribute('src') is not None:
            imgLinks.append(link.get_attribute('src'))
    imgLinks = [html.unescape(link) for link in imgLinks][1:] #avoiding first image, as it might be the page's logo
    return imgLinks

In [64]:
allEmemesLinks = []
prevObtainedPages = {}

In [68]:
def fbPageImgScraper(pagesLinks, heliumBrowser):
    for i in range(len(pagesLinks)):
        heliumBrowser.get(pagesLinks[i])
        pageName = fbPageName(heliumBrowser.page_source)
        if pageName == "":
            print("page not found...")
            continue
        if (pageName in prevObtainedPages):
            print("This page has already been scraped")
            continue
        if i == 0:
            print("Scrolling page:", pageName)
            print("Finished, scraping next link...", end='\n\n')
            continue
        time.sleep(1)
        press(PAGE_DOWN)
        press(PAGE_DOWN)
        time.sleep(1)
        try:
            click('Close')
        except Exception as e:
            print("pop-up didn't  show, moving on...")
        print("Scrolling page:", pageName)
        scrollTillEnd(heliumBrowser)
        imgLinks = fbExtractImgsLinks(heliumBrowser.page_source)
        allEmemesLinks.extend(imgLinks) #extend() expands the elements of an iterable
        pklSave(imgLinks, f"{dataDir}ememesLinksfbPage{i}.pickle")
        prevObtainedPages[pageName] = (i, pagesLinks[i])
        if i == len(pagesLinks)-1:
            print("Finished scraping pages!")
        else:
            print("Finished, scraping next link...", end='\n\n')
    pklSave(allEmemesLinks, f"{dataDir}allFbPagesEmemes.pickle")
    pklSave(prevObtainedPages, f"{dataDir}fbPagesUsed.pickle")

In [69]:
fbLinks = [
    "https://www.facebook.com/%D9%85%D9%8A%D9%85%D8%B2-%D9%84%D8%A7-%D9%81%D8%A7%D8%A6%D8%AF%D8%A9-%D9%85%D9%86%D9%87%D8%A7-%D9%85%D8%AB%D9%84-%D8%AD%D9%8A%D8%A7%D8%AA%D9%83-290489111460684/photos",
    "https://www.facebook.com/MemesYard/photos/?ref=page_internal",
    "https://www.facebook.com/memes.Stolen.1.0/photos/?ref=page_internal",

    "https://www.facebook.com/%D8%A8%D9%86%D8%B3%D8%B1%D9%82-%D9%85%D9%8A%D9%85%D8%B2-%D9%88%D9%83%D9%88%D9%85%D9%8A%D9%83-%D8%B9%D8%B4%D8%A7%D9%86-%D9%85%D8%B4-%D8%A8%D9%86%D8%B9%D8%B1%D9%81-%D9%86%D8%B9%D9%85%D9%84-1735677470065180/photos/?ref=page_internal",
    "https://www.facebook.com/profile.php?id=100064850389099&sk=photos",
    "https://www.facebook.com/%D9%85%D9%8A%D9%85%D8%B2-%D9%84%D9%88%D8%B1%D8%AF-%D9%82%D9%85%D8%AF-100182408618014/photos/?ref=page_internal",
    
    "https://www.facebook.com/%D9%85%D9%8A%D9%85%D8%B2-%D9%85%D8%B5%D8%B1%D9%8A%D9%87-101966151287716/photos/?ref=page_internal",
    "https://www.facebook.com/%D9%85%D9%8A%D9%85%D8%B2-%D9%85%D8%B4-%D9%87%D9%8A%D9%81%D9%87%D9%85%D9%87%D8%A7-%D8%A7%D9%84%D9%86%D9%88%D8%B1%D9%85%D9%8A%D8%B2-%D8%B9%D8%B4%D8%A7%D9%86-%D9%86%D9%88%D8%B1%D9%85%D9%8A%D8%B2-833501407023909/photos/?ref=page_internal",
    "https://www.facebook.com/arabicclassicalartmemes/photos/?ref=page_internal",
    
    "https://web.facebook.com/True.Memes.Comics/photos/?ref=page_internal",
    "https://web.facebook.com/societyforsarcasm/photos/?ref=page_internal",
    "https://web.facebook.com/memes.officil/photos",
]

In [70]:
#browser = start_chrome(headless=False)
#fbPageImgScraper(fbLinks, browser)

Scrolling page: ميمز لا فائدة منها مثل حياتك
Finished, scraping next link...

Scrolling page: ميمز
Finished, scraping next link...

Scrolling page: ميمز مسروقه بس عظمه يسطا
Finished, scraping next link...

pop-up didn't  show, moving on...
Scrolling page: بنسرق ميمز وكوميك عشان مش بنعرف نعمل.
Finished, scraping next link...

pop-up didn't  show, moving on...
Scrolling page: مصنع ميمز
Finished, scraping next link...

Scrolling page: ميمز لورد قمد
Finished, scraping next link...

Scrolling page: ميمز مصريه
Finished, scraping next link...

Scrolling page: ميمز مش هيفهمها النورميز عشان نورميز
Finished, scraping next link...

Scrolling page: كلاسيكال ارت ميمز
Finished, scraping next link...

Scrolling page: ميمز و كوميكس طازة معبرة و ترو
Finished, scraping next link...

Scrolling page: Society Sarcasm
Finished, scraping next link...

pop-up didn't  show, moving on...
Scrolling page: Memes
Finished scraping pages!


In [76]:
#pklSave(allEmemesLinks, f"{dataDir}allFbPagesEmemes.pickle")
#pklSave(prevObtainedPages, f"{dataDir}fbPagesUsed.pickle")

In [78]:
allEmemesLinks = pklLoad(f"{dataDir}allFbPagesEmemes.pickle")
prevObtainedPages = pklLoad(f"{dataDir}fbPagesUsed.pickle")

In [79]:
print(len(allEmemesLinks))
print(len(prevObtainedPages))

15749
12


In [None]:
prevFiles = os.listdir
if len(prevFiles) > 0:
    lastFileName = prevFiles[-1]
    dotLoc = lastFileName.find('.')
    offset = int(lastFileName[dotLoc-7:dotLoc])+1
#fns = getAsyncFunctions(allEmemesLinks, imgTypes[2], len(allEmemesLinks)//50, offset) #2nd to last argument means this: if #links to download are 10,000 then there will be 50 functions, each will take from link i to link i + 10,000/50 (which is 200)  
#if __name__ == '__main__':
#    runInParallel(*fns)

In [None]:
# to-do: 
# document above logic
# change labels and folder structure to the following:
# 3. eSocialMedia, 4. fFbPosts, 5. fTwtrPosts, 6. fTxtMssgs
# scrape links of these new folders; one by one