# Imports

In [43]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import pickle
import os

# Setting up Labels

In [44]:
# Map each dataset folder name to its numeric class label.
# "1. imemes" (international) and "2. ememes" (Egyptian) intentionally share
# label 1, because both end up classified under the single "memes" label.
imgTypeToNum = dict([
    ("0. normal", 0),
    ("1. imemes", 1),
    ("2. ememes", 1),
    ("3. text", 2),
])

In [45]:
# Persist the label mapping to disk so that another notebook can load it later.
with open("dataset/imgTypeToNum.pickle", 'wb') as pickleFile:
    pickleFile.write(pickle.dumps(imgTypeToNum))

# Scraping Memes

## Scraping from [Kaggle's Meme Generator Dataset](https://www.kaggle.com/datasets/electron0zero/memegenerator-dataset)

### Getting memes' URLs

In [None]:
# Read only the page-URL column; squeeze() collapses the resulting
# one-column DataFrame into a Series.
memesCsvLinks = (
    pd.read_csv("dataset/memegenerator.csv", usecols=["Meme Page URL"])
    .squeeze()
)
memesCsvLinks.head()

0           http://memegenerator.net/instance/10509464
1           http://memegenerator.net/instance/12285257
2           http://memegenerator.net/instance/20612245
3    http://webarchive.loc.gov/all/0/http://memegen...
4           http://memegenerator.net/instance/24194267
Name: Meme Page URL, dtype: object

Inspecting the site shows that a page containing only the meme image is served from a URL of the form <br>
`https://memegenerator.net/img/instances/XX.jpg` <br>
where `XX` is the unique id of the image on [Meme Generator](https://memegenerator.net/). <br>
The following regex therefore extracts that id from the end of each `Meme Page URL` value. <br>
[Code Source](https://stackoverflow.com/questions/65041520/replace-text-in-a-pandas-dataframe-column-with-regex)

In [None]:
# Capture the whole run of digits at the end of each URL. A fixed-width
# pattern such as [0-9]{8}$ would silently cut an id with more than 8 digits
# down to its last 8 digits (i.e. a different image) and drop shorter ids.
# URLs that do not end in digits yield NaN and are removed by dropna().
# With a single capture group, extract() returns a one-column DataFrame
# whose column is labelled 0.
memesIds = memesCsvLinks.str.extract(r'([0-9]+$)').dropna()
memesIds.head()

Unnamed: 0,0
0,10509464
1,12285257
2,20612245
3,20614628
4,24194267


In [None]:
# Widen the column display so the complete URLs are visible in the preview.
pd.set_option('max_colwidth', 70)
# Wrap every extracted id with the direct-image URL prefix and the file suffix.
memesLinks = (
    memesIds
    .radd('https://memegenerator.net/img/instances/')
    .add('.jpg')
)
memesLinks.head()

Unnamed: 0,0
0,https://memegenerator.net/img/instances/10509464.jpg
1,https://memegenerator.net/img/instances/12285257.jpg
2,https://memegenerator.net/img/instances/20612245.jpg
3,https://memegenerator.net/img/instances/20614628.jpg
4,https://memegenerator.net/img/instances/24194267.jpg


In [None]:
# Undo the temporary widening used for the URL preview. reset_option restores
# the pandas default instead of hard-coding a width (the previous value, 40,
# was narrower than the default, so it did not actually restore the display).
pd.reset_option('display.max_colwidth')

### Downloading images

In [None]:
def downloadImgs(imgType, links=None, baseDir="dataset", timeout=10):
    """Download images and distribute them over train/val/test folders.

    The URLs are split in order: the first 75% go to ``train``, the next 15%
    to ``val`` and the remainder to ``test``. Each image is written to
    ``<baseDir>/<split>/<imgType>/<last path segment of the URL>``.
    Files that already exist are not downloaded again, so an interrupted run
    can be resumed. URLs that fail to download are reported and skipped.

    NOTE(review): the ``<split>/<imgType>`` folder layout is an assumption
    derived from loadData(), which reads one sub-folder per label from a data
    directory -- confirm it matches the intended dataset structure.

    Parameters
    ----------
    imgType : str
        Name of the label folder to fill, e.g. "1. imemes" or "2. ememes".
    links : iterable of str, pandas Series or DataFrame, optional
        Image URLs. Defaults to the notebook-level ``memesLinks``.
    baseDir : str, optional
        Root folder under which the split folders are created.
    timeout : float, optional
        Per-request timeout in seconds, passed to ``requests.get``.

    Returns
    -------
    dict
        Maps "train", "val" and "test" to the number of images present on
        disk for that split after the run (downloaded now or found existing).
    """
    if links is None:
        links = memesLinks  # notebook-level table of image URLs

    # memesLinks is a one-column DataFrame; flatten pandas/numpy containers
    # into a plain list of URL strings so they can be sliced uniformly.
    if hasattr(links, "to_numpy"):
        urls = [str(url) for url in links.to_numpy().ravel()]
    else:
        urls = [str(url) for url in links]

    trainSize = int(len(urls) * 0.75)
    valSize = int(len(urls) * 0.15)
    # Whatever is left after train and val becomes the test split.
    splits = {
        "train": urls[:trainSize],
        "val": urls[trainSize:trainSize + valSize],
        "test": urls[trainSize + valSize:],
    }

    counts = {}
    for split, splitUrls in splits.items():
        outDir = os.path.join(baseDir, split, imgType)
        os.makedirs(outDir, exist_ok=True)
        counts[split] = 0
        for url in splitUrls:
            target = os.path.join(outDir, url.rsplit("/", 1)[-1])
            if not os.path.exists(target):
                try:
                    response = requests.get(url, timeout=timeout)
                    response.raise_for_status()
                except requests.RequestException as e:
                    # Best effort: one dead link must not abort the whole run.
                    print(f"Skipping {url}: {e}")
                    continue
                # Write only after the full body arrived, so no partial files
                # are left behind to be mistaken for finished downloads.
                with open(target, "wb") as f:
                    f.write(response.content)
            counts[split] += 1
    return counts

In [None]:
def loadData(dataDir, labelToNum=None):
    """Load every image below ``dataDir`` as grayscale together with its label.

    ``dataDir`` must contain one sub-folder per key of ``labelToNum``; each
    file in a sub-folder is read with OpenCV and paired with that key's value.
    Files are visited in sorted name order so the result is reproducible.
    Files that OpenCV cannot read are reported and skipped.

    NOTE(review): ``cv2`` is not among the imports in this notebook's import
    cell -- add ``import cv2`` there before calling this function.

    Parameters
    ----------
    dataDir : str
        Folder holding the per-label sub-folders.
    labelToNum : dict, optional
        Maps sub-folder name to numeric label. Defaults to the notebook-level
        ``imgTypeToNum``.

    Returns
    -------
    tuple
        ``(imgs, labels)``: two parallel lists with the loaded images and
        their numeric labels.

    Raises
    ------
    FileNotFoundError
        If a sub-folder named in ``labelToNum`` does not exist.
    """
    if labelToNum is None:
        labelToNum = imgTypeToNum

    imgs = []
    labels = []
    for key, value in labelToNum.items():
        path = os.path.join(dataDir, key)
        for imgName in sorted(os.listdir(path)):
            imgPath = os.path.join(path, imgName)
            try:
                # IMREAD_GRAYSCALE is the imread flag for grayscale loading;
                # COLOR_BGR2GRAY is a cvtColor conversion code and does not
                # request grayscale when passed to imread.
                img = cv2.imread(imgPath, cv2.IMREAD_GRAYSCALE)
            except Exception as e:
                print(e)
                continue
            # imread signals an unreadable file by returning None rather than
            # raising, so it has to be filtered out explicitly.
            if img is None:
                print(f"Could not read image: {imgPath}")
                continue
            imgs.append(img)
            labels.append(value)
    return (imgs, labels)