Install the following necessary packages/libraries.

In [None]:
! pip install tqdm
! pip install pytesseract
! pip install selenium
! pip install beautifulsoup4
! pip install openpyxl
! pip install Pillow

In [None]:
# Common imports
from tqdm.notebook import tqdm
from time import sleep
import openpyxl

# Data Extraction from [Coding Club Image Posts](https://www.linkedin.com/company/coding-club/posts/?feedView=images)

Install ChromeDriver for Selenium to work.<br>
Download the latest version here: https://chromedriver.chromium.org/ <br>
For Windows, download **chromedriver_win32.zip** <br><br>

Install Tesseract engine for Text extraction.<br>
Download the latest version for Windows here: https://github.com/UB-Mannheim/tesseract/wiki <br><br>
Steps involved / Outline of the program:
1. Scrape all image posts using Selenium.
2. Get all valid image links using BeautifulSoup.
3. Store the links in an XLSX (Excel) file using Openpyxl.
4. Download all images using urllib.
5. Process each image for text using PyTesseract.
6. Filter, validate and parse the extracted text into its respective fields.
7. Fill the remaining columns of the XLSX file.

In [None]:
import os  # stdlib; used only to make sure the target folder exists

workbook_path = 'D:/CodingClubPosts-MiniProject/QB.xlsx'

# Create the workbook to be used.
# Workbook() does not take a filename: its first parameter is `write_only`,
# so passing the path there silently produced a write-only workbook.
# Build a regular workbook instead and rename its default sheet to 'Data',
# which leaves the file with the same single 'Data' sheet as before.
os.makedirs(os.path.dirname(workbook_path), exist_ok=True)
workbook = openpyxl.Workbook()
workbook.active.title = 'Data'
workbook.save(workbook_path)

Note: If you want to view the workbook while the program is running, make a copy of the file and open the copy instead. Opening the original file locks it, so the program can no longer write to it; this raises an error and terminates the execution of the program.

### Scrape for Image Posts

In [None]:
# Location of the ChromeDriver executable handed to Selenium when the browser is started.
# Adjust this to wherever ChromeDriver was unpacked on your machine.
chromedriver_path = r'D:\Documents\drivers\chromedriver'

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

# Selenium 4 expects the driver location to be wrapped in a Service object;
# passing the path positionally is no longer accepted by webdriver.Chrome.
from selenium.webdriver.chrome.service import Service

driver = webdriver.Chrome(service=Service(chromedriver_path))
# Open the landing page so the user can log in manually before scraping.
driver.get('https://www.linkedin.com/')

Log in to LinkedIn in the browser window that just opened before proceeding.

In [None]:
# Navigate the already logged-in browser to the club's posts page, filtered to image posts.
driver.get('https://www.linkedin.com/company/coding-club/posts/?feedView=images')

The following snippet scrolls to the bottom of the page. This is needed because LinkedIn renders posts dynamically as you scroll.<br>
Run this cell again if the end of the feed has not been reached.

In [None]:
# LinkedIn appends more posts only after the page has had time to react to a
# scroll, so pause after each scroll and stop early once the page height has
# stopped growing for several attempts in a row.
max_scrolls = 1000          # hard upper bound on scroll attempts
scroll_pause_seconds = 1    # time given to the page to load the next batch
max_stalled_scrolls = 5     # consecutive no-growth scrolls treated as "end reached"

stalled_scrolls = 0
last_height = driver.execute_script("return document.body.scrollHeight")
for _ in tqdm(range(max_scrolls)):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(scroll_pause_seconds)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        stalled_scrolls += 1
        if stalled_scrolls >= max_stalled_scrolls:
            break
    else:
        stalled_scrolls = 0
        last_height = new_height

Scrape all posts

In [None]:
from bs4 import BeautifulSoup
# Take a snapshot of the rendered page and collect every post container in it.
src = driver.page_source
soup = BeautifulSoup(markup=src, features='html.parser')
divs = soup.find_all(name='div', attrs={'class': 'occludable-update ember-view'})
post_count = len(divs)
print(f'Approximately {post_count} posts found.')

Fetch the image link from each post whose text contains one of the `search_tags`, and store the links in the XLSX file.

In [None]:
# Substrings looked for in a post's text to decide whether its image link is kept.
# Note that '#Google ' deliberately ends with a space, which is part of the pattern.
search_tags = ['#Asked', '#Google ', '#Coding', 'Coding Interview', '[Difficulty :']

In [None]:
import openpyxl
workbook = openpyxl.load_workbook(workbook_path)
sheet = workbook['Data']

# Post text is lower-cased below, so lower-case the tags once up front to
# make the match case-insensitive.
lowered_tags = [tag.lower() for tag in search_tags]

filled = 0  # number of links written so far
for div in tqdm(divs):
    post_text = div.get_text().lower()
    if not any(tag in post_text for tag in lowered_tags):
        continue
    img = div.find('img', {'width': "600", 'height': "600"})
    if img is None:
        continue
    link = img.get('src')
    if not link:
        # An <img> without a usable src would leave a hole in column J.
        continue
    filled += 1
    # Row 1 is left free; the n-th link goes to row n + 1 of column J.
    sheet[f'J{filled + 1}'] = link

# Saving once after the loop is enough; saving per link rewrote the whole file each time.
workbook.save(workbook_path)
print(f'{filled} cells filled.')

It is recommended to log out of LinkedIn before closing the driver.

In [None]:
# quit() ends the whole WebDriver session and stops the chromedriver process;
# close() would only close the current window and leave the driver running.
driver.quit()

Download all images

In [None]:
# Folder for the downloaded post images (files are named coding<N>.jpg).
# The trailing slash matters: file names are appended to this string directly.
imgs_folder_path = 'D:/CodingClubPosts-MiniProject/imgs/'

In [None]:
import urllib
import os
import urllib.request  # a bare `import urllib` does not guarantee the `request` submodule is loaded

sheet = workbook['Data']
os.makedirs(imgs_folder_path, exist_ok=True)

# Links live in J2..J<max_row>; image i is taken from row i + 1, so the last
# valid image index is max_row - 1.
for i in tqdm(range(1, sheet.max_row)):
    link = sheet[f'J{i+1}'].value  # the cell's value, not the Cell object itself
    if not link:
        continue
    file_path = f'{imgs_folder_path}coding{i}.jpg'
    urllib.request.urlretrieve(link, file_path)

### Extract Data from Images

In [None]:
# Path to the Tesseract executable used by pytesseract.
tesseract_path = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# Folder that receives the pre-processed crops (cat<N>.jpg and di<N>.jpg).
processed_imgs_folder_path = 'D:/CodingClubPosts-MiniProject/imgs/processed/'
# Text file listing images whose type could not be recognised.
log_filepath = 'D:/CCDE_log.txt'
# Images coding1.jpg .. coding<last_img_index>.jpg are processed by the extraction loop.
last_img_index = 908
# (wrong, right) pairs applied one after another with str.replace to the OCR output,
# so the order of the entries matters.
# NOTE(review): '\\n\\n' is a literal backslash-n sequence, not two newline characters,
# so it will not collapse blank lines in the OCR text -- confirm this is intended.
common_errors = [('Space |','Space:'), ('Input-', 'Input:'), ('Spac ', 'Space'), ('Difficulty level -', 'Difficulty level:'), ('casy', 'easy'), ('(nput', 'Input'), ('.-',':'), ('--', ':'), ('©',':'), ('Qutput','Output'), (';', ':'), ('\\n\\n', '\\n'), (':-',':'), (' :', ':'), (' : ', ':'), (': ', ':'), ('Difficulty level ', 'Difficulty level:')]

Improve image, extract text and save the information.

In [None]:
from PIL import Image, ImageEnhance, ImageOps
from pytesseract import pytesseract
import numpy
import cv2

# Point pytesseract at the configured Tesseract executable.
pytesseract.tesseract_cmd = tesseract_path

# Allowed characters: when the cleaned text is built, every character that is
# not in this string is replaced with a space.
# NOTE(review): '.', quotes and newlines are not in this set, so they are
# replaced as well -- confirm that is intended.
alp = '1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ,>?/\\+=-_;:<>#$*^&%(){}[]@|!'
# Unused Tesseract whitelist option, kept for reference:
# conf = '-c tessedit_char_whitelist=1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,>?/\\\'"+=-_:<>#$*^&%(){}[]'
# Reload the workbook from disk so the extracted fields are added to the saved links.
workbook = openpyxl.load_workbook(workbook_path)

# All areas below are crop boxes written as (x, y, x + width, y + height).
# Bounding coordinates to detect type of post.
ca_area = (217, 98, 217+365, 98+97)
# Bounding coordinates for general question.
q_area = (16, 186, 16+767, 186+484)
# Bounding coordinates for Google question.
g_q_area = (14, 226, 14+775, 226+457)

# Improves image for text extraction.
def upscale(img, f=0.5, threshold=100):
    """Rescale an image by factor ``f`` and binarise it for OCR.

    Despite the name, the default ``f=0.5`` shrinks the image; a factor above
    1 enlarges it. Pixels brighter than ``threshold`` become 255 and all
    others 0. Callers in this notebook pass single-band grayscale crops.

    Args:
        img: PIL image to process.
        f: scale factor applied to both width and height.
        threshold: brightness cut-off used for binarisation.

    Returns:
        The rescaled, thresholded PIL image.
    """
    pixels = numpy.array(img)
    resized = cv2.resize(pixels, None, fx=f, fy=f, interpolation=cv2.INTER_AREA)
    binarise = lambda p: p > threshold and 255
    return Image.fromarray(resized).point(binarise)

# Extracted text parsed into list of field values.
def get_record(text):
    """Split OCR text of a post into its labelled fields.

    The text is scanned left to right. Whenever one of the known labels
    starts at the current position, the characters that follow are collected
    under that label until the next label begins. Text collected for a label
    that appears more than once is concatenated.

    The question itself has no label:
      * if a 'Difficulty level:' label is present, the first line after it is
        the difficulty and the rest of that section is the question;
      * otherwise the question is everything before the first label (or the
        whole text when no label is found).

    Args:
        text: OCR output after common-error replacement.

    Returns:
        A list of 8 stripped strings in the order
        [question, difficulty, time, space, input, output, topic, tag].
    """
    fields = ['Difficulty level:', 'Time:', 'Space:', 'Input:', 'Output:', 'Topic:', 'Tag:']
    values = ['' for _ in fields]
    pos, current = 0, -1
    first_label_at = -1
    has_difficulty = False

    while pos < len(text):
        # startswith(field, pos) avoids copying the tail of the text on every step.
        matched = next(
            (k for k, field in enumerate(fields) if text.startswith(field, pos)),
            -1,
        )
        if matched != -1:
            if first_label_at == -1:
                first_label_at = pos
            if matched == 0:
                has_difficulty = True
            current = matched
            pos += len(fields[matched])
            # Re-check at the new position: another label may follow directly,
            # or the text may end right after this label.
            continue
        if current != -1:
            values[current] += text[pos]
        pos += 1

    if has_difficulty:
        # partition() tolerates a difficulty section without a line break.
        difficulty, _, question = values[0].partition('\n')
        values[0] = difficulty.strip().capitalize()
    elif first_label_at == -1:
        question = text
    else:
        question = text[:first_label_at]

    return [value.strip() for value in [question] + values]

import os  # stdlib; used only to make sure the output folder exists

# Image.save() does not create missing folders, so create the target up front.
os.makedirs(processed_imgs_folder_path, exist_ok=True)

sheet = workbook['Data']

# The log is rewritten on every run; it lists images whose banner text matched
# none of the known post types.
with open(log_filepath, 'w') as log_file:
    log_file.write('This log file contains names of the images which may be irrelevant.\n\n')

    try:
        for i in tqdm(range(1, last_img_index + 1)):
            row = i + 1  # image i belongs to worksheet row i + 1
            img = ImageOps.grayscale(Image.open(f'{imgs_folder_path}coding{i}.jpg'))

            # OCR the banner region to find out which kind of post this is.
            cai = upscale(img.crop(ca_area))
            cai.save(f'{processed_imgs_folder_path}cat{i}.jpg')
            ca_text = pytesseract.image_to_string(cai).lower()

            if 'google' in ca_text:
                data_img = img.crop(g_q_area)
                sheet[f'M{row}'] = 'Google'
            elif any(tag in ca_text for tag in ['coding', 'interview', 'asked']):
                data_img = img.crop(q_area)
                sheet[f'M{row}'] = 'Ordinary'
            else:
                log_file.write(f'coding{i}.jpg, ')
                continue

            # OCR the question region and repair the known misreadings.
            di = upscale(data_img, 0.8, 180)
            complete_text = pytesseract.image_to_string(di)
            for wrong, right in common_errors:
                complete_text = complete_text.replace(wrong, right)
            di.save(f'{processed_imgs_folder_path}di{i}.jpg')

            # Parsed fields go to columns A, B, C, ... in the order returned by get_record.
            for column, value in enumerate(get_record(complete_text), start=1):
                sheet.cell(row=row, column=column).value = value

            # L keeps the raw text (escaped); K keeps it with every character
            # outside `alp` replaced by a space.
            sheet[f'L{row}'] = repr(complete_text)
            sheet[f'K{row}'].value = ''.join(
                ch if ch in alp else ' ' for ch in complete_text
            )
    finally:
        # Save even when an image fails, so the rows processed so far are not lost.
        workbook.save(workbook_path)