In [38]:
import fitz
import requests
import cv2
import imutils
import camelot.io as camelot
import pandas as pd
import numpy as np
from collections import Counter

# PDF Extractor

In [39]:
# Extracting words from "dissertation.pdf"

# Open the PDF in a `with` block so the file handle is released once the text is read
with fitz.open('data/dissertation.pdf') as doc:
    text = "".join(page.get_text("text") for page in doc)

# Whitespace-split tokens; punctuation and capitalisation are left as extracted,
# so "The" and "the" are counted separately
words = pd.Series(text.split())
words.value_counts().head(30)

the         2640
.           2323
of          1580
to          1384
and         1174
a            977
in           694
is           612
for          524
that         507
be           431
data         407
The          361
are          349
with         337
as           307
students     259
can          257
this         247
course       233
it           222
or           221
on           210
an           190
their        182
was          176
by           171
Data         171
not          168
I            155
dtype: int64

In [40]:
# Pull the first table that camelot detects in "calendar.pdf"

tables = camelot.read_pdf('data/calendar.pdf')
calendar_table = tables[0]
df = calendar_table.df  # the parsed table as a DataFrame
df

Unnamed: 0,0,1,2,3,4
0,Fall,2020 Fall Term,,2021 Fall Term,
1,First Day of Classes,Tue,September 1,Tue,August 31
2,Labor Day - Classes Will Meet,Mon,September 7,,
3,Labor Day - Classes Suspended,,,Mon,September 6
4,Last day to add or drop courses,Tue,September 15,Tue,September 14
...,...,...,...,...,...
70,Summer Session - 10 week - classes begin,Mon,June 7,Mon,June 6
71,Last day to add or drop courses,Wed,June 16,Wed,June 15
72,Last day to change registration or withdraw fr...,Thur,July 15,Thur,July 14
73,Final Exams,Fri,August 13,Fri,August 12


In [41]:
# Pull the first table that camelot detects in "GingerChocolateChipCookies.pdf"
cookie_tables = camelot.read_pdf('data/GingerChocolateChipCookies.pdf')
cookie_table = cookie_tables[0]
cookie_df = cookie_table.df  # the parsed table as a DataFrame
cookie_df

Unnamed: 0,0,1,2,3,4
0,4.0,c,oats,,
1,3.0,c,ﬂour 2.5?,,
2,2.0,t,soda,,
3,1.0,t,salt,,
4,2.0,t,ginger powder,,
5,0.5,t,hot red pepper powder,,
6,,,,,
7,4.0,,eggs,,
8,2.0,c,sugar + 2T,,
9,2.0,T,molasses,,


# Reddit Image Transcriber

In [18]:
# Set a User Agent to avoid being blocked
response = requests.get(
    "https://www.reddit.com/r/comics/.json",
    headers={'User-agent': 'your bot 0.1'},
    timeout=10,  # without a timeout, requests can wait forever and hang the kernel
)
# Fail here on an HTTP error status instead of parsing an error body as if it were the listing
response.raise_for_status()
data = response.json()
# data  # uncomment to inspect the raw payload

# Face Finding

In [43]:
# Human face finding
# Taken from tutorial at
# https://opencv-python-tutroals.readthedocs.io/en/latest/py_tutorials/py_objdetect/py_face_detection/py_face_detection.html#face-detection

face_cascade = cv2.CascadeClassifier('xml/haarcascade_frontalface_default.xml')
eye_cascade = cv2.CascadeClassifier('xml/haarcascade_eye.xml')

# CascadeClassifier does not raise on a bad path; it just yields an empty classifier,
# so check explicitly and fail with a readable message.
if face_cascade.empty() or eye_cascade.empty():
    raise FileNotFoundError(
        "Could not load 'xml/haarcascade_frontalface_default.xml' and/or 'xml/haarcascade_eye.xml'"
    )

def face_detect(imgpath, face_cascade, eye_cascade):
    """Box the faces and eyes found in an image and show the result in an OpenCV window.

    The image is resized to a width of 800 px, faces are detected on its
    grayscale version, and eyes are searched for only inside each face region.
    Blocks until a key is pressed in the window, then closes all OpenCV windows.

    Args:
        imgpath: Path of the image file to read.
        face_cascade: Classifier whose ``detectMultiScale`` is run on the whole image.
        eye_cascade: Classifier whose ``detectMultiScale`` is run on each face region.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot load ``imgpath``.
    """
    img = cv2.imread(imgpath)
    if img is None:
        # cv2.imread reports a missing/unreadable file by returning None rather than
        # raising; without this check the failure shows up later as an AttributeError.
        raise FileNotFoundError(f"Could not read image: {imgpath!r}")

    img = imutils.resize(img, width=800)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.3, minNeighbors=5)
    for (x, y, w, h) in faces:
        img = cv2.rectangle(img, (x, y), (x + w, y + h), (255, 0, 0), 2)
        # Slices of the face region: detect on the gray one, draw on the colour one.
        # roi_color is a slice of img, so rectangles drawn on it land on the displayed image.
        roi_gray = gray[y:y + h, x:x + w]
        roi_color = img[y:y + h, x:x + w]
        eyes = eye_cascade.detectMultiScale(roi_gray)
        for (ex, ey, ew, eh) in eyes:
            # Eye coordinates are relative to the face region
            cv2.rectangle(roi_color, (ex, ey), (ex + ew, ey + eh), (0, 255, 0), 2)

    cv2.imshow('img', img)
    cv2.waitKey(0)  # 0 = wait for a key press with no timeout
    cv2.destroyAllWindows()
    
# Run the human face/eye detector on the sample portrait
face_detect(imgpath='data/me.jpg', face_cascade=face_cascade, eye_cascade=eye_cascade)

In [37]:
# Cat face finding
# Tutorial: https://techtutorialsx.com/2021/04/11/python-opencv-detect-cat-faces/

# NOTE: this rebinds the `face_cascade` name that the human face cell also assigns,
# so whichever of the two cells ran last decides which classifier the name refers to.
face_cascade = cv2.CascadeClassifier('xml/haarcascade_frontalcatface.xml')

# CascadeClassifier does not raise on a bad path; it just yields an empty classifier
if face_cascade.empty():
    raise FileNotFoundError("Could not load 'xml/haarcascade_frontalcatface.xml'")

def cat_face_detect(imgpath, face_cascade):
    """Box the faces found in an image and show the result in an OpenCV window.

    The image is resized to a width of 800 px before detection. Blocks until a
    key is pressed in the window, then closes all OpenCV windows.

    Args:
        imgpath: Path of the image file to read.
        face_cascade: Classifier whose ``detectMultiScale`` is run on the image.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot load ``imgpath``.
    """
    img = cv2.imread(imgpath)
    if img is None:
        # cv2.imread reports a missing/unreadable file by returning None rather than
        # raising; without this check the failure shows up later as an AttributeError.
        raise FileNotFoundError(f"Could not read image: {imgpath!r}")

    img = imutils.resize(img, width=800)

    faces = face_cascade.detectMultiScale(img)
    for (x, y, w, h) in faces:
        cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0))

    cv2.imshow('img', img)
    cv2.waitKey(0)  # 0 = wait for a key press with no timeout
    cv2.destroyAllWindows()
    
# Run the cat face detector on the sample cat photo
cat_face_detect(imgpath='data/cat.jpg', face_cascade=face_cascade)