In [1]:
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import pytesseract
import cv2
import os
from openpyxl import load_workbook # library for excel read and write
import re # regex
import datetime

# Read in image and OCR

In [2]:
def readImage(filename):
    """Run Tesseract OCR on an image file and return the recognized text.

    Parameters
    ----------
    filename : path of the image to open with PIL.

    Returns
    -------
    The string produced by ``pytesseract.image_to_string`` for that image.
    """
    # The context manager closes the underlying file as soon as OCR is done,
    # so handles do not pile up when many images are processed in one run.
    with Image.open(filename) as img:
        return pytesseract.image_to_string(img)

# Process text

In [3]:
class MovieEntry:
    """A movie name paired with the theaters it is associated with.

    Both values are stored exactly as given; nothing is copied or validated.
    """

    def __init__(self, movieName, theaterNames):
        self.movieName, self.theaterNames = movieName, theaterNames

    def __str__(self):
        # Plain-text summary used when an entry is printed
        template = "< Movie: {} in theater: {}"
        return template.format(self.movieName, self.theaterNames)

In [4]:
# replace punctuation by regular expression
def processText(string):
    """Strip punctuation from OCR text and split it into paragraphs.

    Every character that is not a word character, whitespace, or a
    parenthesis is removed; the result is then split on blank lines
    (two consecutive newlines).

    Returns a list of paragraph strings.
    """
    punctuation = re.compile(r'[^\w\s\(\)]')
    cleaned = punctuation.sub('', string)
    return cleaned.split("\n\n")

In [5]:
def isCapitalizedWord(word):
    """Return True when ``word`` is purely alphabetic and entirely upper-case.

    Equivalent to requiring both ``str.isalpha`` and ``str.isupper`` to hold.
    """
    if not word.isalpha():
        return False
    return word.isupper()


def getNormalMovieEntries(paragraphs, loud=False):
    """Extract MovieEntry objects from paragraphs that start with a movie name.

    A paragraph whose first word is capitalized (see isCapitalizedWord) starts
    a new record and resets all parsing state. Any other non-empty paragraph is
    treated as a continuation of the previous record and keeps accumulating
    into the same state; it is skipped if no record has been started yet.

    Within a record:
      * the leading run of capitalized words longer than one character is the
        movie name;
      * after that run ends, every capitalized word longer than one character
        is recorded as a theater name, together with the following word when
        that word begins with "(";
      * maxNumTime tracks the longest run of consecutive all-digit words.

    A record is emitted only when maxNumTime > 1 and both a movie name and at
    least one theater name were found.

    Parameters
    ----------
    paragraphs : iterable of paragraph strings.
    loud : when True, print diagnostic information for each paragraph.

    Returns
    -------
    list of MovieEntry whose movieName is a list of words and whose
    theaterNames is a list of strings.
    """
    movieEntrys = []
    hasPrevParagraph = False

    for paragraph in paragraphs:
        words = paragraph.split()
        
        if len(words) == 0:
            continue
        
        isContinuation = not isCapitalizedWord(words[0])
        if loud and isContinuation:
            print("Continue the last paragraph because {} is not capitalized".format(words[0]))
        # It is possible that this paragraph was accidentally separated from the
        # previous one. Whenever a paragraph does not start with a capitalized
        # word (movie name), it is treated as a continuation of the last one:
        # the state below is NOT reset and keeps accumulating.
        if not isContinuation:
            isMovieName = True
            movieName = [] # array that contains movie names
            theaterNames = []
            maxNumTime = 0
            numTime = 0
            hasPrevParagraph = True
            justAdded = False
        elif not hasPrevParagraph:
            # continuation with nothing to continue: no state exists yet
            continue

        for i, word in enumerate(words):

            # store movie name by taking the first few capitalized words
            if isMovieName and isCapitalizedWord(word) and len(word) > 1:
                movieName.append(word)
            # stop taking movie name when seeing the first non capitalized word
            else:
                isMovieName = False

            # Get theater names if it's capitalized
            if not isMovieName and isCapitalizedWord(word) and len(word) > 1:
                # if the next word starts with a (, include it
                if i + 1 < len(words) and words[i+1][0] == "(":
                    theaterNames.append(word + " " + words[i+1])
                else:
                    theaterNames.append(word)

            # judge whether it's a primary showing by
            # the number of time slots it has: numTime counts the current run
            # of consecutive all-digit words and resets on any other word
            if word.isdigit():
                numTime += 1
            else:
                numTime = 0

            if numTime > maxNumTime:
                maxNumTime = numTime
        
        if loud and len(movieName) > 0:
            print(movieName)
        if loud:
            if len(theaterNames) > 0:
                print(theaterNames)
            else:
                print("No theaters found")
        
        if loud:
            print("Max number of time slots: {}".format(maxNumTime))
        
        # if this is a valid paragraph, and the number of
        # show times is more than 1, then it's a primary showing
        if maxNumTime > 1 and len(movieName) > 0 and len(theaterNames) > 0:
            # a continuation extends the record that was already emitted, so
            # drop the earlier entry before appending the updated one
            if isContinuation and justAdded:
                movieEntrys.pop()
            movieEntrys.append(MovieEntry(movieName, theaterNames))
            justAdded = True
        
    return movieEntrys

In [15]:
def testSingleLetter(arr):
    allSingle = True
    for string in arr:
        if len(string) != 1:
            allSingle = False
            break
    return allSingle

def getForeignMovieEntries(paragraphs, loud=False):
    """Extract MovieEntry objects from paragraphs that start with a theater name.

    A paragraph whose first word is capitalized (see isCapitalizedWord) starts
    a new record and resets all parsing state. Any other non-empty paragraph is
    treated as a continuation of the previous record and keeps accumulating
    into the same state; it is skipped if no record has been started yet.

    Within a record:
      * the leading run of capitalized words longer than one character is the
        theater name;
      * maxNumTime tracks the longest run of consecutive all-digit words;
      * once maxNumTime exceeds 1, the next run of capitalized words (of any
        length) is collected as the movie name. When that run ends, it is
        discarded and collection restarts if every collected word is a single
        letter; otherwise the movie name is considered complete.

    A record is emitted only when maxNumTime > 1 and both a movie name and a
    theater name were found.

    Parameters
    ----------
    paragraphs : iterable of paragraph strings.
    loud : when True, print diagnostic information for each paragraph.

    Returns
    -------
    list of MovieEntry built as MovieEntry(movieName, theaterName), where both
    arguments are lists of words.
    """
    movieEntrys = []
    hasPrevParagraph = False

    for paragraph in paragraphs:
        words = paragraph.split()

        if len(words) == 0:
            continue

        isContinuation = not isCapitalizedWord(words[0])
        if loud and isContinuation:
            print("Continue the last paragraph because {} is not capitalized".format(words[0]))

        # It is possible that this paragraph was accidentally separated from the
        # previous one. Whenever a paragraph does not start with a capitalized
        # word (theater name), it is treated as a continuation of the last one:
        # the state below is NOT reset and keeps accumulating.
        if not isContinuation:
            isTheaterName = True
            isTakingMovieName = 0 # 0: before, 1: taking, 2: after
            theaterName = []
            movieName = []
            numTime = 0
            maxNumTime = 0
            hasPrevParagraph = True
            justAdded = False
        elif not hasPrevParagraph:
            # continuation with nothing to continue: no state exists yet
            continue


        for i, word in enumerate(words):

            # store theater name by taking the first few capitalized words
            if isTheaterName and isCapitalizedWord(word) and len(word) > 1:
                theaterName.append(word)
            # stop taking theater name when seeing the first non capitalized word
            else:
                isTheaterName = False

            # Get the movie name that comes after the show times. maxNumTime is
            # read here before it is updated for the current word.
            if not isTheaterName and isTakingMovieName != 2 and maxNumTime > 1 \
                and isCapitalizedWord(word):
                movieName.append(word)
                isTakingMovieName = 1
            elif isTakingMovieName == 1:
                # the capitalized run just ended: a run made only of single
                # letters is thrown away and collection starts over
                # NOTE(review): this check only runs when another word follows
                # the run; a run that ends the paragraph is not re-checked here.
                allSingleLetter = testSingleLetter(movieName)
                if allSingleLetter:
                    movieName.clear()
                    isTakingMovieName = 0
                else:
                    isTakingMovieName = 2

            # record the number of time slots it has: numTime counts the current
            # run of consecutive all-digit words and resets on any other word
            if word.isdigit():
                numTime += 1
            else:
                numTime = 0

            if numTime > maxNumTime:
                maxNumTime = numTime

        if loud:
            if len(movieName) > 0:
                print("Movie: ", movieName)
            else:
                print("No movies found")
        if loud:
            if len(theaterName) > 0:
                print("Theater: ", theaterName)


        if loud:
            print("Max number of time slots: {}".format(maxNumTime))

        # if this is a valid paragraph, and the number of
        # show times is more than 1, then it's a primary showing
        if maxNumTime > 1 and len(movieName) > 0 and len(theaterName) > 0:
            # a continuation extends the record that was already emitted, so
            # drop the earlier entry before appending the updated one
            if isContinuation and justAdded:
                movieEntrys.pop()
            movieEntrys.append(MovieEntry(movieName, theaterName))
            justAdded = True  
            
    return movieEntrys

# Read theater names from the Excel template

In [7]:
# ws: worksheet
def getTheaterInfo(ws):
    """Read column A of a worksheet, skipping its first cell.

    Parameters
    ----------
    ws : worksheet-like object; ``ws['A']`` must yield cells exposing ``.value``.

    Returns
    -------
    (names, rowByName) where ``names`` lists the values of every cell after
    the first, and ``rowByName`` maps each value to its 0-based position in
    the column (a repeated value keeps its last position).
    """
    # drop position 0 (presumably a header cell) but keep the original indices
    indexedCells = list(enumerate(ws['A']))[1:]
    names = [cell.value for _, cell in indexedCells]
    rowByName = {cell.value: position for position, cell in indexedCells}
    return names, rowByName

# Filter valid theater names

In [8]:
# fuzzy matching of OCR'd theater names against the names in the template
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process

def sortByScore(a):
    """Sort key: return the second element (the score) of a match tuple."""
    score = a[1]
    return score

def filterEntries(movieEntrys, providedTheaterNames, listOfTowns, townThreshold=80, theaterThreshold=85):
    """Keep only the entries whose theaters fuzzy-match a known theater name.

    Each raw theater string is matched against ``providedTheaterNames``. When
    the string carries a parenthesized town ("NAME (Town)"), the part before
    the parenthesis is matched as well, and the town must match one of
    ``listOfTowns`` for the theater to be accepted.

    Parameters
    ----------
    movieEntrys : iterable of MovieEntry whose ``movieName`` is a list of
        words and whose ``theaterNames`` is a list of raw theater strings.
    providedTheaterNames : known theater names to match against.
    listOfTowns : known town names used to validate parenthesized towns.
    townThreshold : a town match must score strictly above this value.
    theaterThreshold : a theater match must score strictly above this value.

    Returns
    -------
    list of new MovieEntry objects whose ``movieName`` is the title-cased,
    space-joined movie name and whose ``theaterNames`` holds the matched
    provided theater names without duplicates. Entries with no accepted
    theater are dropped.
    """
    validEntries = []

    # With nothing to match against no theater can be accepted; returning here
    # also avoids indexing into an empty match list further down.
    if len(providedTheaterNames) == 0:
        return validEntries

    for entry in movieEntrys:
        theaters = []

        for theaterName in entry.theaterNames:
            # the top two closest theater names for the raw string
            potentials = process.extract(theaterName, providedTheaterNames, limit=2)

            # if the theater has a location within parentheses
            if "(" in theaterName:
                parenIndex = theaterName.index("(")
                town = theaterName[parenIndex:].replace("(", "").replace(")", "")

                # more candidates from the theater name without the town
                potentials.extend(
                    process.extract(theaterName[:parenIndex], providedTheaterNames, limit=2)
                )

                # an empty town list means the town cannot be confirmed
                potentialTowns = process.extract(town, listOfTowns) if len(listOfTowns) > 0 else []
                if not potentialTowns:
                    continue
                if max(potentialTowns, key=sortByScore)[1] <= townThreshold:
                    continue

            # max() returns the first of any tied top scores, which is the same
            # element a stable descending sort would place first
            best = max(potentials, key=sortByScore)
            if best[1] > theaterThreshold and best[0] not in theaters:
                theaters.append(best[0])

        if len(theaters) > 0:
            name = " ".join(entry.movieName)
            validEntries.append(MovieEntry(name.title(), theaters))

    return validEntries

# Fill in Excel

In [9]:
# Known town names. This list is passed to filterEntries, which fuzzy-matches
# the parenthesized part of an OCR'd theater string against it.
listOfTowns = [
    "Andheri",
    "Bandra",
    "Bassein",
    "Bhandup",
    "Bhivandi",
    "Borivli",
    "Chembur",
    "Colaba",
    "Delisle Road",
    "Ghatkopar",
    "Goregaon",
    "Jogeshwari",
    "Juhu",
    "Kalyan",
    "Kandivli",
    "Kurla",
    "Malad",
    "Matunga",
    "Mazgaon",
    "Mulund",
    "Naroda",
    "Panvel",
    "Parle",
    "Santa Cruz",
    "Thana",
    "Ulhasnagar"
]

In [17]:
def fillInExcelWithEntries(ws, day, validEntries, theaterToRow):
    """Write each entry's movie name into the cell for (theater, day).

    For every theater of every entry, the cell at column ``day + 1`` and row
    ``theaterToRow[theater] + 1`` is set to the entry's ``movieName``. A later
    entry for the same theater overwrites an earlier one.

    Raises KeyError if a theater is missing from ``theaterToRow``.
    """
    placements = (
        (entry, theater)
        for entry in validEntries
        for theater in entry.theaterNames
    )
    for entry, theater in placements:
        columnNumber = day + 1
        rowNumber = theaterToRow[theater] + 1
        ws.cell(column=columnNumber, row=rowNumber, value=entry.movieName)

# All in all

In [18]:
from tqdm import tqdm
def easyFillIn(imageNames, workbookName, writeFileName, normal=True):
    """OCR each image and write the recognized showings into a workbook copy.

    For every image the text is OCR'd, split into paragraphs, parsed into raw
    entries, filtered against the theater names found in the workbook, and
    written into the active worksheet. The workbook is saved once, after all
    images have been processed.

    Parameters
    ----------
    imageNames : iterable of image paths. The third underscore-separated field
        of each file name (directories excluded) must be an integer day; the
        results for that image go into worksheet column ``day + 1``.
    workbookName : path of the workbook to load.
    writeFileName : path the filled workbook is saved to.
    normal : True to parse with getNormalMovieEntries, False to parse with
        getForeignMovieEntries.

    Raises
    ------
    IndexError / ValueError if a file name does not carry an integer in its
    third underscore-separated field.
    """
    wb = load_workbook(filename=workbookName)
    ws = wb.active  # the workbook's active worksheet
    providedTheaterNames, theaterToRow = getTheaterInfo(ws)
    extractEntries = getNormalMovieEntries if normal else getForeignMovieEntries

    for imageName in tqdm(imageNames):
        # Parse the day from the file name alone, so that an underscore in a
        # directory name cannot shift which field is read.
        day = int(os.path.basename(imageName).split("_")[2])
        text = readImage(imageName)
        paragraphs = processText(text)
        rawEntries = extractEntries(paragraphs)
        validEntries = filterEntries(rawEntries, providedTheaterNames, listOfTowns)
        fillInExcelWithEntries(ws, day, validEntries, theaterToRow)

    wb.save(filename=writeFileName)

In [12]:
from os import listdir
from os.path import isfile, join
# Collect every file in data/ whose name contains both 'f' and '.png'.
# NOTE(review): the previous comment here read "no foreign movies", yet the run
# below passes normal=False (the foreign parser) - confirm what the 'f' filter
# is meant to select.
imageNames = ["data/" + f for f in listdir("data/") if 'f' in f and '.png' in f]

In [20]:

# Process every collected image with the foreign-listing parser (normal=False),
# loading test.xlsx and saving the filled workbook to test2.xlsx.
easyFillIn(imageNames, 
           "test.xlsx", "test2.xlsx", False)

100%|██████████| 28/28 [01:07<00:00,  2.53s/it]
