In [39]:
import os
import re
from string import punctuation
import datetime

import pytesseract
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import cv2
from skimage import io

import datefinder

import spacy 
from spacy import displacy


# from flair.data import Sentence
# from flair.models import SequenceTagger
from segtok.segmenter import split_single

from nltk import ngrams

# Location of the Tesseract executable used by pytesseract.
# Set the TESSERACT_CMD environment variable to override it on another machine;
# without it the previously hard-coded install path is used, so existing setups
# behave exactly as before.
# Alternative system-wide install location:
#   r'C:\Program Files\Tesseract-OCR\tesseract.exe'
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD',
    r'C:\Users\Beutler\AppData\Local\Programs\Tesseract-OCR\tesseract.exe',
)

# spaCy language models used by the NLP helpers below (stop words, POS tags, entities).
# English model, small ("sm") variant: less accurate.
eng_nlp = spacy.load("en_core_web_sm")
# German model, small ("sm") variant: less accurate.
de_nlp = spacy.load("de_core_news_sm")




In [40]:
# Directory with the images to run OCR on.
# Set the DANCY_IMAGE_DIR environment variable to use another folder instead of
# editing this cell; the default is the previously hard-coded location.
# Previously used alternative:
#   r"C:\Users\fabcr\0A_Jupyter_Lab\DHBW\06_Semester\Seems Dancy\Images_tes\Base"
path = os.environ.get(
    "DANCY_IMAGE_DIR",
    r"C:\Users\Beutler\Desktop\Uni - neu\Semester 6\dancy\NLP\images",
)

# List the folder through `path` so both names always refer to the same directory.
img_list = os.listdir(path)


In [41]:
# get grayscale image
def get_grayscale(image):
    """Convert a BGR image to grayscale with cv2.cvtColor."""
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return grayscale

# noise removal
def remove_noise(image):
    """Smooth the image with a median blur of aperture size 5."""
    aperture_size = 5
    return cv2.medianBlur(image, aperture_size)
 
#thresholding
def thresholding(image):
    """Binarise the image with Otsu thresholding and return the thresholded image.

    cv2.threshold returns a pair; only its second element (the image) is returned.
    """
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _, binary = cv2.threshold(image, 0, 255, mode)
    return binary

#dilation
def dilate(image):
    """Dilate the image once with a 5x5 all-ones uint8 kernel."""
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    return cv2.dilate(image, structuring_element, iterations=1)
    
#erosion
def erode(image):
    """Erode the image once with a 5x5 all-ones uint8 kernel."""
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    return cv2.erode(image, structuring_element, iterations=1)

#opening - erosion followed by dilation
def opening(image):
    """Apply a morphological opening (cv2.MORPH_OPEN) with a 5x5 all-ones uint8 kernel."""
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    operation = cv2.MORPH_OPEN
    return cv2.morphologyEx(image, operation, structuring_element)

#canny edge detection
def canny(image):
    """Run Canny edge detection with thresholds 100 and 200."""
    lower, upper = 100, 200
    return cv2.Canny(image, lower, upper)


##########
# New
def torgb(image):
    """Convert an image from BGR to RGB channel order with cv2.cvtColor."""
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    return rgb


In [42]:
def display(image):
    """Show an OpenCV image inline with matplotlib.

    Colour images are converted from BGR to RGB channel order via torgb
    before plotting. Two-dimensional (single-channel) images, such as the
    results of get_grayscale or canny, are drawn with a gray colormap
    instead: the BGR-to-RGB conversion needs a 3- or 4-channel input and
    fails on them.

    Args:
        image: Image array to show.
    """
    if getattr(image, "ndim", None) == 2:
        # No colour channels to reorder.
        plt.imshow(image, cmap="gray")
    else:
        plt.imshow(torgb(image))
    plt.show()


def open_local_img(path, image):
    """Load an image file from a directory with OpenCV.

    Args:
        path: Directory that contains the file.
        image: File name inside ``path``.

    Returns:
        Whatever ``cv2.imread`` returns for the joined path. ``cv2.imread``
        returns ``None`` instead of raising when the file cannot be read.
    """
    # os.path.join uses the platform's separator instead of a hard-coded
    # backslash, so the helper also works outside Windows.
    return cv2.imread(os.path.join(path, image))

def open_url(url):
    """Read the image at ``url`` and return it with its colour channels converted.

    The image is loaded with skimage's ``io.imread`` and then passed through
    ``cv2.cvtColor`` using ``cv2.COLOR_BGR2RGB``.
    """
    loaded = io.imread(url)
    converted = cv2.cvtColor(loaded, cv2.COLOR_BGR2RGB)
    return converted


def image_extract(base, sol='', all=True):
    """Run Tesseract OCR on an image and return the recognised text as one line.

    Args:
        base: Image to read.
        sol: Text that the OCR output is appended to.
        all: If true, OCR is also run on the grayscale, median-blurred,
            dilated and Canny-edge versions of ``base`` (in that order) and
            each result is appended as well.

    Returns:
        ``sol`` followed by every OCR result, with all newlines replaced by
        spaces.
    """
    # The unmodified image is always read first.
    preprocessors = [lambda img: img]
    if all:
        preprocessors += [get_grayscale, remove_noise, dilate, canny]

    pieces = [sol]
    for prepare in preprocessors:
        pieces.append(pytesseract.image_to_string(prepare(base)))

    # Collapse the text into a single line.
    return "".join(pieces).replace('\n', ' ')


# Extract ALWAYS returns a list!

def extract_local_batch(batch, path, all=True):
    """Extract the text of every image file in a batch from one directory.

    Args:
        batch: Iterable of image file names.
        path: Directory the files are located in.
        all: Forwarded to image_extract.

    Returns:
        List with one extracted text per entry of ``batch``, in the same order.
    """
    return [
        image_extract(open_local_img(path, file_name), sol='', all=all)
        for file_name in batch
    ]


def extract_url_batch(batch, all=True):
    """Extract the text of every image referenced by a batch of URLs.

    Args:
        batch: Iterable of image URLs.
        all: Forwarded to image_extract.

    Returns:
        List with one extracted text per URL, in the same order.
    """
    return [
        image_extract(open_url(address), sol='', all=all)
        for address in batch
    ]

In [43]:
def ner_output(sol, language):
    """Process a text with the spaCy model of the requested language.

    Args:
        sol: Text to process.
        language: 'ger' selects the German model; any other value selects
            the English model.

    Returns:
        The result of calling the selected model on ``sol``.
    """
    pipeline = de_nlp if language == 'ger' else eng_nlp
    return pipeline(sol)


def get_keywords(text, lang="ger"):
    """Reduce a text to a space-separated string of lower-cased keywords.

    Args:
        text: Raw input text.
        lang: "ger" uses the German model and stop words; any other value
            uses the English ones.

    Returns:
        The texts of all kept tokens joined by single spaces. A token is
        kept if it is neither a stop word nor found in ``string.punctuation``
        and its part-of-speech tag is PROPN, ADJ or NOUN.
    """
    wanted_pos = ('PROPN', 'ADJ', 'NOUN')

    # Stop-word list of the selected language model.
    language_model = de_nlp if lang == "ger" else eng_nlp
    stop_words = language_model.Defaults.stop_words

    # Step 1: drop whitespace-separated chunks that are only one character long.
    # Every kept chunk is followed by a single space (including the last one).
    filtered = "".join(chunk + " " for chunk in text.split() if len(chunk) > 1)

    # Step 2: tokenize the lower-cased text.
    doc = ner_output(filtered.lower(), lang)

    # Steps 3 and 4: skip stop words and punctuation, then keep only tokens
    # whose part-of-speech tag is one of the wanted ones.
    keywords = [
        token.text
        for token in doc
        if token.text not in stop_words
        and token.text not in punctuation
        and token.pos_ in wanted_pos
    ]

    # Step 5: return the keywords as a single string.
    return " ".join(keywords)


In [44]:
# Getting dates from Raw Text
# -> Run dates_weighted on string from extract_batch


# Exclude dates from list before a certain time
def exclude_date(year_list, date_base=2020):
    """Keep only the years that lie after a cut-off year.

    Args:
        year_list: Iterable of years; every entry is converted with int()
            for the comparison.
        date_base: Cut-off year. Only years strictly greater than it are
            kept. Defaults to 2020, the value that used to be hard-coded.

    Returns:
        The kept entries in their original order, or ``["No Year"]`` if no
        entry is left.
    """
    recent_years = [year for year in year_list if int(year) > date_base]
    # An empty result is replaced by a placeholder entry.
    return recent_years if recent_years else ["No Year"]

#get list of dates from string
def get_dates(text):
    """Collect the dates that datefinder finds in a text.

    Args:
        text: Text to search.

    Returns:
        Tuple ``(years, dates)`` of two parallel lists: the year of every
        match, and a ``[day, month]`` list for every match.
    """
    found = list(datefinder.find_dates(text))
    years = [hit.year for hit in found]
    dates = [[hit.day, hit.month] for hit in found]
    return years, dates

# Generate DF that has only unique values and how often they occur
# exclude: exclude years before the one given in the exclude date function (atm hardset on 2020)
# get_all: If True, return the weighted Year and Date df's rather than the string

def dates_weighted(text, exclude=True, get_all = False):
    """Determine the most frequently mentioned date in a text.

    Day/month pairs and years are counted separately; the most frequent
    pair is combined with the most frequent year.

    Args:
        text: Text to search for dates (passed to get_dates).
        exclude: If true, the years are filtered with exclude_date first.
        get_all: If true, return the two frequency tables instead of a string.

    Returns:
        If ``get_all`` is false: the string ``"<day>.<month>.<year>"``. When
        exclude_date filtered out every year, the year part is its
        placeholder entry.
        If ``get_all`` is true: ``(date_df, year_df)``, DataFrames with the
        columns ``dates``/``weights`` and ``years``/``weights``, most
        frequent value first.

    Raises:
        IndexError: If ``get_all`` is false and no date was found.
    """
    from collections import Counter  # local import: only needed here

    year_list, date_list = get_dates(text)

    if exclude:
        year_list = exclude_date(year_list)

    # [day, month] lists are unhashable, so they are counted as tuples.
    # most_common() orders by count and keeps first-seen order for ties.
    date_counts = Counter(tuple(entry) for entry in date_list).most_common()
    year_counts = Counter(year_list).most_common()

    if get_all:
        date_df = pd.DataFrame({
            'dates': [list(pair) for pair, _ in date_counts],
            'weights': [count for _, count in date_counts],
        })
        year_df = pd.DataFrame({
            'years': [year for year, _ in year_counts],
            'weights': [count for _, count in year_counts],
        })
        return date_df, year_df

    if not date_counts:
        # Same exception type as the former out-of-bounds lookup, with a clear message.
        raise IndexError("no dates found in text")

    l_date = date_counts[0][0]
    l_year = year_counts[0][0]
    return f"{l_date[0]}.{l_date[1]}.{l_year}"





In [45]:
# Find website addresses in a text; expected input: a keyword string

def get_web_adresses(text):
    """Find web addresses that start with "www" in a text.

    Args:
        text: Text to search.

    Returns:
        The string "None" if nothing was found; otherwise a list of the
        unique addresses in order of first appearance, each prefixed with
        "www.".
    """
    # Raw string: the pattern is unchanged, but \w, \- and \. no longer count
    # as invalid escape sequences of a normal string literal.
    # NOTE(review): the character right after "www" is an unescaped ".", so any
    # single character is accepted there (e.g. "wwwsticketcorner.ch" is
    # reported as "www.ticketcorner.ch"). Kept because existing results rely
    # on it; confirm whether this tolerance is intended.
    hosts = re.findall(r'www.([\w\-\.]+)', text)

    if not hosts:
        return "None"

    # dict.fromkeys removes duplicates and, unlike a set, keeps the order of
    # first appearance, so the result is deterministic.
    # The regex group excludes the prefix, so "www." is added back here.
    return ['www.' + host for host in dict.fromkeys(hosts)]



___________________

In [46]:
###Currently Unused

def get_trigramms(input):
    """Return all word trigrams of a text.

    The text is split on whitespace; every run of three consecutive words
    becomes one string with the words separated by single spaces.
    """
    words = input.split()
    return [
        str(first + ' ' + second + ' ' + third)
        for first, second, third in ngrams(words, 3)
    ]


def get_trigramm_batch(inputbatch):
    """Return the trigram list of every text in a batch, in the same order."""
    return [get_trigramms(text) for text in inputbatch]

In [47]:
#Debug-Display Functions:
def image_test(path, image_path, sol):
    """Debug helper: show every preprocessing variant of an image and OCR it.

    The image is processed with get_grayscale, remove_noise, dilate, erode,
    opening and canny (in that order). Each result is displayed, its OCR
    text is appended to ``sol`` and a separator line is printed.

    Args:
        path: Directory the image is stored in.
        image_path: File name of the image.
        sol: Text the OCR results are appended to.

    Returns:
        ``sol`` followed by the OCR text of all six variants.
    """
    base = open_local_img(path, image_path)

    # One loop instead of six copied blocks. The Canny step used to display
    # an undefined name (`rb_base`), which raised a NameError.
    for preprocess in (get_grayscale, remove_noise, dilate, erode, opening, canny):
        processed = preprocess(base)
        display(processed)
        sol = sol + pytesseract.image_to_string(processed)
        print('_______________________')

    return sol


def ner_display(sol, language="eng"):
    """Print the named entities of a text and render them with displaCy.

    Args:
        sol: Text to analyse.
        language: 'ger' uses the German model; any other value uses the
            English model (see ner_output).
    """
    # Both languages share the same steps, so the model choice is delegated
    # to ner_output instead of duplicating the branch.
    token_sol = ner_output(sol, language)

    for entity in token_sol.ents:
        # entity.text is the entity's string; the former `.ents` printed a
        # container of spans instead of the text.
        print('Text:', entity.text, '\tLabel:', entity.label_)

    displacy.render(token_sol, style="ent", jupyter=True)


In [48]:
# Extract the text of every file listed in img_list; the result is a list with one string per file.
test_batch = extract_local_batch(img_list, path)
   

In [49]:
# Show the raw extracted texts.
print(test_batch)

["NOV, 2 KANN NIGHT STATTFINDEN  NOVA ROGK 2021 CAN'T TAKE PLAGE  09.-12. JUNE  MORE INFO COMING SOON! —) bo SON OCK 2021 WA Ss KANN NICHT STATTFINDEN / L.  NOVA ROGK 2021 CAN'T TAKE PLAGE  09,-12. JUNE RN ee 4 SS WA ROCK 2021 CAN T TAKE PLACE Z \\ Zs,  09.-12. JUNE  ~ ETT” 4!  ya ha i Ali Ai  nova (7) z rock | S \\ NOVA ROCK 2021  SNS \\ KANN NICHT STATTFINDEN Af  <  Sew ome 09.-12. JUNE  MORE INFO COMING SOON!  4 a Ps , &  = | BOUEeTErteney | KCNA UICH SOTTO  (UNE [ev Uo Ie m jp cil en comman lh» <  oa  ", 'BENNY AND ALYSSA’S  LIVE MUSIC + FOOD + NOODLES  123 WESTDALE CRESCENT, SAN FRANCISCO CA SUMMER LOVE  POOL ONAN  BEMMY AND ALYSSAS  LIVE MUSIK « FOOO + MOOOL ES  123 WESTOALE CRESCENT, SAN PRAMCISCO CA BENNY AND ALYSSA’S  SUM IMEI IONE!  PARTY  WEVEGOT J a  na ISIC © ROOD + NOOBLES  o fo ee a ge JULY 12, 2O2O 3a 3, - 123 WESTDALE CRESCENT, SAN FRANCISEO CA ° : . ', 'DONNERSTAG 16. JUNI 2022  THE HIV ES AIRBOURNE  STEEL PANTHER _—_ THUNDER 17. JUNI 2022  ia! —  Sa 2 ae  ‘TRANSPORTP

In [50]:
# Keyword string of every extracted text (get_keywords with its default language).
key_words = [get_keywords(raw_text) for raw_text in test_batch]

print(key_words)

["nov night nova can't plage 09.-12. june more info coming soon son ock wa ss l. nova can't plage june ee ss wa rock can take june ett ai nova rock nova rock sns ome 09.-12. june more info coming soon ps boueeterteney kcna sotto une comman lh", 'benny and alyssa ’s music noodles westdale crescent san francisco ca onan bemmy and alyssas live musik fooo moool westoale crescent san pramcisco ca benny and alyssa ’s sum imei ione party wevegot isic rood noobles ee ge july westdale crescent san', 'donnerstag 16. juni airbourne panther thunder 17. juni transportpartner migros prasentiert donnerstag 16. juni airbourne panther thunder ema foreigner bush jn monster megawatt alice cooper black label society accept storace dirty hinwil more soon autobahnkreisel hauptpartner medienpartner heineken cco-partner www.rockthering.ch wwwsticketcorner.ch arieettieet ce rey panther thunder dunner stag cal leas the hives airbourne panther thunder bush ee monster megawatt alice cooper black label society acc

In [51]:

# For every keyword string print the text, the (years, dates) lists found in it
# and the most likely date.
for keywords in key_words:
    print(keywords)
    print('_______________')
    print(get_dates(keywords))
    print('_______________')

    try:
        print(dates_weighted(keywords))
    except Exception:
        # Best effort: print a placeholder when no date can be derived.
        # `Exception` instead of a bare except, so that KeyboardInterrupt
        # can still stop the cell.
        print("None")
    print('_______________')

nov night nova can't plage 09.-12. june more info coming soon son ock wa ss l. nova can't plage june ee ss wa rock can take june ett ai nova rock nova rock sns ome 09.-12. june more info coming soon ps boueeterteney kcna sotto une comman lh
_______________
([2009, 2022, 2022, 2009], [[12, 6], [14, 6], [14, 6], [12, 6]])
_______________
12.6.2022
_______________
benny and alyssa ’s music noodles westdale crescent san francisco ca onan bemmy and alyssas live musik fooo moool westoale crescent san pramcisco ca benny and alyssa ’s sum imei ione party wevegot isic rood noobles ee ge july westdale crescent san
_______________
([2022], [[14, 7]])
_______________
14.7.2022
_______________
donnerstag 16. juni airbourne panther thunder 17. juni transportpartner migros prasentiert donnerstag 16. juni airbourne panther thunder ema foreigner bush jn monster megawatt alice cooper black label society accept storace dirty hinwil more soon autobahnkreisel hauptpartner medienpartner heineken cco-partner

In [52]:
# Print the web addresses found in each keyword string (the string "None" if there are none).
for element in key_words:
    print(get_web_adresses(element))
    


None
None
['www.ticketcorner.ch', 'www.rockthering.ch']
['www.xythinkzclub.com']
None


In [65]:
# for x in range(len(test_batch)):
#     ner_display(test_batch[x], 'eng')
#     print('__________')

# for x in range(len(key_words)):
#     ner_display(key_words[x].title(), 'eng')
#     print('__________')    


# excepting list
def get_artist(input):
    """Collect the organisations and persons that are mentioned in a text.

    The text is analysed with the English spaCy model; entities labelled
    ORG or PERSON are kept.

    Args:
        input: Text to analyse.

    Returns:
        List of unique entity texts in order of first appearance.
    """
    entities = []

    for entity in eng_nlp(input).ents:
        # entity.text is the entity's own string; this replaces the former
        # detour via str(entity.ents[0]).
        if entity.label_ in ("ORG", "PERSON") and entity.text not in entities:
            entities.append(entity.text)

    return entities

# for x in range(len(test_batch)):
#     get_artist(test_batch[x])
#     get_artist(key_words[x].upper())
#     get_artist(key_words[x].title())
#     print('__________')

In [54]:
# for x in range(len(test_batch)):
#     ner_display(test_batch[x], 'ger')
#     print('__________')

In [55]:
# text = 'Im Juli gehe ich in die Schule neben der Maimarkthalle in MANNHEIM .'
# bsp = 'SA.UC.1E. ie ee IMARKTHALLE MANNHEIM , we UICTLY: HARDCORE - HARDSTYLE- HARDTECHNO C012 sh 0242-1 MARKTHALLE MANNHEIM \ 20 -OGURR gilZoealld SN. Hh eae NAIDOO > SN ABN canes newer a ee ARO BORE - HARDSTYLE-HARDVECHIO a 4fm ~ og AE . oe ~ Wy NX { Lo Ane By a <7 eee ood Ni wa are 2 é i , a g a8, '
# #

# ner_display(bsp, "ger")
# ner_display(bsp, "eng")
# ner_display(text, "ger")
# ner_display(text, "eng")

_______________

In [56]:
# Build the trigram list of every extracted text and show the one of the first text.
trigramlist = get_trigramm_batch(test_batch)

print(trigramlist[0])

['NOV, 2 KANN', '2 KANN NIGHT', 'KANN NIGHT STATTFINDEN', 'NIGHT STATTFINDEN NOVA', 'STATTFINDEN NOVA ROGK', 'NOVA ROGK 2021', "ROGK 2021 CAN'T", "2021 CAN'T TAKE", "CAN'T TAKE PLAGE", 'TAKE PLAGE 09.-12.', 'PLAGE 09.-12. JUNE', '09.-12. JUNE MORE', 'JUNE MORE INFO', 'MORE INFO COMING', 'INFO COMING SOON!', 'COMING SOON! —)', 'SOON! —) bo', '—) bo SON', 'bo SON OCK', 'SON OCK 2021', 'OCK 2021 WA', '2021 WA Ss', 'WA Ss KANN', 'Ss KANN NICHT', 'KANN NICHT STATTFINDEN', 'NICHT STATTFINDEN /', 'STATTFINDEN / L.', '/ L. NOVA', 'L. NOVA ROGK', 'NOVA ROGK 2021', "ROGK 2021 CAN'T", "2021 CAN'T TAKE", "CAN'T TAKE PLAGE", 'TAKE PLAGE 09,-12.', 'PLAGE 09,-12. JUNE', '09,-12. JUNE RN', 'JUNE RN ee', 'RN ee 4', 'ee 4 SS', '4 SS WA', 'SS WA ROCK', 'WA ROCK 2021', 'ROCK 2021 CAN', '2021 CAN T', 'CAN T TAKE', 'T TAKE PLACE', 'TAKE PLACE Z', 'PLACE Z \\', 'Z \\ Zs,', '\\ Zs, 09.-12.', 'Zs, 09.-12. JUNE', '09.-12. JUNE ~', 'JUNE ~ ETT”', '~ ETT” 4!', 'ETT” 4! ya', '4! ya ha', 'ya ha i', 'ha i Ali', 'i A

In [57]:
# Test the URL-based extraction with a single image URL.
urls = ["https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSuPmO_5fHMKY2NOczsNDO3pDQH4YLfMeXFiQ&usqp=CAU"]

print(extract_url_batch(urls))

['a 3 JEDER LONGDRINK NOR 5€ FREITAG 4 792071 ‘EXCESS BAR G LOUNGE, 5 a, % ; JEDEB LONGDRINK NOR SE FREITAG 4: a EXCESS BAR LOONGE ']


In [66]:
    
def process_url(input, all=True):
    """Build a summary dictionary (keywords, date, websites, names) for one text.

    NOTE(review): despite its name, the function currently treats ``input``
    as already extracted raw text; the OCR call on the URL is commented out
    and ``all`` is therefore unused. Confirm before re-enabling it.

    Args:
        input: Stored under "URL" and used as the raw text.
        all: Currently unused (only referenced by the disabled OCR call).

    Returns:
        Dict with the keys "URL", "raw_text", "key_words", "likely_date"
        ("No Date Found" if no date could be derived), "websites" and
        "features".
    """
    url_dict = {"URL": input}

    url_dict["raw_text"] = input
    # Disabled OCR step (the original comment referenced an undefined name `url`):
    #url_dict["raw_text"] = image_extract(open_url(input), sol='', all=all)
    url_dict["key_words"] = get_keywords(url_dict["raw_text"])

    try:
        url_dict["likely_date"] = dates_weighted(url_dict["key_words"])
    except Exception:
        # Best effort: fall back to a placeholder when no date can be derived.
        # `Exception` instead of a bare except keeps KeyboardInterrupt and
        # SystemExit from being swallowed.
        url_dict["likely_date"] = "No Date Found"

    url_dict["websites"] = get_web_adresses(url_dict["key_words"])

    # Title-cased keywords are passed on for the entity lookup.
    url_dict["features"] = get_artist(url_dict["key_words"].title())

    return url_dict


# Print the summary dictionary of every extracted text, followed by a separator line.
for raw_text in test_batch:
    print(process_url(raw_text))
    print("_______________")

{'URL': "NOV, 2 KANN NIGHT STATTFINDEN  NOVA ROGK 2021 CAN'T TAKE PLAGE  09.-12. JUNE  MORE INFO COMING SOON! —) bo SON OCK 2021 WA Ss KANN NICHT STATTFINDEN / L.  NOVA ROGK 2021 CAN'T TAKE PLAGE  09,-12. JUNE RN ee 4 SS WA ROCK 2021 CAN T TAKE PLACE Z \\ Zs,  09.-12. JUNE  ~ ETT” 4!  ya ha i Ali Ai  nova (7) z rock | S \\ NOVA ROCK 2021  SNS \\ KANN NICHT STATTFINDEN Af  <  Sew ome 09.-12. JUNE  MORE INFO COMING SOON!  4 a Ps , &  = | BOUEeTErteney | KCNA UICH SOTTO  (UNE [ev Uo Ie m jp cil en comman lh» <  oa  ", 'raw_text': "NOV, 2 KANN NIGHT STATTFINDEN  NOVA ROGK 2021 CAN'T TAKE PLAGE  09.-12. JUNE  MORE INFO COMING SOON! —) bo SON OCK 2021 WA Ss KANN NICHT STATTFINDEN / L.  NOVA ROGK 2021 CAN'T TAKE PLAGE  09,-12. JUNE RN ee 4 SS WA ROCK 2021 CAN T TAKE PLACE Z \\ Zs,  09.-12. JUNE  ~ ETT” 4!  ya ha i Ali Ai  nova (7) z rock | S \\ NOVA ROCK 2021  SNS \\ KANN NICHT STATTFINDEN Af  <  Sew ome 09.-12. JUNE  MORE INFO COMING SOON!  4 a Ps , &  = | BOUEeTErteney | KCNA UICH SOTTO  (U