# Beschaffung des Wissens für den Chatbot-Prototyp

Bevor mit dem eigentlichen Chatbot begonnen werden kann, müssen die Inforamtionen auf deren Basis der Chatbot arbeiten soll beschafft und gespeichter werden. Im Rahmen der Arbeit werden dafür die Informationen wie Kontaktdaten und Forschungsschwerpunkte duech Web Scraping extrahiert und in eine sqlite Datenbank gespeichert. 

### Import der Bibliotheken 

In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import time
from tqdm import tqdm

### Beschaffung von Namen der HHN Professoren 

Es werden Namen von HHN Professoren aus Websites gescraped und in eine Liste gespeichter, sodass diese spätter in Iterationen gescraped werden können. 

In [2]:
# get the names of the profs in the html. output is a list eith names
def get_prof_names():
    url = "https://www.meinprof.de/unis/baden-wuerttemberg/hs-heilbronn"
    page = requests.get(url).text
    doc = BeautifulSoup(page, "html.parser")
    
    names = []
    
    for i in range(284):
        a = doc.find_all(class_ = "bold")
        name = a[i].find("a").contents
        names.append(name)
    
    flat_list = [item for sublist in names for item in sublist]
    
    for i in range(len(flat_list)):
        flat_list[i] = str(flat_list[i])
    
    #remove profs with one name
    profs = []
    for i in range(len(flat_list)):
        if "," in flat_list[i]:
            profs.append(flat_list[i])
    return profs

In [3]:
# calling the function
names = get_prof_names()

In [4]:
# change the names so that they can be palced at the end of the url
def transform_name(list):
    prof_name_list = []
    for i in range(len(list)):
        prof_name = list[i]
        prof_name = str(prof_name)
        words = prof_name.split()
        words = words[1], words[0]
        ## joining the words and printing
        prof_name = " ".join(words).strip()
        prof_name = prof_name.replace(",","")
        prof_name = prof_name.replace(" ", ".")
        prof_name_list.append(prof_name)
    return prof_name_list

In [5]:
prof_names = transform_name(names) 

In [6]:
# manually created list with professor names
more_prof_names = ["carsten.lanquillon", "detlef.stern", "jochen.guenther", "helmut.beckmann", "thomas.schaeffer", "sonja.salmen", "ted.azarmi", "katharina.sperl", "rektor", "ulrich.brecht", "ruth.fleuchaus", "raoul.zoellner","christoph.schwerdtfeger"]

In [7]:
# concatenate the big list and the small manual list
all_prof_names=prof_names+more_prof_names
all_prof_names

['Christopher.Adams',
 'Uwe.Ahrens',
 'Tobias.Albrecht',
 'Wolfgang.Albrecht',
 'Martin.Alles',
 'Roland.Alter',
 'Patrick.Balve',
 'Frank.Bantle',
 'Jonathan.Bate',
 'Ralf.Bäuchl',
 'Hans.Bauer',
 'Hans.Bauer',
 'Rainer.Bayer',
 'Helmut.Beckmann',
 'Peter.Bender',
 'Rolf.Bendl',
 'Tomas.Benz',
 'Bleyel.Bernd',
 'Tobias.Bernecker',
 'Thomas.Bezold',
 'Axel.Birk',
 'Lutz.Blecher',
 'Peter.Blessing',
 'Rolf.Blumentritt',
 'Ralf.Bochert',
 'Bernd.Bracht',
 'Bernd.Bracht',
 'Terry.Braemer',
 'David.Braun',
 'Ulrich.Brecht',
 'Birgit.Brigl',
 'Georg.Bucher',
 'Jasmin.Buck',
 'Georg.Bühler',
 'Uwe.Burk',
 'August.Burr',
 'Franzi.Buscholl',
 'Constanze.Clarke',
 'Georg.Clauß',
 'Andreas.Daberkow',
 'Maren-Raphaela.Dechant',
 'Hartmut.Dickhaus',
 'Michael.Dietzel',
 'Ralf.Dillerup',
 'Heiko.Dirks',
 'Juergen.Doneit',
 'Babette.Dorner',
 'Manfred.Dorsch',
 'Michael.Drach',
 'Daniela.Eisele',
 'Wolfgang.Elmendorf',
 'Frank.Engel',
 'Anja.Engelmann',
 'Wolfgang.Ernst',
 'Simon.Fauser',
 'Uwe.Fede

### Funktion für Kontaktdaten der Professoren 

In [8]:
def get_prof_info(list):
    title_list = []
    first_name_list = []
    last_name_list = []
    telephone_list = []
    email_list = []
    office_list = []
    postal_address_list = []
    prof_id_list = []
    
    for prof in list:
        ### get prof_id ###
        prof_id = list.index(prof)+1
        prof_id_list.append(prof_id)
        
        url = "https://www.hs-heilbronn.de/"+prof
        result = requests.get(url).text
        doc = BeautifulSoup(result, "html.parser")
        
        ### get title, first_name, last_name ### 
        full_name = doc.find(class_="mt-0 mb-3 p-0 sal-name").string
        
        # first_name
        first_name = full_name.split()[-2]
        first_name_list.append(first_name)
        
        # last_name
        last_name = full_name.split()[-1]
        last_name_list.append(last_name)
                
        # title
        title = full_name.replace(first_name, "")
        title = title.replace(last_name, "")
        title_list.append(title)
        
        ### get phone number ###
        telephone = doc.find(class_="mb-2").find(class_="color-teal").string
        telephone_list.append(telephone)
        
        ### get omly the first email ###
        email = doc.find_all(class_="mb-2")[1].find(class_="color-teal").string
        email_list.append(email)
        
        ### get office ###
        if "@" in doc.find_all(class_="mb-2")[2].span.string:
        # go to the next item to get the office
            office = doc.find_all(class_="mb-2")[3].span.string
        else:
            office = doc.find_all(class_="mb-2")[2].span.string
        office_list.append(office)
        
        ### get postal_address ###
        if len(doc.find_all(class_="mb-2")[3].span.string) < 10:
            postal_address = doc.find_all(class_="mb-2")[4].span.string
        else:
            postal_address = doc.find_all(class_="mb-2")[3].span.string
        postal_address_list.append(postal_address)
        
    df = pd.DataFrame({'prof_id': prof_id_list, 'title': title_list, 'first_name': first_name_list, 'last_name': last_name_list, 'telephone': telephone_list, 'email': email_list, 'office': office_list, 'postal_address': postal_address_list})

    return df

In [9]:
# execute the function
prof_info_df = get_prof_info(more_prof_names)

In [10]:
prof_info_df

Unnamed: 0,prof_id,title,first_name,last_name,telephone,email,office,postal_address
0,1,Prof. Dr.-Ing.,Carsten,Lanquillon,+49 7131 504 6942,carsten.lanquillon@hs-heilbronn.de,S.3.45 Bildungscampus,"Max-Planck-Str. 39, 74081 Heilbronn"
1,2,Prof. Dr.,Detlef,Stern,+49 7131 504 450,detlef.stern@hs-heilbronn.de,S.3.44 Bildungscampus,"Max-Planck-Str. 39, 74081 Heilbronn"
2,3,Prof. Dr.-Ing.,Jochen,Günther,+49 (0) 7131 504 6945,jochen.guenther@hs-heilbronn.de,S.3.44 Bildungscampus,"Max-Planck-Str. 39, 74081 Heilbronn"
3,4,Prof. Dr.,Helmut,Beckmann,+49 7131 504 514,helmut.beckmann@hs-heilbronn.de,S.3.47 Bildungscampus,"Max-Planck-Str. 39, 74081 Heilbronn"
4,5,Dipl.-Inf.,Thomas,Schäffer,+49 7131 504 6699,thomas.schaeffer@hs-heilbronn.de,S.3.46 Bildungscampus,"Max-Planck-Str. 39, 74081 Heilbronn"
5,6,Prof. Dr.,Sonja-Maria,Salmen,+49 7131 504 477,sonja.salmen@hs-heilbronn.de,S.3.43 Bildungscampus,"Max-Planck-Str. 39, 74081 Heilbronn"
6,7,Prof. Dr.,Ted,Azarmi,+49 7131 504 6697,ted.azarmi@hs-heilbronn.de,N413 Bildungscampus,"Max-Planck-Str. 39, 74081 Heilbronn"
7,8,Dr.,Katharina,Sperl,+49 7131 504 519,katharina.sperl@hs-heilbronn.de,N540 Bildungscampus,"Max-Planck-Str. 39, 74081 Heilbronn"
8,9,Prof. Dr.-Ing.,Oliver,Lenzen,+49 7131 504 201,rektor@hs-heilbronn.de,T.3.26 Bildungscampus,"Max-Planck-Str. 39, 74081 Heilbronn"
9,10,Prof. Dr.,Ulrich,Brecht,+49 7131 504 551,ulrich.brecht@hs-heilbronn.de,T.3.22 Bildungscampus,T.3.22 Bildungscampus


### Funktion für Forschungsschwerpunkte

In [11]:
def get_research_area(list):
    research_area_list = []
    prof_id_list = []
    last_name_list = []
    

    ### get research areas - Fachgebiete ###
    
    for prof in list:
        ran = True
        prof_id = list.index(prof)+1
        
        url = "https://www.hs-heilbronn.de/"+prof
        result = requests.get(url).text
        doc = BeautifulSoup(result, "html.parser")
        parent = doc.find_all(class_="text-left")
        
                # last_name
        full_name = doc.find(class_="mt-0 mb-3 p-0 sal-name").string
        last_name = full_name.split()[-1]
        

        
        for i in range(len(parent)):
            if parent[i].string =="Fachgebiete":
                if parent[i+1].find_all("li") != None:
                    child = parent[i+1].find_all("li")
                    for l in range(len(child)):
                        research_area_list.append(child[l].string)
                        prof_id_list.append(prof_id)
                        last_name_list.append(last_name)
            elif parent[i].string =="Forschungsschwerpunkte" or parent[i].string =="Forschungsgebiete":
                if parent[i+1].find_all("li") != None:
                    child = parent[i+1].find_all("li")
                    for l in range(len(child)):
                        research_area_list.append(child[l].string)
                        prof_id_list.append(prof_id)
                        last_name_list.append(last_name)
            else:
                #if doc.select('ul.vertical-list') != None:
                if ran:
                    for i in range(len(doc.select('ul.vertical-list'))):
                        for l in range(len(doc.select('ul.vertical-list')[i])):
                            research_area_list.append(doc.select('ul.vertical-list')[i].find_all("li")[l].string)
                            prof_id_list.append(prof_id)
                            last_name_list.append(last_name)
                            ran = False


    df = pd.DataFrame({'prof_id': prof_id_list,"Name":last_name_list, 'research_area': research_area_list})           
    #print(len(prof_id_list))
    #print(len(research_area_list))
    
    return df

In [12]:
prof_research_area_df = get_research_area(more_prof_names)

In [13]:
pd.set_option("display.max_rows", None, "display.max_columns", None)
prof_research_area_df

Unnamed: 0,prof_id,Name,research_area
0,1,Lanquillon,Business Intelligence
1,1,Lanquillon,Data Science
2,1,Lanquillon,Machine Learning
3,1,Lanquillon,Künstliche Intelligenz
4,1,Lanquillon,Big Data
5,1,Lanquillon,Business Intelligence
6,1,Lanquillon,Big Data Analytics
7,1,Lanquillon,Data Science
8,1,Lanquillon,Data Mining
9,1,Lanquillon,Text Mining


### Funktion für Studiengänge

In [14]:
def get_study(list):
    study_list = []
    prof_id_list = []
    last_name_list = []
    

    ### get research areas - Fachgebiete ###
    
    for prof in list:
        prof_id = list.index(prof)+1
        
        url = "https://www.hs-heilbronn.de/"+prof
        result = requests.get(url).text
        doc = BeautifulSoup(result, "html.parser")
        parent = doc.find_all(class_="text-left")
        
                # last_name
        full_name = doc.find(class_="mt-0 mb-3 p-0 sal-name").string
        last_name = full_name.split()[-1]
        
        for i in range(len(doc.find_all(class_ = "color-teal"))):
            study = doc.find_all(class_ = "color-teal")[i].string
            if "+" not in study and  "@" not in study:
                study_list.append(study)
                prof_id_list.append(prof_id)
                last_name_list.append(last_name)
        
    df = pd.DataFrame({'prof_id': prof_id_list,"Name":last_name_list, 'study': study_list})           
    #print(len(prof_id_list))
    #print(len(research_area_list))
    
    return df

In [25]:
prof_study_df = get_study(more_prof_names)

In [27]:
prof_study_df

Unnamed: 0,prof_id,Name,study
0,1,Lanquillon,Wirtschaftsinformatik
1,2,Stern,Wirtschaftsinformatik
2,2,Stern,Wirtschaftsinformatik - Informationsmanagement...
3,2,Stern,Wirtschaftsinformatik - Digitale Transformatio...
4,2,Stern,Fakultät Wirtschaft und Verkehr
5,3,Günther,Wirtschaftsinformatik
6,4,Beckmann,Wirtschaftsinformatik - Informationsmanagement...
7,4,Beckmann,Wirtschaftsinformatik
8,5,Schäffer,Wirtschaftsinformatik
9,6,Salmen,Wirtschaftsinformatik


### Dataframes in sqlite Datenbank schreiben 

In [28]:
import sqlite3

In [30]:
# create the tabels and write the dataframe into the tables
conn = sqlite3.connect("PROF_INFO_DB.db")
cur = conn.cursor()
prof_info_df.to_sql("PROF_INFO_TABLE", conn, index=False)
prof_research_area_df.to_sql("PROF_RESEARCH_AREA_TABLE", conn, index=False)
prof_study_df.to_sql("PROF_STUDY_TABLE", conn, index=False)
conn.close()

In [20]:
conn.close()

#### SQL Queries testen

In [37]:
conn = sqlite3.connect("PROF_INFO_DB.db")
cur = conn.cursor()
cur.execute("select research_area FROM PROF_RESEARCH_AREA_TABLE")
conn.commit()
rows = cur.fetchall()
conn.close()

#rows

In [40]:
list = []
for i in range(len(rows)):
    answer = " ".join(rows[i])
    answer.strip()
    list.append(answer)

In [41]:
list

['Business Intelligence',
 'Data Science',
 'Machine Learning',
 'Künstliche Intelligenz',
 'Big Data',
 'Business Intelligence',
 'Big Data Analytics',
 'Data Science',
 'Data Mining',
 'Text Mining',
 'Machine Learning',
 'Natural Language Processing',
 'Knowledge Graphs',
 '    Verteilte / dezentrale Kollaboration',
 '    Agile Methoden',
 'Softwarearchitektur',
 '    Softwaretechnik',
 '    Projektmanagement',
 '    Computer Supported Cooperative Work',
 '    Wirtschaftsinformatik ',
 'Digitale Transformation ',
 'Computer Supported Cooperative Work (CSCW)',
 'Informationsmanagement',
 'Business Intelligence',
 'Betriebliche Informationssysteme',
 'Wissenschafts- und Forschungsmethoden',
 'Daten- und Informationsqualität',
 'Überbetriebliches Stammdatenmanagement',
 'Datenökosysteme',
 'International Business',
 'Master International Business & Intercultural Management',
 'Master International Tourism Management',
 'Konstruktionslehre\xa0',
 'Maschinenelemente\xa0',
 'Festigkeitsle

In [337]:
SearchFor = "title, first_name, last_name"
condition = "Machine Learning"

In [346]:
conn = sqlite3.connect(r"C:\Users\Sebi\OneDrive\Studium\Chatbot_Prototype\PROF_INFO_DB.db")
cur = conn.cursor()
cur.execute("select " + SearchFor + " from PROF_INFO_TABLE inner join PROF_RESEARCH_AREA_TABLE on PROF_RESEARCH_AREA_TABLE.prof_id = PROF_INFO_TABLE.prof_id where research_area = ?", (condition,))
conn.commit()
rows = cur.fetchall()
conn.close()

In [358]:
# convert tupl to string
answer = " ".join(rows[0])

In [359]:
answer

'Prof. Dr.-Ing.   Carsten Lanquillon'

In [360]:
answer_to_user = answer + " is an expert in Machine Learning "

In [361]:
answer_to_user

'Prof. Dr.-Ing.   Carsten Lanquillon is an expert in Machine Learning '

### Testing Code

In [257]:
url = "https://www.hs-heilbronn.de/carsten.lanquillon"
result = requests.get(url).text
doc = BeautifulSoup(result, "html.parser")

In [258]:
doc.prettify()

'<!DOCTYPE html>\n<html lang="de">\n <head>\n  <meta charset="utf-8"/>\n  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>\n  <meta content="width=device-width, initial-scale=1" name="viewport"/>\n  <meta content="Scrivito by Infopark AG (scrivito.com)" name="generator"/>\n  <title data-react-helmet="true">\n   Prof. Dr.-Ing. Carsten Lanquillon\n  </title>\n  <meta content="summary_large_image" data-react-helmet="true" name="twitter:card"/>\n  <meta content="200" data-react-helmet="true" name="prerender-status-code"/>\n  <meta content="article" data-react-helmet="true" property="og:type"/>\n  <meta content="https://www.hs-heilbronn.de/carsten.lanquillon" data-react-helmet="true" property="og:url"/>\n  <meta content="Fachgebiete\nBusiness IntelligenceData ScienceMachine LearningKünstliche IntelligenzBig Data\nForschungsschwerpunkte\nBusiness..." data-react-helmet="true" name="twitter:description"/>\n  <meta content="https://cdn0.scrvt.com/5b9bbd140a15e188780a6244ebe572d4/e0680f718

In [265]:
for i in range(len(doc.find_all(class_ = "color-teal"))):
    study = doc.find_all(class_ = "color-teal")[i].string
    if "+" not in study and  "@" not in study:
        study.list_append(study)

Wirtschaftsinformatik


In [253]:
for i in range(len(doc.find_all(class_ ="h2 color-dark text-left"))):
    if doc.find_all(class_ ="h2 color-dark text-left")[i].string == "Forschungsgebiete":
        print(doc.find_all(class_ ="h2 color-dark text-left")[i].find_all("div"))



[]


In [225]:
#doc.find_all("ul")
doc.find("ul", attrs={"class" :"vertical-list"})

In [226]:
doc.select('ul.vertical-list')[1].find_all("li")[0].string

IndexError: list index out of range

In [188]:
for i in range(len(doc.select('ul.vertical-list'))):
            for l in range(len(doc.select('ul.vertical-list')[i])):
                        print(doc.select('ul.vertical-list')[i].find_all("li")[l].string)

International Business
Master International Business & Intercultural Management
Master International Tourism Management


In [14]:
parent = doc.find_all(class_="text-left")
for i in range(len(parent)):


    if parent[i].string =="Fachgebiete":
         if parent[i+1].find_all("li") != None:
             child = parent[i+1].find_all("li")
               for l in range(len(child)):
                   fg.append(child[l].string)
              #print(fg)                    
    elif parent[i].string =="Forschungsschwerpunkte" or parent[i].string =="Forschungsgebiete": 
        if parent[i+1].find_all("li") != None:
            child = parent[i+1].find_all("li")
            for l in range(len(child)):
                fs.append(child[l].string)
            #print(fg)

        else: 
            child = parent[i+2].find_all("li")
            for l in range(len(child)):
                fs.append(child[l].string)
            #print(fg)

IndentationError: unexpected indent (<ipython-input-14-197319b50a0f>, line 8)