In [4]:
import os
import time

import pandas as pd
import psycopg2

In [6]:
# Connection settings are taken from the standard libpq environment variables
# when they are set; the literals are only fallbacks for the original local setup.
# NOTE(review): the fallback password is still committed here - remove it once
# PGPASSWORD is configured wherever this notebook runs.
conn = psycopg2.connect(
    database=os.environ.get("PGDATABASE", "spezialthema"),
    user=os.environ.get("PGUSER", "kevin"),
    password=os.environ.get("PGPASSWORD", "12345678"),
)

In [4]:
def print_full(x):
    """Print ``x`` with pandas display truncation switched off.

    Rows, columns and column width are unlimited, the console width is set to
    2000 characters and floats are rendered as ``{:20,.2f}`` while printing.

    The options are applied through ``pd.option_context`` so that whatever
    values were active before the call are put back afterwards - also when
    ``print`` raises - instead of being reset to the pandas defaults.

    Args:
        x: Object to print, typically a DataFrame or Series.
    """
    with pd.option_context(
        'display.max_rows', None,
        'display.max_columns', None,
        'display.width', 2000,
        'display.float_format', '{:20,.2f}'.format,
        'display.max_colwidth', None,
    ):
        print(x)

In [4]:
# List the hospital names that occur with more than one distinct city,
# i.e. the names for which hospital_name -> city does not hold.
with conn:
    with conn.cursor() as cur:
        cur.execute('select hospital_name from hospitals group by hospital_name HAVING COUNT (DISTINCT city) > 1;')
        print(cur.fetchall())
        
        

[('Blue Cross Clinic',), ('Imagery Health',), ('Open Clinic',), ('Van Holsen Community Hospital',)]


In [19]:
def create():
    """Create the demo table ``test`` (columns A-D) and insert four sample rows.

    Uses the notebook-level connection ``conn``; the CREATE and all INSERTs
    run inside one ``with conn`` block.
    """
    sample_rows = (
        ['a1', 'b1', 'c1', 'd2'],
        ['a2', 'b2', 'c1', 'd1'],
        ['a1', 'b1', 'c2', 'd1'],
        ['a3', 'b1', 'c1', 'd1'],
    )
    insert_stmt = 'INSERT INTO test(A, B, C, D) values(%s, %s, %s, %s);'

    with conn, conn.cursor() as cursor:
        cursor.execute('CREATE TABLE test (A varchar(2), B varchar(2), C varchar(2),D varchar(2))')
        for row in sample_rows:
            cursor.execute(insert_stmt, row)

In [22]:
# Rebuild the demo table from scratch. IF EXISTS keeps this cell runnable on a
# database where `test` has not been created yet (plain DROP TABLE would fail).
with conn:
    with conn.cursor() as cur:
        cur.execute('DROP TABLE IF EXISTS test;')

create()

In [7]:
# Notebook-level cursor used by the ad-hoc query cells that call cur.execute(...).
cur = conn.cursor()

In [2]:
def sql(sql: str, params=None):
    """Run a query on the notebook connection and return the result as a DataFrame.

    Args:
        sql: SQL text to execute.
        params: Optional bound parameters, forwarded to ``pandas.read_sql``.
            Lets callers pass values separately instead of formatting them
            into the SQL string. Defaults to ``None`` (no parameters).

    Returns:
        The DataFrame produced by ``pandas.read_sql`` for the query.
    """
    return pd.read_sql(sql, conn, params=params)

A -> B: Wenn zwei Tupel den gleichen Wert in A haben, müssen sie auch den gleichen Wert in B haben.

In [83]:
# Display the complete contents of the demo table.
sql('select * from test;')

Unnamed: 0,a,b,c,d
0,a1,b1,c1,d2
1,a2,b2,c1,d1
2,a1,b1,c2,d1
3,a3,b1,c1,d1


Es gilt A -> B, aber nicht B -> A.

In [39]:
# Cross product of the table with itself: every ordered pair of rows (T1, T2).
sql('SELECT * FROM test T1, test T2;')

Unnamed: 0,a,b,c,d,a.1,b.1,c.1,d.1
0,a1,b1,c1,d2,a1,b1,c1,d2
1,a1,b1,c1,d2,a2,b2,c1,d1
2,a1,b1,c1,d2,a1,b1,c2,d1
3,a1,b1,c1,d2,a3,b1,c1,d1
4,a2,b2,c1,d1,a1,b1,c1,d2
5,a2,b2,c1,d1,a2,b2,c1,d1
6,a2,b2,c1,d1,a1,b1,c2,d1
7,a2,b2,c1,d1,a3,b1,c1,d1
8,a1,b1,c2,d1,a1,b1,c1,d2
9,a1,b1,c2,d1,a2,b2,c1,d1


In [8]:
# Violations of a -> b via a self-join: pairs of rows that agree on a but
# differ on b. An empty result means no pair violates the dependency.
# perf_counter() is the clock intended for measuring short intervals;
# only the execute() call is timed, not the fetch.
starttime = time.perf_counter()
cur.execute('SELECT * FROM test t1, test t2 WHERE t1.a = t2.a AND t1.b <> t2.b')
print(time.perf_counter() - starttime)
print(cur.fetchall())

# Same check for a -> c (not timed).
cur.execute('SELECT * FROM test t1, test t2 WHERE t1.a = t2.a AND t1.c <> t2.c')
print(cur.fetchall())

0.0026466846466064453
[]
[('a1', 'b1', 'c1', 'd2', 'a1', 'b1', 'c2', 'd1'), ('a1', 'b1', 'c2', 'd1', 'a1', 'b1', 'c1', 'd2')]


In [62]:
# Print the untruncated query plan for the self-join violation check.
print_full(sql('EXPLAIN SELECT * FROM test t1, test t2 WHERE t1.a = t2.a AND t1.b <> t2.b'))

                                                           QUERY PLAN
0                        Hash Join  (cost=1.07..2.18 rows=2 width=24)
1                            Hash Cond: ((t1.a)::text = (t2.a)::text)
2                         Join Filter: ((t1.b)::text <> (t2.b)::text)
3          ->  Seq Scan on test t1  (cost=0.00..1.03 rows=3 width=12)
4                         ->  Hash  (cost=1.03..1.03 rows=3 width=12)
5          ->  Seq Scan on test t2  (cost=0.00..1.03 rows=3 width=12)


In [57]:
# Violations of a -> b via grouping: values of a that occur with more than one
# distinct b. An empty result means the dependency holds.
# perf_counter() is the clock intended for measuring short intervals;
# only the execute() call is timed, not the fetch.
starttime = time.perf_counter()
cur.execute('SELECT a FROM test GROUP BY a HAVING COUNT(DISTINCT b) > 1;')
print(time.perf_counter() - starttime)
print(cur.fetchall())

# Same check for a -> c (not timed).
cur.execute('SELECT a FROM test GROUP BY a HAVING COUNT(DISTINCT c) > 1;')
print(cur.fetchall())

0.000698089599609375
[]
[('a1',)]


In [63]:
# Print the untruncated query plan for the GROUP BY / HAVING violation check.
print_full(sql('EXPLAIN SELECT a FROM test GROUP BY a HAVING COUNT(DISTINCT b) > 1;'))

                                                       QUERY PLAN
0                GroupAggregate  (cost=1.05..1.10 rows=1 width=3)
1                                                    Group Key: a
2                                 Filter: (count(DISTINCT b) > 1)
3                      ->  Sort  (cost=1.05..1.06 rows=3 width=6)
4                                                     Sort Key: a
5          ->  Seq Scan on test  (cost=0.00..1.03 rows=3 width=6)


In [51]:
# Violations of a -> b via a correlated EXISTS: rows of t1 for which some row
# t2 has the same a but a different b.
# The inner comparison has to be t1.b against t2.b; comparing t2.b with itself
# can never be true under NOT, so the query would always return nothing.
starttime = time.perf_counter()
cur.execute('''SELECT * FROM test AS t1 WHERE EXISTS ( SELECT * FROM test AS t2 WHERE (t1.a = t2.a AND NOT t1.b = t2.b))''')
print(time.perf_counter() - starttime)
print(cur.fetchall())

0.0005586147308349609
[]


In [72]:
# Look up the column names recorded for table `test` in information_schema.
sql('''select column_name from information_schema.columns where table_name = 'test';''')

Unnamed: 0,column_name
0,a
1,b
2,c
3,d


In [9]:
def func_dep(tname: str):
    """Print every single-column functional dependency X -> Y that holds in a table.

    For each ordered pair of distinct columns (X, Y) the table is grouped by X;
    a group with more than one distinct Y value violates X -> Y, so an empty
    query result means the dependency holds and it is printed.

    The table contents are printed first. All queries run against ``tname``
    (earlier the table name ``test`` was hard-coded in the checks), and table
    and column names are quoted as SQL identifiers rather than pasted into
    the statement text.

    Args:
        tname: Unqualified name of the table to inspect.
    """
    # Local import under an alias: this notebook defines its own `sql()` helper.
    from psycopg2 import sql as pgsql

    table = pgsql.Identifier(tname)
    violations = pgsql.SQL(
        'SELECT {lhs} FROM {table} GROUP BY {lhs} HAVING COUNT(DISTINCT {rhs}) > 1;'
    )

    with conn:
        with conn.cursor() as cur:
            # ordinal_position gives the columns in table-definition order.
            cur.execute(
                'SELECT column_name FROM information_schema.columns '
                'WHERE table_name = %s ORDER BY ordinal_position;',
                (tname,),
            )
            cols = [row[0] for row in cur.fetchall()]

            print(sql(pgsql.SQL('select * from {};').format(table).as_string(conn)))

            for i in cols:
                for j in cols:
                    if i == j:
                        continue
                    cur.execute(violations.format(
                        lhs=pgsql.Identifier(i),
                        rhs=pgsql.Identifier(j),
                        table=table,
                    ))
                    # No violating group -> the dependency i -> j holds.
                    if not cur.fetchall():
                        print('Funktionale Abhängigkeit gefunden: ', i, ' -> ', j)


func_dep('test')

    a   b   c   d
0  a1  b1  c1  d2
1  a2  b2  c1  d1
2  a1  b1  c2  d1
3  a3  b1  c1  d1
Funktionale Abhängigkeit gefunden:  a  ->  b


In [56]:
def func_dep_unclear(tname: str):
    """Print the degree to which each single-column dependency X -> Y holds.

    The table is grouped by X. The degree is the number of rows that lie in
    groups with exactly one distinct Y value, divided by the number of rows
    counted over all groups. 1.0 means X -> Y holds exactly.

    The table contents are printed first; for every pair the raw
    ``(group size, distinct Y count)`` rows and the resulting degree are
    printed. All queries run against ``tname`` (earlier ``test`` was
    hard-coded), with table and column names quoted as SQL identifiers.

    Args:
        tname: Unqualified name of the table to inspect.
    """
    # Local import under an alias: this notebook defines its own `sql()` helper.
    from psycopg2 import sql as pgsql

    table = pgsql.Identifier(tname)
    group_stats = pgsql.SQL(
        'SELECT COUNT({lhs}),COUNT(DISTINCT {rhs}) FROM {table} GROUP BY {lhs};'
    )

    with conn:
        with conn.cursor() as cur:
            cur.execute(
                'SELECT column_name FROM information_schema.columns '
                'WHERE table_name = %s ORDER BY ordinal_position;',
                (tname,),
            )
            cols = [row[0] for row in cur.fetchall()]

            print(sql(pgsql.SQL('select * from {};').format(table).as_string(conn)))

            for i in cols:
                for j in cols:
                    if i == j:
                        continue
                    cur.execute(group_stats.format(
                        lhs=pgsql.Identifier(i),
                        rhs=pgsql.Identifier(j),
                        table=table,
                    ))
                    res = cur.fetchall()

                    n = sum(size for size, _ in res)
                    consistent = sum(size for size, distinct in res if distinct == 1)
                    # With no counted rows there is nothing that could violate
                    # the dependency; report 1.0 instead of dividing by zero.
                    p = consistent / n if n else 1.0

                    print(res, i, j)
                    print('Funktionale Abhängigkeit: ', i, ' -> ', j, 'zu', p)


func_dep_unclear('test')

    a   b   c   d
0  a1  b1  c1  d2
1  a2  b2  c1  d1
2  a1  b1  c2  d1
3  a3  b1  c1  d1
[(2, 1), (1, 1), (1, 1)] a b
Funktionale Abhängigkeit:  a  ->  b zu 1.0
[(2, 2), (1, 1), (1, 1)] a c
Funktionale Abhängigkeit:  a  ->  c zu 0.5
[(2, 2), (1, 1), (1, 1)] a d
Funktionale Abhängigkeit:  a  ->  d zu 0.5
[(3, 2), (1, 1)] b a
Funktionale Abhängigkeit:  b  ->  a zu 0.25
[(3, 2), (1, 1)] b c
Funktionale Abhängigkeit:  b  ->  c zu 0.25
[(3, 2), (1, 1)] b d
Funktionale Abhängigkeit:  b  ->  d zu 0.25
[(3, 3), (1, 1)] c a
Funktionale Abhängigkeit:  c  ->  a zu 0.25
[(3, 2), (1, 1)] c b
Funktionale Abhängigkeit:  c  ->  b zu 0.25
[(3, 2), (1, 1)] c d
Funktionale Abhängigkeit:  c  ->  d zu 0.25
[(3, 3), (1, 1)] d a
Funktionale Abhängigkeit:  d  ->  a zu 0.25
[(3, 2), (1, 1)] d b
Funktionale Abhängigkeit:  d  ->  b zu 0.25
[(3, 2), (1, 1)] d c
Funktionale Abhängigkeit:  d  ->  c zu 0.25


In [55]:
# Let PostgreSQL compute the dependency degrees itself through extended statistics.
# IF NOT EXISTS keeps the cell re-runnable once s1 has been created.
cur.execute('CREATE STATISTICS IF NOT EXISTS s1 (dependencies) ON a,b,c,d FROM test')
cur.execute('ANALYZE test')
# Read only the statistics object created above; pg_stats_ext can contain
# entries for other statistics objects as well.
cur.execute("SELECT dependencies FROM pg_stats_ext WHERE statistics_name = 's1'")
print(cur.fetchall())

[('{"1 => 2": 1.000000, "1 => 3": 0.500000, "1 => 4": 0.500000, "2 => 1": 0.250000, "2 => 3": 0.250000, "2 => 4": 0.250000, "3 => 1": 0.250000, "3 => 2": 0.250000, "3 => 4": 0.250000, "4 => 1": 0.250000, "4 => 2": 0.250000, "4 => 3": 0.250000, "1, 2 => 3": 0.500000, "1, 2 => 4": 0.500000, "1, 3 => 2": 1.000000, "1, 3 => 4": 1.000000, "1, 4 => 2": 1.000000, "1, 4 => 3": 1.000000, "2, 3 => 1": 0.500000, "2, 3 => 4": 0.500000, "2, 4 => 1": 0.500000, "2, 4 => 3": 0.500000, "3, 4 => 1": 0.500000, "3, 4 => 2": 0.500000, "1, 2, 3 => 4": 1.000000, "1, 2, 4 => 3": 1.000000, "1, 3, 4 => 2": 1.000000, "2, 3, 4 => 1": 1.000000}',)]


In [82]:
def func_dep(tname: str):
    """Print functional dependencies of the forms X -> Y, X -> YZ and XY -> Z.

    Every check groups the table by the left-hand side and looks for groups
    with more than one distinct value on the right-hand side; an empty result
    means the dependency holds and it is printed.

    The loops visit ordered column combinations, so a two-column left-hand
    side is reported once per ordering (for example both ``a c`` and ``c a``).

    The table contents are printed first. All queries run against ``tname``
    (earlier ``test`` was hard-coded), with table and column names quoted as
    SQL identifiers.

    Args:
        tname: Unqualified name of the table to inspect.
    """
    # Local import under an alias: this notebook defines its own `sql()` helper.
    from psycopg2 import sql as pgsql

    table = pgsql.Identifier(tname)
    single_rhs = pgsql.SQL(
        'SELECT {x} FROM {table} GROUP BY {x} HAVING COUNT(DISTINCT {y}) > 1;'
    )
    double_rhs = pgsql.SQL(
        'SELECT {x} FROM {table} GROUP BY {x} '
        'HAVING COUNT(DISTINCT {y}) > 1 OR COUNT(DISTINCT {z}) > 1;'
    )
    double_lhs = pgsql.SQL(
        'SELECT {x},{y} FROM {table} GROUP BY {x},{y} HAVING COUNT(DISTINCT {z}) > 1;'
    )

    def holds(cur, template, **columns):
        """Return True when the violation query built from `template` is empty."""
        identifiers = {key: pgsql.Identifier(name) for key, name in columns.items()}
        cur.execute(template.format(table=table, **identifiers))
        return not cur.fetchall()

    with conn:
        with conn.cursor() as cur:
            cur.execute(
                'SELECT column_name FROM information_schema.columns '
                'WHERE table_name = %s ORDER BY ordinal_position;',
                (tname,),
            )
            cols = [row[0] for row in cur.fetchall()]

            print(sql(pgsql.SQL('select * from {};').format(table).as_string(conn)))

            for i in cols:
                for j in cols:
                    if i == j:
                        continue

                    # Dependency of the form a -> b
                    if holds(cur, single_rhs, x=i, y=j):
                        print('Funktionale Abhängigkeit gefunden: ', i, ' -> ', j)

                    for k in cols:
                        if i == k or j == k:
                            continue

                        # Dependency of the form a -> bc
                        if holds(cur, double_rhs, x=i, y=j, z=k):
                            print('Funktionale Abhängigkeit gefunden: ', i, ' -> ', j, k)

                        # Dependency of the form ab -> c
                        if holds(cur, double_lhs, x=i, y=j, z=k):
                            print('Funktionale Abhängigkeit gefunden: ', i, j, ' -> ', k)


func_dep('test')

    a   b   c   d
0  a1  b1  c1  d2
1  a2  b2  c1  d1
2  a1  b1  c2  d1
3  a3  b1  c1  d1
Funktionale Abhängigkeit gefunden:  a  ->  b
Funktionale Abhängigkeit gefunden:  a c  ->  b
Funktionale Abhängigkeit gefunden:  a c  ->  d
Funktionale Abhängigkeit gefunden:  a d  ->  b
Funktionale Abhängigkeit gefunden:  a d  ->  c
Funktionale Abhängigkeit gefunden:  c a  ->  b
Funktionale Abhängigkeit gefunden:  c a  ->  d
Funktionale Abhängigkeit gefunden:  d a  ->  b
Funktionale Abhängigkeit gefunden:  d a  ->  c


In [81]:
# Problem: for functional dependencies a -> b not all possible combinations are checked...
def func_dep(tname: str):
    """Attempt to check a -> b and ab -> c in one nested loop (known to be incomplete).

    `count` is incremented once per outer iteration, so the `j` loop starts at
    index `i`; together with the `i==j` skip it only visits columns after `i`.
    `run_sql` is reset to True per `i` and cleared after the first single-column
    check, so only one a -> b query is issued per `i` (for the first `k` that
    differs from `i`), and none at all when the `j` loop has nothing to visit.

    NOTE(review): `tname` is only used for the column lookup; the dependency
    queries and the table print hard-code the table name `test`.

    Args:
        tname: Table name used to look up the column names.
    """

    cols_df = sql('''select column_name from information_schema.columns where table_name = '{}';'''.format(tname))
    cols = cols_df['column_name'].values.tolist()

    print(sql('select * from test;'))


    with conn:
        with conn.cursor() as cur:
            count = 0
            for i in range(0,len(cols)):

                # Allow the single-column check once per left-hand column i.
                run_sql = True
                for j in range(0 + count,len(cols)):
                    if i==j:
                        continue

                    #print(cols[i],cols[j])

                    for k in range(0,len(cols)):
                        if i==k:
                            continue

                        if run_sql:
                            # Check for a functional dependency of the form a -> b
                            cur.execute('SELECT {} FROM test GROUP BY {} HAVING COUNT(DISTINCT {}) > 1;'.format(cols[i],cols[i],cols[k]))
                            res = cur.fetchall()
                            run_sql = False

                            # Debug output: the column pair that was actually checked.
                            print(cols[i],cols[k])

                            if not res:
                                print('Funktionale Abhängigkeit gefunden: ',cols[i],' -> ',cols[k])

                        if j==k:
                            continue

                        # Check for a functional dependency of the form ab -> c
                        cur.execute('SELECT {},{} FROM test GROUP BY {},{} HAVING COUNT(DISTINCT {}) > 1;'.format(cols[i],cols[j],cols[i],cols[j],cols[k]))
                        res = cur.fetchall()
                        if not res:
                            print('Funktionale Abhängigkeit gefunden: ',cols[i],cols[j],' -> ',cols[k])

                count += 1




func_dep('test')

    a   b   c   d
0  a1  b1  c1  d2
1  a2  b2  c1  d1
2  a1  b1  c2  d1
3  a3  b1  c1  d1
a b
Funktionale Abhängigkeit gefunden:  a  ->  b
Funktionale Abhängigkeit gefunden:  a c  ->  b
Funktionale Abhängigkeit gefunden:  a c  ->  d
Funktionale Abhängigkeit gefunden:  a d  ->  b
Funktionale Abhängigkeit gefunden:  a d  ->  c
b a
c a


In [80]:
# With a separate for loop for the single-column checks it works.
def func_dep(tname: str):
    """Print functional dependencies of the forms X -> Y, XY -> Z and X -> YZ.

    Every check groups the table by the left-hand side and looks for groups
    with more than one distinct value on the right-hand side; an empty result
    means the dependency holds and it is printed.

    * X -> Y is checked for every ordered pair of distinct columns.
    * XY -> Z is checked once per unordered pair {X, Y} (Y comes after X in
      column order) and every remaining column Z.
    * X -> YZ is checked for every X and every unordered pair {Y, Z} of the
      remaining columns. The message names X on the left and the column
      names Y, Z on the right (previously the sides were swapped and a loop
      index was printed instead of a column name).

    The table contents are printed first. All queries run against ``tname``
    (earlier ``test`` was hard-coded), with table and column names quoted as
    SQL identifiers.

    Args:
        tname: Unqualified name of the table to inspect.
    """
    from itertools import combinations
    # Local import under an alias: this notebook defines its own `sql()` helper.
    from psycopg2 import sql as pgsql

    table = pgsql.Identifier(tname)
    single_rhs = pgsql.SQL(
        'SELECT {x} FROM {table} GROUP BY {x} HAVING COUNT(DISTINCT {y}) > 1;'
    )
    double_lhs = pgsql.SQL(
        'SELECT {x},{y} FROM {table} GROUP BY {x},{y} HAVING COUNT(DISTINCT {z}) > 1;'
    )
    double_rhs = pgsql.SQL(
        'SELECT {x} FROM {table} GROUP BY {x} '
        'HAVING COUNT(DISTINCT {y}) > 1 OR COUNT(DISTINCT {z}) > 1;'
    )

    def holds(cur, template, **columns):
        """Return True when the violation query built from `template` is empty."""
        identifiers = {key: pgsql.Identifier(name) for key, name in columns.items()}
        cur.execute(template.format(table=table, **identifiers))
        return not cur.fetchall()

    with conn:
        with conn.cursor() as cur:
            cur.execute(
                'SELECT column_name FROM information_schema.columns '
                'WHERE table_name = %s ORDER BY ordinal_position;',
                (tname,),
            )
            cols = [row[0] for row in cur.fetchall()]

            print(sql(pgsql.SQL('select * from {};').format(table).as_string(conn)))

            for pos, lhs in enumerate(cols):
                others = cols[:pos] + cols[pos + 1:]

                # Dependency of the form a -> b
                for rhs in others:
                    if holds(cur, single_rhs, x=lhs, y=rhs):
                        print('Funktionale Abhängigkeit gefunden: ', lhs, ' -> ', rhs)

                # Dependency of the form ab -> c; only partners after `lhs`,
                # so each unordered pair is used once as a left-hand side.
                for partner in cols[pos + 1:]:
                    for rhs in cols:
                        if rhs == lhs or rhs == partner:
                            continue
                        if holds(cur, double_lhs, x=lhs, y=partner, z=rhs):
                            print('Funktionale Abhängigkeit gefunden: ', lhs, partner, ' -> ', rhs)

                # Dependency of the form a -> bc
                for rhs_one, rhs_two in combinations(others, 2):
                    if holds(cur, double_rhs, x=lhs, y=rhs_one, z=rhs_two):
                        print('Funktionale Abhängigkeit gefunden: ', lhs, ' -> ', rhs_one, rhs_two)


func_dep('test')

    a   b   c   d
0  a1  b1  c1  d2
1  a2  b2  c1  d1
2  a1  b1  c2  d1
3  a3  b1  c1  d1
Funktionale Abhängigkeit gefunden:  a  ->  b
Funktionale Abhängigkeit gefunden:  a c  ->  b
Funktionale Abhängigkeit gefunden:  a c  ->  d
Funktionale Abhängigkeit gefunden:  a d  ->  b
Funktionale Abhängigkeit gefunden:  a d  ->  c


In [None]:
# a -> bc: group by a and return every a that occurs with more than one
# distinct b or more than one distinct c. An empty result means a -> bc holds.

cur.execute('SELECT a FROM test GROUP BY a HAVING COUNT(DISTINCT b) > 1 OR COUNT(DISTINCT c) > 1;')
print(cur.fetchall())

In [None]:
# ab -> c: here it only has to hold that all tuples with equal values in a and b
# also have equal values for attribute c. That is, when grouping by a and b,
# the number of distinct values of c must be less than or equal to 1.
# As before, the query result contains everything that violates the
# conjecture. If the result is empty, ab -> c holds.

cur.execute('SELECT a,b FROM test GROUP BY a,b HAVING COUNT(DISTINCT c) > 1;')
print(cur.fetchall())

In [None]:
# Close the psycopg2 connection.
# NOTE(review): cells further down still call functions that use `conn`;
# presumably they were executed before this cell - confirm the intended order.
conn.close()

In [None]:
def func_dep_unclear(tname: str):
    """Return the degree to which each single-column dependency X -> Y holds.

    The table is grouped by X. The degree is the number of rows that lie in
    groups with exactly one distinct Y value, divided by the number of rows
    counted over all groups. 1.0 means X -> Y holds exactly; when no rows are
    counted (e.g. an empty table) the degree is reported as 1.0 instead of
    raising ``ZeroDivisionError``.

    All queries run against ``tname`` (earlier ``test`` was hard-coded), with
    table and column names quoted as SQL identifiers.

    Args:
        tname: Unqualified name of the table to inspect.

    Returns:
        dict mapping ``'X -> Y'`` to the degree as a float.
    """
    # Local import under an alias: this notebook defines its own `sql()` helper.
    from psycopg2 import sql as pgsql

    table = pgsql.Identifier(tname)
    group_stats = pgsql.SQL(
        'SELECT COUNT({lhs}),COUNT(DISTINCT {rhs}) FROM {table} GROUP BY {lhs};'
    )

    dep = {}

    with conn:
        with conn.cursor() as cur:
            cur.execute(
                'SELECT column_name FROM information_schema.columns '
                'WHERE table_name = %s ORDER BY ordinal_position;',
                (tname,),
            )
            cols = [row[0] for row in cur.fetchall()]

            for i in cols:
                for j in cols:
                    if i == j:
                        continue
                    cur.execute(group_stats.format(
                        lhs=pgsql.Identifier(i),
                        rhs=pgsql.Identifier(j),
                        table=table,
                    ))
                    res = cur.fetchall()

                    # Degree of the (possibly approximate) functional dependency
                    n = sum(size for size, _ in res)
                    consistent = sum(size for size, distinct in res if distinct == 1)
                    dep[f'{i} -> {j}'] = consistent / n if n else 1.0

    return dep


func_dep_unclear('test')

In [7]:
def func_dep_unclear_all(tname: str):
    """Collect dependency information of the forms X -> Y, XY -> Z and X -> YZ.

    * ``'X -> Y'`` maps to the degree of the dependency: rows in X-groups with
      exactly one distinct Y value divided by all counted rows. When no rows
      are counted (e.g. an empty table) the degree is 1.0 instead of raising
      ``ZeroDivisionError``.
    * ``'X, Y -> Z'`` and ``'X -> Y, Z'`` are only recorded when the dependency
      holds exactly; no degree is computed for them and the value is ``'NA'``.
      XY -> Z is checked once per unordered pair {X, Y}; X -> YZ is checked for
      every X and every unordered pair of the remaining columns. The key for
      X -> YZ names the grouping column X on the left (previously another
      column was written as the determinant).

    All queries run against ``tname`` (earlier ``test`` was hard-coded), with
    table and column names quoted as SQL identifiers.

    Args:
        tname: Unqualified name of the table to inspect.

    Returns:
        dict mapping a dependency label to a float degree or ``'NA'``.
    """
    from itertools import combinations
    # Local import under an alias: this notebook defines its own `sql()` helper.
    from psycopg2 import sql as pgsql

    table = pgsql.Identifier(tname)
    group_stats = pgsql.SQL(
        'SELECT COUNT({x}),COUNT(DISTINCT {y}) FROM {table} GROUP BY {x};'
    )
    double_lhs = pgsql.SQL(
        'SELECT {x},{y} FROM {table} GROUP BY {x},{y} HAVING COUNT(DISTINCT {z}) > 1;'
    )
    double_rhs = pgsql.SQL(
        'SELECT {x} FROM {table} GROUP BY {x} '
        'HAVING COUNT(DISTINCT {y}) > 1 OR COUNT(DISTINCT {z}) > 1;'
    )

    def run(cur, template, **columns):
        """Execute `template` with the given column names and return all rows."""
        identifiers = {key: pgsql.Identifier(name) for key, name in columns.items()}
        cur.execute(template.format(table=table, **identifiers))
        return cur.fetchall()

    dep = {}

    with conn:
        with conn.cursor() as cur:
            cur.execute(
                'SELECT column_name FROM information_schema.columns '
                'WHERE table_name = %s ORDER BY ordinal_position;',
                (tname,),
            )
            cols = [row[0] for row in cur.fetchall()]

            for pos, lhs in enumerate(cols):
                others = cols[:pos] + cols[pos + 1:]

                # Dependency of the form A -> B, with its degree
                for rhs in others:
                    res = run(cur, group_stats, x=lhs, y=rhs)
                    n = sum(size for size, _ in res)
                    consistent = sum(size for size, distinct in res if distinct == 1)
                    dep[f'{lhs} -> {rhs}'] = consistent / n if n else 1.0

                # Dependency of the form AB -> C (exact check only)
                for partner in cols[pos + 1:]:
                    for rhs in cols:
                        if rhs == lhs or rhs == partner:
                            continue
                        if not run(cur, double_lhs, x=lhs, y=partner, z=rhs):
                            dep[f'{lhs}, {partner} -> {rhs}'] = 'NA'

                # Dependency of the form A -> BC (exact check only)
                for rhs_one, rhs_two in combinations(others, 2):
                    if not run(cur, double_rhs, x=lhs, y=rhs_one, z=rhs_two):
                        dep[f'{lhs} -> {rhs_one}, {rhs_two}'] = 'NA'

    return dep


func_dep_unclear_all('test')

{'a -> b': 1.0,
 'a -> c': 0.5,
 'a -> d': 0.5,
 'a, c -> b': 'NA',
 'a, c -> d': 'NA',
 'a, d -> b': 'NA',
 'a, d -> c': 'NA',
 'b -> a': 0.25,
 'b -> c': 0.25,
 'b -> d': 0.25,
 'c -> a': 0.25,
 'c -> b': 0.25,
 'c -> d': 0.25,
 'd -> a': 0.25,
 'd -> b': 0.25,
 'd -> c': 0.25}

In [None]:
# Load the tab-separated complaints dump (no header row, latin1 encoded).
# on_bad_lines='warn' skips malformed lines and emits a warning for each; it is
# the replacement for error_bad_lines=False / warn_bad_lines=True, which were
# deprecated in pandas 1.3 and removed in pandas 2.0.
comp = pd.read_csv('../complaints/FLAT_CMPL.txt', sep='\t', header=None,
                   encoding="latin1", on_bad_lines='warn')

In [None]:
# Add column names: read the fixed-width field description (skipping its first
# 30 lines), drop incomplete rows and use the 'name' column as the labels of comp.
df_column_labels = pd.read_fwf(
    '../complaints/CMPL.txt',
    skiprows=30,
    header=None,
    names=['field', 'name', 'type', 'description'],
).dropna()
comp.columns = df_column_labels['name']

In [None]:
# Number of rows and columns that were loaded.
comp.shape

In [None]:
# Preview the first rows of the complaints data.
comp.head()

In [None]:
import sqlalchemy

In [None]:
# The connection URL can be supplied through SPEZIALTHEMA_DB_URL; the literal is
# only a fallback for the original setup.
# NOTE(review): the fallback URL still embeds a user name and password - remove
# it once the environment variable is configured wherever this notebook runs.
engine = sqlalchemy.create_engine(
    os.environ.get(
        'SPEZIALTHEMA_DB_URL',
        'postgresql://lukas:lukas@141.100.70.96:5432/spezialthema',
    )
)

In [None]:
# Write the complaints frame to table `complaint`. The engine is handed to
# pandas directly, exactly as before. The surrounding
# `with engine.connect() as conn:` was dropped because the opened connection
# was never used and it rebound the notebook-level psycopg2 `conn` that the
# sql() helper and the func_dep* functions rely on.
comp.to_sql(name='complaint', con=engine)