In [59]:
from IPython import get_ipython
get_ipython().run_line_magic('reset', '-sf') 

import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Numeric,  ForeignKey, text
import glob
import re

In [2]:
def file_extraction(search_string): 
    filepaths = glob.glob(search_string) 
    df = pd.DataFrame()
    for find_files in filepaths:
        param = re.split ('_',find_files)
        add_df = pd.read_excel(find_files, usecols=[2,5])
        add_df ['animal_num'] = param [0][-1]  
        add_df ['exp_group'] = param [3]
        add_df ['cell_num'] =param [2][-1] 
        add_df ['protein'] = param[-1][:-5]
        df = pd.concat ([df, add_df], ignore_index=True)
    df.index.names = ['id'] 
    df.rename(columns={'Volume (unit)':'volume', 'SurfaceArea':'surface_area'}, inplace=True)
    df.index += 1
    return(df)

Grab data and transform it to a Pandas dataframe

In [3]:
data = file_extraction('data/*.xlsx')

First of all, we should create the database. To perform this task, we should connect to postgres

In [None]:
#connection = psycopg2.connect(user = 'postgres', password = '123')
#connection.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)

Then, we should create a coursor that provide us with access to database manipulation. 

In [None]:
#cursor = connection.cursor()

Now, we are able to create database. 

In [None]:
#cursor.execute('CREATE DATABASE epi_data')
#cursor.close()
#connection.close()

Connect to an existing database. It seems that sqlalchemy is needed as an intermediary in order to export a dataframe to a database. We use postgresql+psycopg2, where postgresql is sql dialect and psycopg2 is a intermediary(~driver) for db connection. 

We are connecting to the created database (epi_data) and add a new schema (epilepsy_db)

In [61]:
db_connect = create_engine('postgresql+psycopg2://postgres:123@localhost:5432/epi_data', echo = True)

In [33]:
db_connect.execute('CREATE SCHEMA epilepsy_db')

2022-05-11 16:01:50,947 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2022-05-11 16:01:50,948 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-05-11 16:01:50,949 INFO sqlalchemy.engine.Engine select current_schema()
2022-05-11 16:01:50,950 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-05-11 16:01:50,951 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2022-05-11 16:01:50,951 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-05-11 16:01:50,953 INFO sqlalchemy.engine.Engine CREATE SCHEMA epilepsy_db
2022-05-11 16:01:50,955 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-05-11 16:01:50,957 INFO sqlalchemy.engine.Engine COMMIT


<sqlalchemy.engine.cursor.LegacyCursorResult at 0x23ba307f8e0>

In [34]:
#db_connect.execute('ALTER DATABASE epi_data SET search_path TO epilepsy_db, public')

Now, we can create a tables. I suppose to use the following relations (((add pic)))

In [35]:
metadata = MetaData()

In [36]:
info = Table('info', metadata,
    Column('id', Integer(), primary_key=True),
    Column('animal_num', Integer(), nullable=False, unique=False), 
    Column('exp_group', String(10), nullable=False, unique=False),
    Column('cell_num', Integer(), nullable=False, unique=False), 
    Column('protein', String(10), nullable=False, unique=False),
    schema='epilepsy_db')


In [37]:
gfap = Table ('gfap', metadata, 
    Column('id', Integer(), ForeignKey("epilepsy_db.info.id"), primary_key=True),
    Column('volume', Numeric(), nullable=False),
    Column('surface_area', Numeric(), nullable=False),
    schema='epilepsy_db')
gs = Table ('gs', metadata, 
    Column('id', Integer(),ForeignKey("epilepsy_db.info.id"), primary_key=True),
    Column('volume', Numeric(), nullable=False),
    Column('surface_area', Numeric(), nullable=False),
    schema='epilepsy_db')

glt = Table ('glt', metadata, 
    Column('id', Integer(),ForeignKey("epilepsy_db.info.id"), primary_key=True),
    Column('volume', Numeric(), nullable=False),
    Column('surface_area', Numeric(), nullable=False),
    schema='epilepsy_db')

s100b = Table ('s100b', metadata, 
    Column('id', Integer(),ForeignKey("epilepsy_db.info.id"), primary_key=True),
    Column('volume', Numeric(), nullable=False),
    Column('surface_area', Numeric(), nullable=False),
    schema='epilepsy_db')

cx43= Table ('cx43', metadata, 
    Column('id', Integer(),ForeignKey("epilepsy_db.info.id"), primary_key=True),
    Column('volume', Numeric(), nullable=False),
    Column('surface_area', Numeric(), nullable=False),
    schema='epilepsy_db')

In [38]:
metadata.create_all(db_connect)

2022-05-11 16:01:51,289 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-05-11 16:01:51,290 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where n.nspname=%(schema)s and relname=%(name)s
2022-05-11 16:01:51,291 INFO sqlalchemy.engine.Engine [generated in 0.00071s] {'schema': 'epilepsy_db', 'name': 'info'}
2022-05-11 16:01:51,294 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where n.nspname=%(schema)s and relname=%(name)s
2022-05-11 16:01:51,295 INFO sqlalchemy.engine.Engine [cached since 0.004753s ago] {'schema': 'epilepsy_db', 'name': 'gfap'}
2022-05-11 16:01:51,296 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where n.nspname=%(schema)s and relname=%(name)s
2022-05-11 16:01:51,297 INFO sqlalchemy.engine.Engine [cached since 0.006896s ago] {'schema': 'epilepsy_db', 'name': 'gs'}
2022-05-11 16:01:51,298 INFO s

In [39]:
data.iloc[:, 2:].to_sql('info', db_connect, schema='epilepsy_db', if_exists='append')

2022-05-11 16:01:52,045 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where n.nspname=%(schema)s and relname=%(name)s
2022-05-11 16:01:52,046 INFO sqlalchemy.engine.Engine [cached since 0.7555s ago] {'schema': 'epilepsy_db', 'name': 'info'}
2022-05-11 16:01:52,116 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-05-11 16:01:54,605 INFO sqlalchemy.engine.Engine INSERT INTO epilepsy_db.info (id, animal_num, exp_group, cell_num, protein) VALUES (%(id)s, %(animal_num)s, %(exp_group)s, %(cell_num)s, %(protein)s)
2022-05-11 16:01:54,606 INFO sqlalchemy.engine.Engine [generated in 2.15943s] ({'id': 1, 'animal_num': '1', 'exp_group': 'ctrl', 'cell_num': '1', 'protein': 'Cx43'}, {'id': 2, 'animal_num': '1', 'exp_group': 'ctrl', 'cell_num': '1', 'protein': 'Cx43'}, {'id': 3, 'animal_num': '1', 'exp_group': 'ctrl', 'cell_num': '1', 'protein': 'Cx43'}, {'id': 4, 'animal_num': '1', 'exp_group': 'ctrl', 'cell_num': '1', 'protein': 'Cx43'}

Here, we can choose data about single protein due to choose_protein function. It should be noticed, that in original data protein named as GFAP, GLT, s100b, Cx43, GS. 

In [40]:
def choose_protein (df, find_protein):
    mask = df['protein']==find_protein
    df_out = df.loc[mask]
    return df_out.iloc[:, :2]

In [41]:
choose_protein(data, 'GFAP') .to_sql('gfap', db_connect, schema='epilepsy_db', if_exists='append')

choose_protein(data, 'GS') .to_sql('gs', db_connect, schema='epilepsy_db', if_exists='append')

choose_protein(data, 'GLT') .to_sql('glt', db_connect, schema='epilepsy_db', if_exists='append')

choose_protein(data, 's100b') .to_sql('s100b', db_connect, schema='epilepsy_db', if_exists='append')

choose_protein(data, 'Cx43') .to_sql('cx43', db_connect, schema='epilepsy_db', if_exists='append')


2022-05-11 16:02:02,649 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where n.nspname=%(schema)s and relname=%(name)s
2022-05-11 16:02:02,649 INFO sqlalchemy.engine.Engine [cached since 11.36s ago] {'schema': 'epilepsy_db', 'name': 'gfap'}
2022-05-11 16:02:02,653 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-05-11 16:02:02,762 INFO sqlalchemy.engine.Engine INSERT INTO epilepsy_db.gfap (id, volume, surface_area) VALUES (%(id)s, %(volume)s, %(surface_area)s)
2022-05-11 16:02:02,763 INFO sqlalchemy.engine.Engine [generated in 0.09157s] ({'id': 9607, 'volume': 0.016, 'surface_area': 0.584}, {'id': 9608, 'volume': 0.013, 'surface_area': 0.398}, {'id': 9609, 'volume': 0.003, 'surface_area': 0.146}, {'id': 9610, 'volume': 0.003, 'surface_area': 0.146}, {'id': 9611, 'volume': 0.123, 'surface_area': 2.215}, {'id': 9612, 'volume': 0.003, 'surface_area': 0.146}, {'id': 9613, 'volume': 0.01, 'surface_area': 0.314}, {'id': 9614, 'volu

Lets try to excecute some interesting data from our database. For example, we are interested in data about volume, surface area from GLT sample that belogs to SE group and has number 5

In [42]:
db_connect.execute('ALTER DATABASE epi_data SET search_path TO epilepsy_db, public')

2022-05-11 16:02:14,200 INFO sqlalchemy.engine.Engine ALTER DATABASE epi_data SET search_path TO epilepsy_db, public
2022-05-11 16:02:14,200 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-05-11 16:02:14,201 INFO sqlalchemy.engine.Engine COMMIT


<sqlalchemy.engine.cursor.LegacyCursorResult at 0x23ba52fe070>

Разобраться как добавлять поиск по условию тектовых значениях

In [58]:
db_connect.execute('select volume, surface_area, animal_num, cell_num, protein from gs\
                 join info on gs.id = info.id\
                 where animal_num = 3 and exp_group=SE').fetchall()

2022-05-11 17:02:36,071 INFO sqlalchemy.engine.Engine select volume, surface_area, animal_num, cell_num, protein from gs                 join info on gs.id = info.id                 where animal_num = 3 and exp_group=SE
2022-05-11 17:02:36,072 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-05-11 17:02:36,074 INFO sqlalchemy.engine.Engine ROLLBACK


ProgrammingError: (psycopg2.errors.UndefinedColumn) ОШИБКА:  столбец "se" не существует
LINE 1: ...nfo.id                 where animal_num = 3 and exp_group=SE
                                                                     ^

[SQL: select volume, surface_area, animal_num, cell_num, protein from gs                 join info on gs.id = info.id                 where animal_num = 3 and exp_group=SE]
(Background on this error at: https://sqlalche.me/e/14/f405)

In [70]:
db_connect.execute(text('SELECT volume, surface_area FROM glt JOIN info ON glt.id=info.id WHERE info.exp_group==(SE) and info.animal_num==5')).fetchall()

2022-05-11 17:19:38,561 INFO sqlalchemy.engine.Engine SELECT volume, surface_area FROM glt JOIN info ON glt.id=info.id WHERE info.exp_group==(SE) and info.animal_num==5
2022-05-11 17:19:38,561 INFO sqlalchemy.engine.Engine [generated in 0.00079s] {}
2022-05-11 17:19:38,562 INFO sqlalchemy.engine.Engine ROLLBACK


ProgrammingError: (psycopg2.errors.UndefinedColumn) ОШИБКА:  столбец "se" не существует
LINE 1: ...OIN info ON glt.id=info.id WHERE info.exp_group==(SE) and in...
                                                             ^

[SQL: SELECT volume, surface_area FROM glt JOIN info ON glt.id=info.id WHERE info.exp_group==(SE) and info.animal_num==5]
(Background on this error at: https://sqlalche.me/e/14/f405)