#### Import libraries and load configuration

In [1]:
import ast
import glob
import os
import re

import numpy as np
import pandas as pd
import sqlalchemy as sql
from decouple import config
from slugify import slugify
# Settings are looked up by key through decouple's `config`, so neither the
# credentials nor the data locations are hardcoded in the notebook.
# DATABASE_EXERCISE / USER_EXERCISE / PWD_EXERCISE are passed to SQLeto below.
DATABASE_EXERCISE = config('DATABASE_EXERCISE')
USER_EXERCISE = config('USER_EXERCISE')
PWD_EXERCISE = config('PWD_EXERCISE')
# Prefix that is concatenated with a CSV file name when loading the raw tables,
# so it is presumably expected to end with a path separator.
TABLES_PATH = config('TABLES_PATH')
# NOTE(review): not referenced in the cells visible here; presumably an output
# location used further down - confirm.
PROCESSED_DATA_PATH = config('PROCESSED_DATA_PATH')

#### Class SQLeto

In [2]:
class SQLeto :
    """Small convenience wrapper around SQLAlchemy for one PostgreSQL database.

    The server location is fixed to ``localhost:15432``; only the database
    name and the credentials are configurable.
    """

    def __init__(
        self, database:str, 
        user:str, password:str) -> None:
        """Store the connection settings; nothing is opened at this point.

        Args:
            database: Name of the database to connect to.
            user: Login role.
            password: Password for ``user``.
        """
        self.database = database
        self.user = user
        self.pwd = password
        # Built lazily by create_engine() and then shared by every method.
        self._engine = None
    
    def create_engine(
        self)->sql.engine.Engine :
        """Return the Engine for this database, building it on first use.

        The engine is cached on the instance: previously every call created a
        fresh Engine (and therefore a fresh connection pool) that was never
        disposed. Repeated calls now return the same object.
        """
        # getattr keeps this safe for instances created without __init__.
        if getattr(self, "_engine", None) is None:
            # NOTE: user and password are interpolated verbatim, so values
            # containing URL-special characters such as '@' or '/' must already
            # be escaped by the caller.
            url = f"postgresql://{self.user}:{self.pwd}@localhost:15432/{self.database}"
            self._engine = sql.create_engine(url)

        return self._engine
    
    def connect_database(
        self, engine)->sql.engine.Connection :
        """Open and return a new connection on ``engine``.

        The caller owns the returned connection and must close it.
        """
        return engine.connect()
    
    def execute_DQL(
        self, query:str)->pd.DataFrame :
        """Run a SELECT-style ``query`` and return the rows as a DataFrame."""
        return pd.read_sql(query, self.create_engine())
    
    def execute_DDL(
        self, query:str)->"sql.engine.CursorResult":
        """Run a DDL statement (or several, ';'-separated) and commit it.

        The statement runs inside ``engine.begin()``, which commits on success,
        rolls back on error and always releases the connection; the original
        implementation left the connection open. The string is wrapped in
        ``sql.text`` because newer SQLAlchemy versions no longer accept a raw
        ``str`` in ``Connection.execute``.

        Args:
            query: SQL text to execute. A ':name' sequence is treated by
                ``sql.text`` as a bind parameter, so it must not appear
                literally in the statement.

        Returns:
            The result object produced by ``Connection.execute``. It is
            returned after the connection has been released, so it should not
            be used to fetch rows.
        """
        with self.create_engine().begin() as conn:
            return conn.execute(sql.text(query))

#### Helper functions

In [3]:
def get_dataframes_to_upload(path:str) -> list :
    """List the CSV files found directly inside ``path``, without extension.

    Fixes over the previous version: the ``path`` argument is now actually
    used (the global ``TABLES_PATH`` was scanned regardless of the argument),
    and the file name is extracted with ``os.path`` instead of splitting on a
    backslash, which only worked on Windows.

    Args:
        path: Directory to scan; a trailing separator is optional.

    Returns:
        One entry per ``*.csv`` file, e.g. ``'people'`` for ``people.csv``,
        in the order reported by ``glob`` (not guaranteed to be sorted).
    """
    csv_paths = glob.glob(os.path.join(path, "*.csv"))

    # basename drops the directory part, splitext drops the final '.csv'.
    return [
        os.path.splitext(os.path.basename(csv_path))[0]
        for csv_path in csv_paths
    ]

In [4]:
def create_type_column(
    data:pd.DataFrame)->dict :
    """Map every column of ``data`` to a SQL column type based on its dtype.

    Mapping (matched on the dtype name prefix):
        float*              -> REAL
        int*                -> INTEGER
        datetime*           -> DATE
        object / str*       -> VARCHAR(longest value + 50), or TEXT when that
                               size would exceed 1000

    Columns with any other dtype are left out of the result.

    Fixes over the previous version: a text column that contains no values at
    all no longer crashes on ``int(NaN)`` (it becomes ``VARCHAR(50)``), and
    pandas' dedicated string dtypes are handled like ``object`` instead of
    being silently dropped from the schema.

    Args:
        data: Frame whose columns should be described.

    Returns:
        Dict of column name -> SQL type, in column order.
    """
    type_columns = dict()
    for column in data.columns :
        dtype_name = str(data[column].dtype)
        if dtype_name.startswith('float') :
            type_columns[column] = 'REAL'
        elif dtype_name.startswith('int') :
            type_columns[column] = 'INTEGER'
        elif dtype_name.startswith('datetime') :
            type_columns[column] = 'DATE'
        elif dtype_name.startswith(('object', 'str')) :
            longest = data[column].str.len().max()
            # max() is NaN/NA when the column has no non-null values.
            longest = 0 if pd.isna(longest) else int(longest)
            # 50 characters of headroom on top of the longest value seen.
            max_len = longest + 50
            if max_len > 1000 :
                type_columns[column] = 'TEXT'
            else :
                type_columns[column] = f'VARCHAR({max_len})'

    return type_columns

In [5]:
def create_company_id(
    data:pd.DataFrame,
    column_to_search:str)->dict:
    """Assign a ``co_<n>`` identifier to the company name found in each row.

    The name is taken from ``column_to_search``; when that value is a float
    (i.e. a missing value, NaN) the slugified ``NAME`` column is used instead.
    ``n`` is the 1-based row position. If the same name occurs in several
    rows, the id of the last occurrence wins.

    Fixes over the previous version: a row with a missing name was appended
    twice, which shifted the id of every following row by one; rows are now
    read by position, so a non-default index no longer breaks the lookup.

    Args:
        data: Frame holding ``column_to_search`` (and ``NAME`` whenever
            missing values are present).
        column_to_search: Column containing the LinkedIn names.

    Returns:
        Dict of company name -> ``'co_<n>'``.
    """
    linkedin_names = list()
    for position, linkedin_name in enumerate(data[column_to_search].tolist()) :
        if isinstance(linkedin_name, float) :
            # Missing LinkedIn name: derive one from the display name.
            linkedin_name = slugify(data['NAME'].iloc[position])
        linkedin_names.append(linkedin_name)

    return {
        value: f'co_{idx+1}'
        for idx, value in enumerate(linkedin_names)
    }

#### Load dataframes

In [6]:
# Date columns are parsed at load time so they arrive as datetime values
# rather than plain strings.
df_companies = pd.read_csv(
    TABLES_PATH + 'companies.csv',
    parse_dates=['FOUNDING_DATE'],
)
df_people = pd.read_csv(
    TABLES_PATH + 'people.csv',
    parse_dates=['GROUP_START_DATE', 'GROUP_END_DATE'],
)

In [7]:
# Each cell holds the text form of a Python list; keep only its first entry,
# or NaN when the list is empty.
# SECURITY: this used to call eval() on values read from the CSV, which would
# execute arbitrary code embedded in the file. ast.literal_eval only accepts
# Python literals and raises on anything else.
df_companies['COMPANY_LINKEDIN_NAMES'] = df_companies.COMPANY_LINKEDIN_NAMES.apply(ast.literal_eval)
# np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
df_companies['COMPANY_LINKEDIN_NAMES'] = df_companies.COMPANY_LINKEDIN_NAMES.apply(
    lambda names: names[0] if len(names) > 0 else np.nan)

In [8]:
# Rows that still have no LinkedIn name get one derived from the company NAME.
index = df_companies.index[df_companies['COMPANY_LINKEDIN_NAMES'].isna()]
for idx in index :
    company_name = df_companies.loc[idx, 'NAME']
    df_companies.loc[idx, 'COMPANY_LINKEDIN_NAMES'] = slugify(company_name)

In [9]:
# Turn the row index into an explicit id column; these ids are used as the
# primary keys of the tables created below.
df_companies = (
    df_companies
    .reset_index()
    .rename(columns={'index': 'company_id'})
)
df_people = (
    df_people
    .reset_index()
    .rename(columns={'index': 'people_id'})
)

#### Instantiate SQLeto and create the tables

In [10]:
# Database helper shared by the remaining cells.
sqleto = SQLeto(
    database=DATABASE_EXERCISE,
    user=USER_EXERCISE,
    password=PWD_EXERCISE,
)

In [11]:
# Drop both tables first so the CREATE TABLE cells below can be re-run on a
# database that already contains them; CASCADE also removes dependent objects.
sqleto.execute_DDL(
    ''' DROP TABLE IF EXISTS companies CASCADE;
        DROP TABLE IF EXISTS people CASCADE;
    ''')

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x16f61590910>

In [12]:
def creating_tables(
    dataframe:pd.DataFrame, 
    name_table:str,  
    primary_key:str, 
    has_fk:bool=False,
    foreign_key:str='',
    reference_table:str=''):
    """Build the ``CREATE TABLE`` statement for ``dataframe``.

    Column definitions come from ``create_type_column``. Identifiers are
    inserted into the statement as-is (unquoted).

    Args:
        dataframe: Frame whose columns define the table.
        name_table: Name of the table to create.
        primary_key: Column used in the PRIMARY KEY constraint.
        has_fk: When true, a FOREIGN KEY constraint is added.
        foreign_key: Column of this table holding the reference; the
            referenced column is assumed to carry the same name.
        reference_table: Table the foreign key points to.

    Returns:
        The SQL statement as a single string.
    """
    column_types = create_type_column(data=dataframe)
    clauses = [f'{column} {sql_type},' for column, sql_type in column_types.items()]

    if has_fk :
        clauses.append(
            f'FOREIGN KEY ({foreign_key}) REFERENCES {reference_table} ({foreign_key}),')

    # The primary key comes last and closes the statement.
    clauses.append(f'PRIMARY KEY ({primary_key}));')

    return f'CREATE TABLE {name_table}(' + ''.join(clauses)
    

In [13]:
# Create the companies table, keyed by the generated company_id.
sqleto.execute_DDL(
    creating_tables(
        dataframe=df_companies,
        name_table='companies',
        primary_key='company_id'))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x16f61710940>

In [14]:
# Create the people table, keyed by the generated people_id (no foreign key).
sqleto.execute_DDL(
    creating_tables(df_people, name_table='people', primary_key='people_id'))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x16f61711c90>

In [15]:
# The tables were created with unquoted column names, which PostgreSQL stores
# in lower case, so the frames must use lower-case names before uploading.
df_companies.columns = df_companies.columns.str.lower()
df_people.columns = df_people.columns.str.lower()

In [16]:
# Append the rows to the tables created above; index=False because the ids
# already live in regular columns.
df_people.to_sql(
    name='people',
    con=sqleto.create_engine(),
    if_exists='append',
    index=False,
)
df_companies.to_sql(
    name='companies',
    con=sqleto.create_engine(),
    if_exists='append',
    index=False,
)

711

In [18]:
# Read the people table back in full; presumably a sanity check that the
# upload above succeeded.
sqleto.execute_DQL('SELECT * FROM people')

Unnamed: 0,people_id,person_id,company_name,company_li_name,last_title,group_start_date,group_end_date
0,0,9fb750ce-4acd-40d6-a58b-f6718342364f,GoCardless,gocardless,Software Engineer,2019-01-01,2020-01-01
1,1,9fb750ce-4acd-40d6-a58b-f6718342364f,Stealth startup,online-shoe-store,Founder / CTO,2018-01-01,2019-01-01
2,2,9fb750ce-4acd-40d6-a58b-f6718342364f,Arkera,arkera,Software Engineer,2017-01-01,2018-01-01
3,3,9fb750ce-4acd-40d6-a58b-f6718342364f,Imperial College London,imperial-college-london,UTA (Undergraduate Teaching Assistant),2016-01-01,2017-01-01
4,4,15f5d8ed-36ad-4cf7-8748-c50dc9589f59,Splunk,splunk,Software Engineer,2019-10-01,
...,...,...,...,...,...,...,...
5386,5386,341cd181-1e2f-4198-b5b4-928475c17120,L Brands,lbrands,Infosys Consultant,2014-05-01,2014-09-01
5387,5387,341cd181-1e2f-4198-b5b4-928475c17120,Gap Inc.,gap-inc-,Infosys Consultant,2012-03-01,2014-03-01
5388,5388,341cd181-1e2f-4198-b5b4-928475c17120,Infosys,infosys,Systems Engineer,2011-06-01,2014-09-01
5389,5389,341cd181-1e2f-4198-b5b4-928475c17120,The Wind Energy Group ITESM,,Research Assistant,2010-01-01,2010-12-01
