##### Loading packages

In [1]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
import sqlalchemy
from sqlalchemy import MetaData
from py2neo import Graph, Node, Relationship
from time import time, sleep, ctime
from baseconv import base64, base16
from elasticsearch import Elasticsearch
import glob
import os
import tika
from tika import detector
import re

In [2]:
start = time()

##### Connection to Neo4j

In [3]:
graph = Graph("", auth=("neo4j", ""))
#Réinit Neo4j DB

In [4]:
#graph.run('MATCH (r1:Table)-[t:REPRESENTATION]-(r2:Table) DELETE t')
#graph.run('MATCH (x:Table)-[c:CONTAINS]-(y:Column) DELETE c')
res1 = graph.run('MATCH (n:Table)  RETURN n')
if(len(res1.data())>0): #existing tables
    graph.run('MATCH (t:Table) DETACH DELETE t')
res2 = graph.run('MATCH (n:Column)  RETURN n')
if(len(res2.data())>0): #existing tables
    graph.run('MATCH (c:Column) DETACH DELETE c')

##### Connection to SQLite

In [5]:
db_uri = 'sqlite:////home/ubuntu/d3l/aura_pmi.db'
engine = sqlalchemy.create_engine(db_uri, echo = False)
connection = engine.connect()
#connection.close()

In [6]:
#Drop all
meta = MetaData()
meta.reflect(engine)
meta.drop_all(engine)

##### Connection to ElasticSearch

In [7]:
es = Elasticsearch([{'host':'','port':9200}])
es.indices.delete(index='table_index', ignore=[400, 404])

{'acknowledged': True}

In [8]:
mapping = {
  "mappings": {
     #"_doc": { 
        "date_detection": False, 
        "properties": { 
           "keywords": { "type": "keyword"  }, 

         }
      }
   #}
 }
es.indices.create(index='table_index', ignore=[400, 404], body=mapping)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'table_index'}

In [9]:
def extract_metadata_table(table, sqlite_conn):
    inspector = sqlalchemy.inspect(sqlite_conn)
    columns = []
    for column in inspector.get_columns(table):
        columns.append(str(column['name']))
    count_result = sqlite_conn.execute("SELECT COUNT(*) FROM "+str(table)+" ;")


##### 1-Data preparation

##### --> Loading data

In [10]:
limit = 5000

In [11]:
files_path = "/home/ubuntu/d3l/data/"
table_dfs = []
table_paths = []

for i, file in enumerate(glob.glob(files_path+"/*.csv")):
    if i < limit:
        table_paths.append(file)

In [12]:
print(len(table_paths))

5000


##### --> Generating identifiers

In [None]:
table_ids = []
for table_path in table_paths:
    identifier = base16.encode(int(round(time()*1000)))
    table_ids.append(identifier)
    sleep(0.2)

In [13]:
def generate_table_nodes(table_df, file_path, table_id, neo_conn, db_path):
    #Object
    basic_metadata = {}
    table_id = '''"'''+table_id+'''"'''
    basic_metadata['identifier'] = table_id
    basic_metadata['nbRow'] = table_df.shape[0]
    basic_metadata['nbCol'] = table_df.shape[1]
    basic_metadata['title'] = '''"'''+table_df.name+'''"'''
    
    neo_conn.run('''CREATE (r:Object:Table '''+str(basic_metadata).replace("'", "")+''') RETURN r''')
    
    #Raw representation
    neo_conn.run('''CREATE (r:Raw:Table { identifier: '''+str(table_id)+''', 
            storageType:"file_system", path:"'''+str(file_path.replace("\\","/"))+'''"}) RETURN r''')
    neo_conn.run('''MATCH (o:Object { identifier: '''+str(table_id)+'''}),
              (r:Raw { identifier: '''+str(table_id)+'''}) 
              CREATE (o)-[l:REPRESENTATION]->(r) ''')
    
    #Refined representation
    neo_conn.run('''CREATE (r:Refined:Table { identifier: '''+str(basic_metadata["identifier"])+''', 
            storageType:"sqlite", path:"'''+str(db_path)+'''"}) RETURN r''')
    neo_conn.run('''MATCH (o:Object { identifier: '''+str(basic_metadata["identifier"])+'''}),
              (r:Refined { identifier: '''+str(table_id)+'''}) 
              CREATE (o)-[l:REPRESENTATION]->(r) ''')
    

In [14]:
def generate_columns(df, identifier, neo_conn):
    
    for col in df.columns:
        if len(df[col].dropna().tolist())>0:
            col_properties = {}
            col_type = str(df.infer_objects().dtypes[col])
            col_properties['uniqueness'] = len(df[col].value_counts())/len(df)
            col_properties['name'] = '''"'''+col+'''"'''
            if col_type == 'object': #String data
                col_properties['type'] = '''"STRING"'''
                col_properties['mode'] = '''"'''+re.sub(r'(?u)[^-\w.]', '', df.mode()[col][0])+'''"'''
            elif col_type == 'int64':
                col_properties['type'] = '''"INTEGER"'''
                col_properties['mean'] = np.mean(df[col].dropna())
            elif col_type == 'float64':
                col_properties['type'] = '''"DECIMAL"'''
                col_properties['mean'] = np.mean(df[col].dropna())
            #Create column and Associate column to table object
            neo_conn.run('''CREATE (c:Column '''+str(col_properties).replace("'", "")+''')
                        WITH c MATCH (o:Object {identifier:"'''+identifier+'''" }) 
                        CREATE (o)<-[l:CONTAINS]-(c) RETURN o, c ''' )

##### 2- Indexing and insertion into SQLite

In [15]:
custom_date_parser = lambda x: datetime.strptime(x, "%Y-%m-%d")

In [16]:
for i, table_path in enumerate(table_paths):
    df = pd.read_csv(table_path, sep=',')
    df.name = os.path.basename(table_path).split('.')[0] #omit extension
    identifier = table_ids[i]
    #Insert into ES
    content = {}
    content['identifier'] = identifier
    terms = []
    
    for col in df.columns:
        #col_type = df.infer_objects().dtypes[col]
        col_type = df.dtypes[col]
        #print(df.dtypes)
        if col_type == 'object': #Insert if non numeric
            temp = df[col].dropna().unique()
            col_values = [term for term in temp if len(re.sub("[^a-zA-Z]", "", term))>0]
            terms.extend(col_values)
    
    content['keywords'] = terms       
    res = es.index(index='table_index',doc_type='_doc',body=content)
    #insert into SQLite
    
    df.to_sql(name=df.name, con=engine, if_exists='replace', index=False)
    #Insert into Neo4J
    generate_table_nodes(df, table_path, identifier, graph, db_uri)
    generate_columns(df, identifier, graph)
    if(i % 100 == 0):
        print("-"*5+' '+str(i))
        print("TIME : "+ctime(time()) )

----- 0
TIME : Thu Jan 28 15:47:45 2021
----- 100
TIME : Thu Jan 28 15:49:34 2021
----- 200
TIME : Thu Jan 28 15:51:26 2021
----- 300
TIME : Thu Jan 28 15:53:18 2021
----- 400
TIME : Thu Jan 28 15:55:05 2021
----- 500
TIME : Thu Jan 28 15:56:54 2021
----- 600
TIME : Thu Jan 28 15:58:46 2021
----- 700
TIME : Thu Jan 28 16:00:30 2021
----- 800
TIME : Thu Jan 28 16:02:27 2021
----- 900
TIME : Thu Jan 28 16:04:29 2021
----- 1000
TIME : Thu Jan 28 16:06:15 2021
----- 1100
TIME : Thu Jan 28 16:08:03 2021
----- 1200
TIME : Thu Jan 28 16:09:51 2021
----- 1300
TIME : Thu Jan 28 16:11:38 2021
----- 1400
TIME : Thu Jan 28 16:13:41 2021
----- 1500
TIME : Thu Jan 28 16:15:30 2021
----- 1600
TIME : Thu Jan 28 16:17:33 2021
----- 1700
TIME : Thu Jan 28 16:19:31 2021
----- 1800
TIME : Thu Jan 28 16:21:30 2021
----- 1900
TIME : Thu Jan 28 16:23:33 2021
----- 2000
TIME : Thu Jan 28 16:25:37 2021
----- 2100
TIME : Thu Jan 28 16:27:37 2021
----- 2200
TIME : Thu Jan 28 16:29:30 2021
----- 2300
TIME : Thu J

###### 3- Insertion in Neo4j

##### --> Generate Table Nodes and Groups

In [17]:
done = time()
elapsed = done - start
print("*"*20)
print("STARTED : "+ctime(start) )
print("END : "+ctime(done) )
print("TIME ELAPSED:" + str(elapsed/60) + " MINUTES" )

********************
STARTED : Thu Jan 28 15:31:00 2021
END : Thu Jan 28 17:28:21 2021
TIME ELAPSED:117.34288415908813 MINUTES
