In [2]:
import os
import sys

import pandas as pd

sys.path.append("/home/sergey/drclinics/common")
sys.path.append("/home/sergey/drclinics/reports")

import datetime
from utils import get_path, DATETIME_FORMAT

In [3]:
from universal_connection import UniversalConnection, DBType

In [4]:
import numpy as np

In [5]:
import hashlib

In [6]:
from log import log

In [7]:
# Distinct phone numbers of patients that have at least one "good" appointment
# finished after 2020-03-30 whose promotion maps (through product_condition) to a
# product whose full_name contains 'ВЭБ' (labelled 'VEBMED' in the CTE).
# Appointments of patients in the 'TEST' category, or whose report_comment is
# 'тест', are excluded.
# Note: the inner join on product_condition drops appointments without a
# promotion, so the preceding left join on promotion effectively acts as an
# inner join.
sql = """
with temp as (
select 	a.id app_id,
		p_patient.phone,
		case when pc.product_id in (
            select id from product
            where full_name like '%ВЭБ%') then 'VEBMED'
        	else 'ПРОЧИЕ'
        end WEBpartition
--
from appointment a
left join patient on a.patient_id = patient.id 
left join person p_patient on patient.person_id = p_patient.id
--
left join promotion promo on a.promotion_id = promo.id
inner join product_condition pc
   on promo.product_condition_id = pc.id
--
where a.finished at time zone 'UTC' > '2020-03-30'
and a.good
  and not (
    a.patient_id in (
        select patient_id
        from patient_categories pcat
        inner join reference rf
           on rf.id=pcat.reference_id
        where rf.code='TEST'
        )
    or lower(a.report_comment)='тест'
    )
--
)
select distinct phone
from temp 
where webpartition = 'VEBMED'


"""

In [8]:
# Run the phone-number query against the Postgres database described by the config file.
connection = UniversalConnection('../../../.credentials/telemed/prom.cfg', DBType.Postgres)
try:
    df_sql = connection.query(sql)
finally:
    # Release the connection even when the query raises, so it is not leaked.
    connection.close()

2020-06-18 16:50:08 connect to postgres database using config file "../../../.credentials/telemed/prom.cfg"
2020-06-18 16:50:08 creating ssh tunnel to 172.16.100.19 as root...
2020-06-18 16:50:09 connect postgres using parameters:
                    database: telemed
				    user: norekhov
				    password: ***masked***
				    host: localhost
				    port: 42107
2020-06-18 16:50:09 @telemed: execute sql:
				    SET TIME ZONE 'Europe/Moscow'
				    None
2020-06-18 16:50:09 @telemed query:
                    with temp as (
				    select 	a.id app_id,
				    		p_patient.phone,
				    		case when pc.product_id in (
				                select id from product
				                where full_name like '%ВЭБ%') then 'VEBMED'
				            	else 'ПРОЧИЕ'
				            end WEBpartition
				    --
				    from appointment a
				    left join patient on a.patient_id = patient.id 
				    left join person p_patient on patient.person_id = p_patient.id
				    --
				    left join promotio

In [9]:
# Inspect the raw query result.
# NOTE(review): the rendered output appears to contain real phone numbers — clear outputs before sharing.
df_sql

Unnamed: 0,phone
0,00000000000
1,13054580697
2,201090919208
3,34622457762
4,393667232467
...,...
31520,9773621722
31521,992928280078
31522,995597072689
31523,9956885541


In [10]:
def correct_phone(phone):
    """Normalize a phone string to the 11-character form that starts with '7'.

    Accepted inputs:
      * 11 characters starting with '7' -> returned unchanged;
      * 11 characters starting with '8' -> the leading '8' is replaced by '7';
      * 10 characters starting with '9' or '4' -> '7' is prepended.

    Args:
        phone: Raw phone value; expected to be a ``str``.

    Returns:
        The normalized string, or ``np.nan`` when the value is not a string
        or does not match any accepted pattern.
    """
    # Explicit type guard instead of a bare ``except:``. The bare clause also
    # swallowed KeyboardInterrupt/SystemExit and let non-string sequences
    # (e.g. an 11-item list starting with '7') pass through unchanged.
    if not isinstance(phone, str):
        return np.nan

    if len(phone) == 11:
        if phone[0] == '7':
            return phone
        if phone[0] == '8':
            return '7' + phone[1:]
    elif len(phone) == 10 and phone[0] in ('9', '4'):
        return '7' + phone
    return np.nan


In [11]:
# Normalize every phone number; values that cannot be normalized become NaN.
# Note: the result is a Series here, despite the `df` name.
df = df_sql['phone'].apply(correct_phone)

In [12]:
# Wrap the normalized Series in a DataFrame (single 'phone' column).
df = pd.DataFrame(df)

In [13]:
# Keep only the rows whose phone was normalized successfully and renumber from 0.
df = df.dropna(subset=['phone']).reset_index(drop=True)

In [14]:
# Inspect the normalized phone numbers after the NaN rows were dropped.
df

Unnamed: 0,phone
0,74842597922
1,74932298951
2,74957357329
3,74999434486
4,70000000000
...,...
31502,79771032096
31503,79771370522
31504,79773621722
31505,79956885541


In [34]:
# Export the normalized phone numbers to an Excel file, without the index column.
# NOTE(review): this writes raw (unhashed) phone numbers, and the cell's execution
# count (In [34]) is out of sequence with its neighbours — confirm the intended run order.
df.to_excel('phone_numbers.xlsx', index=False)

In [42]:
def create_hash(filename):
    """Write the SHA-256 hex digest of a file next to it, as ``<name>.sha256``.

    The output path is the input path with its extension replaced by
    ``.sha256`` (``phone_numbers.xlsx`` -> ``phone_numbers.sha256``).

    Args:
        filename: Path of the file to hash.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    BLOCK_SIZE = 65536 * 100  # bytes per read, so the file is hashed in chunks
    file_hash = hashlib.sha256()
    with open(filename, 'rb') as f:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        for chunk in iter(lambda: f.read(BLOCK_SIZE), b''):
            file_hash.update(chunk)

    # splitext instead of ``filename[:-4]``: the slice was only correct for
    # four-character extensions and mangled names such as ``data.csv``.
    hash_path = os.path.splitext(filename)[0] + '.sha256'
    with open(hash_path, 'w') as f:
        log('writing sha256')
        f.write(file_hash.hexdigest())


In [43]:
# Write the SHA-256 checksum file for the exported phone-number workbook.
create_hash('phone_numbers.xlsx')

2020-06-15 11:28:17 writing sha256


In [None]:
def make_hash(s):
    """Return the hexadecimal SHA-256 digest of the string ``s``, encoded as UTF-8."""
    sha = hashlib.sha256()
    sha.update(s.encode('utf-8'))
    return sha.hexdigest()

In [15]:
def make_hash_MD5(s):
    """Return the hexadecimal MD5 digest of the string ``s``, encoded as UTF-8."""
    # NOTE(review): an unsalted MD5 of a phone number can be brute-forced over the
    # number space — confirm this is acceptable for whoever receives the hashes.
    md5 = hashlib.md5()
    md5.update(s.encode('utf-8'))
    return md5.hexdigest()

In [16]:
# Add the MD5 hex digest of each normalized phone as a new column.
df['hashed_phone'] = df['phone'].map(make_hash_MD5)

In [17]:
# Inspect the phones alongside their MD5 hashes.
df

Unnamed: 0,phone,hashed_phone
0,74842597922,80b10973371ef0750cbca7e0c0156bb0
1,74932298951,d1163aada4392ffed41da91ff513a01c
2,74957357329,ff873d050498e067aebea48e36bc2386
3,74999434486,2e983058c743828806502a22b6491133
4,70000000000,4e4ba20f78121c0c351f6829b24ebbfc
...,...,...
31502,79771032096,6d30e48e79a2bdff5826182842bfd2ea
31503,79771370522,1adf028a9532d3d3b6147c959328836f
31504,79773621722,775ffbb0db6374c85b52e48a5997134e
31505,79956885541,9eb17647e8e5b93f5c52f22125564f38


In [18]:
# Single-column frame holding only the hashes; this is what gets exported.
df_hash = df['hashed_phone'].to_frame()

In [19]:
# Inspect the hash-only frame before exporting it.
df_hash

Unnamed: 0,hashed_phone
0,80b10973371ef0750cbca7e0c0156bb0
1,d1163aada4392ffed41da91ff513a01c
2,ff873d050498e067aebea48e36bc2386
3,2e983058c743828806502a22b6491133
4,4e4ba20f78121c0c351f6829b24ebbfc
...,...
31502,6d30e48e79a2bdff5826182842bfd2ea
31503,1adf028a9532d3d3b6147c959328836f
31504,775ffbb0db6374c85b52e48a5997134e
31505,9eb17647e8e5b93f5c52f22125564f38


In [20]:
# Export only the MD5 hashes to an Excel file, without the index column.
df_hash.to_excel('hashed_phones_md5.xlsx', index=False)

In [68]:
#print(hashlib.sha256(df['phone'][0].encode('utf-8')).hexdigest())

In [69]:
#type(df['phone'][0])

In [70]:
#print(hashlib.sha256('asd').hexdigest())