In [1]:
import os
import sys

import pandas as pd

sys.path.append("/home/sergey/drclinics/common")
sys.path.append("/home/sergey/drclinics/reports")

import datetime
from utils import get_path, DATETIME_FORMAT

In [2]:
from universal_connection import UniversalConnection, DBType

In [11]:
import numpy as np

In [37]:
import hashlib

In [39]:
from log import log

In [5]:
# Query text for the extraction below.
# The CTE `temp` takes appointments that finished after 2020-03-30 (UTC) and
# are flagged `good`, joins them to the patient's person record and to the
# promotion's product condition, and labels each row 'VEBMED' when the product
# full_name contains 'ВЭБ' (otherwise 'ПРОЧИЕ').
# Appointments of patients in a category whose reference code is 'TEST', or
# whose report_comment is 'тест' (case-insensitive), are excluded.
# The final select returns the distinct phones of the 'VEBMED' rows.
# NOTE(review): the inner join on product_condition drops appointments that
# have no promotion, so the preceding left join to promotion acts as an inner
# join — confirm this is intended.
sql = """
with temp as (
select 	a.id app_id,
		p_patient.phone,
		case when pc.product_id in (
            select id from product
            where full_name like '%ВЭБ%') then 'VEBMED'
        	else 'ПРОЧИЕ'
        end WEBpartition
--
from appointment a
left join patient on a.patient_id = patient.id 
left join person p_patient on patient.person_id = p_patient.id
--
left join promotion promo on a.promotion_id = promo.id
inner join product_condition pc
   on promo.product_condition_id = pc.id
--
where a.finished at time zone 'UTC' > '2020-03-30'
and a.good
  and not (
    a.patient_id in (
        select patient_id
        from patient_categories pcat
        inner join reference rf
           on rf.id=pcat.reference_id
        where rf.code='TEST'
        )
    or lower(a.report_comment)='тест'
    )
--
)
select distinct phone
from temp 
where webpartition = 'VEBMED'


"""

In [26]:
# Run the extraction query against the Postgres database described by the
# credentials config file and keep the result in `df_sql`.
connection = UniversalConnection('../../../.credentials/telemed/prom.cfg', DBType.Postgres)
try:
    df_sql = connection.query(sql)
finally:
    # Close the connection even when the query raises, so a failed run does
    # not leave the connection (and, per the log output, its SSH tunnel) open.
    connection.close()

2020-06-15 11:22:43 connect to postgres database using config file "../../../.credentials/telemed/prom.cfg"
2020-06-15 11:22:43 creating ssh tunnel to 172.16.100.19 as root...
2020-06-15 11:22:43 connect postgres using parameters:
                    database: telemed
				    user: norekhov
				    password: ***masked***
				    host: localhost
				    port: 45307
2020-06-15 11:22:43 @telemed: execute sql:
				    SET TIME ZONE 'Europe/Moscow'
				    None
2020-06-15 11:22:43 @telemed query:
                    with temp as (
				    select 	a.id app_id,
				    		p_patient.phone,
				    		case when pc.product_id in (
				                select id from product
				                where full_name like '%ВЭБ%') then 'VEBMED'
				            	else 'ПРОЧИЕ'
				            end WEBpartition
				    --
				    from appointment a
				    left join patient on a.patient_id = patient.id 
				    left join person p_patient on patient.person_id = p_patient.id
				    --
				    left join promotio

In [27]:
# Show the raw query result for a visual sanity check.
# NOTE(review): the rendered output contains real phone numbers — clear the
# cell output before sharing or committing the notebook.
df_sql

Unnamed: 0,phone
0,00000000000
1,13054580697
2,201090919208
3,34622457762
4,393667232467
...,...
30245,9773621722
30246,992928280078
30247,995597072689
30248,9956885541


In [28]:
def correct_phone(phone):
    """Normalize a phone string to the 11-character form that starts with '7'.

    Accepted inputs:
      * 11 characters starting with '7'  -> returned unchanged;
      * 11 characters starting with '8'  -> leading '8' replaced by '7';
      * 10 characters starting with '9' or '4' -> '7' is prepended.

    Anything else, including non-string values such as None, NaN or numbers,
    yields ``np.nan`` so the caller can filter the row out. Only the length
    and the first character are inspected; the remaining characters are not
    checked to be digits.

    Parameters
    ----------
    phone : str or any
        Raw phone value as stored in the database.

    Returns
    -------
    str or float
        The normalized phone, or ``np.nan`` when it cannot be normalized.
    """
    # An explicit type check replaces the former bare ``except:``, which also
    # swallowed unrelated errors (and KeyboardInterrupt) and hid real bugs.
    if not isinstance(phone, str):
        return np.nan

    if len(phone) == 11:
        if phone[0] == '7':
            return phone
        if phone[0] == '8':
            return '7' + phone[1:]
    elif len(phone) == 10 and phone[0] in ('9', '4'):
        return '7' + phone

    return np.nan


In [29]:
# Normalize every raw phone; values that cannot be normalized become NaN.
df = df_sql['phone'].map(correct_phone)

In [30]:
# Wrap the result of the previous cell in a DataFrame (rebinds `df`).
df = pd.DataFrame(df)

In [32]:
# Keep only rows whose phone was normalized successfully and renumber the index.
df = df.loc[df['phone'].notna()].reset_index(drop=True)

In [33]:
# Show the normalized phones for a visual check.
# NOTE(review): the rendered output contains real phone numbers — clear the
# cell output before sharing or committing the notebook.
df

Unnamed: 0,phone
0,74932298951
1,74957357329
2,74999434486
3,70000000000
4,70980498687
...,...
30227,79771032096
30228,79771370522
30229,79773621722
30230,79956885541


In [34]:
# Write the normalized phones to an Excel file in the working directory,
# without the index column.
df.to_excel('phone_numbers.xlsx', index=False)

In [42]:
def create_hash(filename):
    """Write the SHA-256 checksum of ``filename`` to a sibling ``.sha256`` file.

    The file is read in binary mode in fixed-size blocks, so large files do
    not have to fit in memory. The hex digest is written as text to a file
    with the same path as ``filename`` but with the extension replaced by
    ``.sha256`` (``phone_numbers.xlsx`` -> ``phone_numbers.sha256``).

    Parameters
    ----------
    filename : str
        Path of the file to hash.

    Returns
    -------
    None
    """
    BLOCK_SIZE = 65536 * 100  # number of bytes requested per read
    file_hash = hashlib.sha256()
    with open(filename, 'rb') as f:
        # read() returns b'' at end of file, which stops the iteration.
        for block in iter(lambda: f.read(BLOCK_SIZE), b''):
            file_hash.update(block)

    # Use splitext rather than slicing off four characters: the slice only
    # worked for four-letter extensions such as ".xlsx" and produced a broken
    # name for anything else (for example "data.csv" -> "datasha256").
    hash_path = os.path.splitext(filename)[0] + '.sha256'
    with open(hash_path, 'w') as f:
        log('writing sha256')
        f.write(file_hash.hexdigest())


In [43]:
# Produce the checksum file for the exported workbook.
create_hash('phone_numbers.xlsx')

2020-06-15 11:28:17 writing sha256


In [None]:
def make_hash(s):
    """Return the SHA-256 hex digest of the string ``s`` encoded as UTF-8.

    NOTE(review): the hash is unsalted; for inputs drawn from a small space
    (such as phone numbers) it can be reversed by enumeration — confirm that
    this is acceptable for the consumer of the hashed file.
    """
    digest = hashlib.sha256()
    digest.update(s.encode('utf-8'))
    return digest.hexdigest()

In [66]:
# Add a column holding the SHA-256 hex digest of each normalized phone.
df['hashed_phone'] = df['phone'].map(make_hash)

In [67]:
# Show phones next to their hashes for a visual check.
# NOTE(review): the rendered output contains real phone numbers — clear the
# cell output before sharing or committing the notebook.
df

Unnamed: 0,phone,hashed_phone
0,74932298951,4aca7249836b43920d85c8bd62783dd844255d548085b8...
1,74957357329,f27185aec12e747b65d9ae6c27a2fa38a98ef45ae98e54...
2,74999434486,d3c59cf69cc97beb2f791e45df7940c9b92b9e4865715a...
3,70000000000,ee2d2245a673009deafaf8d4f7b91959dde26601f9ac71...
4,70980498687,5d2540b9e207e03adf1f49d21c99a3e724ed9fe7bb1b03...
...,...,...
30227,79771032096,bfc9e6f0c2163791e2f1bddafea3b64459db0314fce54a...
30228,79771370522,9e44a8a47af6295d72fd09ee2a6da3235953aac030c546...
30229,79773621722,c67395ff251932f360a012a0f0da89b72004c8e260bf85...
30230,79956885541,f5bb4b4b32c63288a2489ff7589092b5987e2ca701f33d...


In [74]:
# Build a one-column frame that carries only the hashes (no raw phones).
df_hash = df['hashed_phone'].to_frame()

In [76]:
# Show the hash-only frame for a visual check before exporting it.
df_hash

Unnamed: 0,hashed_phone
0,4aca7249836b43920d85c8bd62783dd844255d548085b8...
1,f27185aec12e747b65d9ae6c27a2fa38a98ef45ae98e54...
2,d3c59cf69cc97beb2f791e45df7940c9b92b9e4865715a...
3,ee2d2245a673009deafaf8d4f7b91959dde26601f9ac71...
4,5d2540b9e207e03adf1f49d21c99a3e724ed9fe7bb1b03...
...,...
30227,bfc9e6f0c2163791e2f1bddafea3b64459db0314fce54a...
30228,9e44a8a47af6295d72fd09ee2a6da3235953aac030c546...
30229,c67395ff251932f360a012a0f0da89b72004c8e260bf85...
30230,f5bb4b4b32c63288a2489ff7589092b5987e2ca701f33d...


In [77]:
# Write the hash-only frame to an Excel file in the working directory,
# without the index column.
df_hash.to_excel('hashed_phones.xlsx', index=False)

In [68]:
#print(hashlib.sha256(df['phone'][0].encode('utf-8')).hexdigest())

In [69]:
#type(df['phone'][0])

In [70]:
#print(hashlib.sha256('asd').hexdigest())