In [1]:
import os
import sys

import pandas as pd

# Make the project-local helper packages importable (utils, universal_connection,
# auto_width7 are imported from these directories further down).
# The checkout root can be overridden with the DRCLINICS_ROOT environment
# variable; the default keeps the original hard-coded location.
DRCLINICS_ROOT = os.environ.get("DRCLINICS_ROOT", "/home/sergey/drclinics")

for subdir in ("common", "reports"):
    module_dir = os.path.join(DRCLINICS_ROOT, subdir)
    # Re-running the cell must not pile up duplicate sys.path entries.
    if module_dir not in sys.path:
        sys.path.append(module_dir)

import datetime
from utils import get_path, DATETIME_FORMAT

In [2]:
from universal_connection import UniversalConnection, DBType

In [3]:
import re

In [4]:
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords

In [5]:
import pymorphy2

In [6]:
from auto_width7 import auto_columns_width

In [7]:
# Query text sent to the telemed Postgres database.
# Result columns: app_id, specialty, report_comment, promo_type.
#   * specialty  - for appointments with source = 'SCHEDULED' and a non-null
#                  specialty_id: the trimmed name of that specialty; otherwise
#                  one specialty (limit 1) reachable from the appointment's
#                  treatment via doctor_specialty_treatment. The inline SQL
#                  comment on `limit 1` (in Russian) says it is there because
#                  of one badly entered record.
#   * promo_type - 'VEBMED' when the promotion's product has 'ВЭБ' in its
#                  full_name, 'ПРОЧИЕ' otherwise; the final select keeps only
#                  'VEBMED' rows.
# Only rows with a.good are kept; rows are excluded when the patient belongs to
# the reference category with code 'TEST' or the comment is exactly 'тест'
# (case-insensitive).
# NOTE(review): under SQL three-valued logic a NULL report_comment makes the
# `not (...)` predicate NULL, so such rows appear to be dropped too - confirm
# this is intended.
sql = """
with temp as (
select	a.id app_id,
		--
		case
  		when "source"='SCHEDULED' and specialty_id is not null then (
			select trim(name)
			from specialty s
			where s.id = a.specialty_id
		)
	    else (
			select trim(s.name)
			from specialty s
			inner join doctor_specialty ds
			   on ds.specialty_id = s.id
			inner join doctor_specialty_treatment dst
			   on dst.doctor_specialty_id = ds.id
			where dst.treatment_id = a.treatment_id
			limit 1 -- из-за одной криво заведенной записи.
		) end as specialty,
		--
		a.report_comment,
		case when pc.product_id in (
		            		select id from product
		            		where full_name like '%ВЭБ%') then 'VEBMED'
		        else 'ПРОЧИЕ'
		end promo_type
from appointment a 
left join promotion p
	      on a.promotion_id = p.id
	    left join product_condition pc
	      on pc.id = p.product_condition_id
	      --
	where a.good
      and not (
      	a.patient_id in (
	      	select patient_id
	      	from patient_categories pcat
	      	inner join reference rf
	      	   on rf.id=pcat.reference_id
	      	where rf.code='TEST'
	      	)
	    or lower(a.report_comment)='тест'
      	)
)
select *
from temp 
where promo_type = 'VEBMED'

"""

In [8]:
# Run the query against the Postgres database described by the config file
# (the path is relative to the notebook's working directory) and keep the
# result in df_sql.
connection = UniversalConnection('../../../.credentials/telemed/prom.cfg', DBType.Postgres)
try:
    df_sql = connection.query(sql)
finally:
    # Always release the connection, even when the query raises; otherwise a
    # failed query leaves it (and whatever it holds open) dangling in the kernel.
    connection.close()

2020-06-19 11:08:34 connect to postgres database using config file "../../../.credentials/telemed/prom.cfg"
2020-06-19 11:08:34 creating ssh tunnel to 172.16.100.19 as root...
2020-06-19 11:08:35 connect postgres using parameters:
                    database: telemed
				    user: norekhov
				    password: ***masked***
				    host: localhost
				    port: 35213
2020-06-19 11:08:35 @telemed: execute sql:
				    SET TIME ZONE 'Europe/Moscow'
				    None
2020-06-19 11:08:35 @telemed query:
                    with temp as (
				    select	a.id app_id,
				    		--
				    		case
				      		when "source"='SCHEDULED' and specialty_id is not null then (
				    			select trim(name)
				    			from specialty s
				    			where s.id = a.specialty_id
				    		)
				    	    else (
				    			select trim(s.name)
				    			from specialty s
				    			inner join doctor_specialty ds
				    			   on ds.specialty_id = s.id
				    			inner join doctor_specialty_treatment dst
				    			   on

In [9]:
# Preview the first rows of the query result.
# NOTE(review): report_comment holds free-text consultation notes - consider
# clearing this cell's output before sharing the notebook.
df_sql.head()

Unnamed: 0,app_id,specialty,report_comment,promo_type
0,127919,Терапевт,О.ринит? у ребенка 3-х мес жизни\nРекомендован...,VEBMED
1,144566,Уролог,"консультация уролога - очная,\r\nузи простаты,...",VEBMED
2,135952,Акушер-гинеколог,Диагностическая гипотеза : НМЦ по типу олигоме...,VEBMED
3,253668,Терапевт,жалобы на повышение АД до 160/90\nРекомендован...,VEBMED
4,254688,Терапевт,диагностическая гипотеза : остеохондроз грудно...,VEBMED


In [10]:
# Appointment count per specialty, largest first; show at most the top 15.
(
    df_sql
    .groupby('specialty')[['app_id']]
    .count()
    .sort_values(by='app_id', ascending=False)
    .head(15)
)

Unnamed: 0_level_0,app_id
specialty,Unnamed: 1_level_1
Терапевт,30662
Педиатр,10527
Акушер-гинеколог,9143
Невролог,7225
Дерматолог,6294
Оториноларинголог,5551
Гастроэнтеролог,4522
Эндокринолог,2580
Уролог,2473
Кардиолог,1774


In [11]:
# Names of the (at most) 15 specialties with the most appointments,
# ordered from most to least frequent.
top_specialty = list(
    (
        df_sql
        .groupby('specialty')[['app_id']]
        .count()
        .sort_values(by='app_id', ascending=False)
        .head(15)
    ).index
)

In [12]:
# Display the selected specialty names.
top_specialty

['Терапевт',
 'Педиатр',
 'Акушер-гинеколог',
 'Невролог',
 'Дерматолог',
 'Оториноларинголог',
 'Гастроэнтеролог',
 'Эндокринолог',
 'Уролог',
 'Кардиолог',
 'Травматолог-ортопед',
 'Аллерголог-иммунолог',
 'Врач общей практики (Семейный врач)']

In [13]:
# Keep only appointments whose specialty is one of the top specialties and
# renumber the rows from 0.
# Series.isin builds the boolean mask in one vectorised pass; the previous
# row-wise apply(lambda ..., axis=1) called Python once per row and does not
# produce a usable mask on an empty frame.
df_sql[df_sql['specialty'].isin(top_specialty)].reset_index(drop=True)

Unnamed: 0,app_id,specialty,report_comment,promo_type
0,127919,Терапевт,О.ринит? у ребенка 3-х мес жизни\nРекомендован...,VEBMED
1,144566,Уролог,"консультация уролога - очная,\r\nузи простаты,...",VEBMED
2,135952,Акушер-гинеколог,Диагностическая гипотеза : НМЦ по типу олигоме...,VEBMED
3,253668,Терапевт,жалобы на повышение АД до 160/90\nРекомендован...,VEBMED
4,254688,Терапевт,диагностическая гипотеза : остеохондроз грудно...,VEBMED
...,...,...,...,...
84881,273540,Терапевт,Жалобы на головную боль в области лба. \nАнамн...,VEBMED
84882,273512,Акушер-гинеколог,Диагностическая гипотеза: Аднекист? Варикозная...,VEBMED
84883,273547,Терапевт,Жалобы на боль в горле при глотании умеренной ...,VEBMED
84884,273454,Терапевт,Жалобы на боли в шейном отделе позвоночника.\n...,VEBMED


In [14]:
# Working frame: appointments restricted to the top specialties, with a fresh
# 0..n-1 index.
# Series.isin replaces the row-wise apply(lambda ..., axis=1): same mask,
# computed in one vectorised pass and well-defined for an empty frame.
df_sort = df_sql[df_sql['specialty'].isin(top_specialty)].reset_index(drop=True)

In [15]:
# Stop-word set used to drop tokens before lemmatisation.
# NOTE(review): 'russian_old' does not look like a stock NLTK stopwords fileid
# (the stock one is 'russian') - presumably a locally added list; confirm it
# exists on every machine that runs this notebook.
stop_words = set(stopwords.words('russian_old'))
# pymorphy2 analyzer; built once and reused for every token.
morph = pymorphy2.MorphAnalyzer()

In [16]:
# Frequency tables keyed by specialty; each specialty starts with an empty
# dict that the counting loop fills:
#   d_uni - single words, d_bi - bigrams, d_tri - trigrams.
d_uni, d_bi, d_tri = {}, {}, {}

for spec in top_specialty:
    d_uni[spec] = {}
    d_bi[spec] = {}
    d_tri[spec] = {}

In [17]:
# Fill the per-specialty unigram / bigram / trigram frequency tables from the
# report comments.
#
# Per comment: extract tokens (digits, Cyrillic letters, '-', '\', '/'),
# lower-case them, drop stop words, lemmatise with pymorphy2, drop lemmas of
# length <= 2, then count the lemmas and the NLTK bigrams/trigrams built from
# them under the appointment's specialty.

# Compiled once instead of once per row. 'Ё' is listed explicitly because it
# lies outside the 'А-Я' range; the previous pattern only covered 'ё', so
# upper-case words containing 'Ё' were split in two.
token_re = re.compile(r"[0123456789А-Яа-я-\\\/ёЁ]+")

# token -> lemma memo: the same word forms repeat across comments, so each
# distinct form is analysed only once.
lemma_cache = {}

# Iterate the two columns in lockstep instead of label-indexing the frame
# several times per row.
for specialty, comment in zip(df_sort['specialty'], df_sort['report_comment']):
    # A missing comment (None/NaN) has no text; skip it instead of letting
    # re fail with TypeError.
    if not isinstance(comment, str):
        continue

    lemmas = []
    for raw_token in token_re.findall(comment):
        token = raw_token.lower()
        # Compare the lower-cased token: the previous check used the raw token,
        # so capitalised stop words were never filtered.
        # (assumes the stop list itself is lower-case - TODO confirm)
        if token in stop_words:
            continue
        lemma = lemma_cache.get(token)
        if lemma is None:
            lemma = morph.parse(token)[0].normal_form
            lemma_cache[token] = lemma
        if len(lemma) > 2:
            lemmas.append(lemma)

    uni_counts = d_uni[specialty]
    for lemma in lemmas:
        uni_counts[lemma] = uni_counts.get(lemma, 0) + 1

    # n-grams are built from NLTK's tokenisation of the joined lemmas, as before.
    nltk_tokens = nltk.word_tokenize(' '.join(lemmas))

    bi_counts = d_bi[specialty]
    for bigram in nltk.bigrams(nltk_tokens):
        bi_counts[bigram] = bi_counts.get(bigram, 0) + 1

    tri_counts = d_tri[specialty]
    for trigram in nltk.trigrams(nltk_tokens):
        tri_counts[trigram] = tri_counts.get(trigram, 0) + 1

In [18]:
#d

In [20]:
# Export the frequency tables to Excel: one workbook per n-gram order, one
# sheet per specialty, keeping the most frequent entries of each table.
# (The former module-level `global writer` statement was a no-op and is gone.)
MAX_ROWS_PER_SHEET = 5000

exports = (
    ('words-segmentation-2.xlsx', d_uni),
    ('bigram-segmentation-2.xlsx', d_bi),
    ('trigram-segmentation-2.xlsx', d_tri),
)

for filename, counts_by_spec in exports:
    # auto_columns_width is a project helper; as used here it receives the
    # current writer (None before the first sheet) and returns the writer to
    # pass in for the next sheet.
    writer = None

    for spec, counts in counts_by_spec.items():
        if not counts:
            # An empty table yields a frame without an 'amount' column, and
            # sort_values would raise KeyError - skip the sheet instead.
            continue

        words = (
            pd.DataFrame.from_dict(counts, orient='index')
            .sort_index()
            .reset_index()
            .rename(columns={'index': 'word', 0: 'amount'})
            .sort_values(by=['amount'], ascending=False)
            .reset_index(drop=True)
            .head(MAX_ROWS_PER_SHEET)
        )
        writer = auto_columns_width(filename, words, spec, writer)

    # Nothing was written -> no writer was ever created, so nothing to save.
    if writer is not None:
        # NOTE(review): if `writer` is a pandas ExcelWriter, `.save()` is
        # deprecated in newer pandas in favour of `.close()` - confirm against
        # auto_columns_width before upgrading pandas.
        writer.save()

0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 49.3
1 amount 11.9
0 word 45.9
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 45.9
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
1 amount 11.9
0 word 51.0
