In [None]:
from docx import Document

def is_heading(paragraph):
    """Tell whether a paragraph opens a new section.

    :param paragraph: object exposing ``style.name`` (e.g. a python-docx paragraph).
    :return: ``True`` when the style name is exactly ``'Heading'``, otherwise ``False``.
    """
    # NOTE(review): only a style literally named 'Heading' matches; numbered
    # styles such as 'Heading 1' would not -- confirm against the source document.
    # Returning the comparison itself yields a real bool for both outcomes,
    # instead of falling through to an implicit None for non-headings.
    return paragraph.style.name == 'Heading'


def iterate_document_sections(document):
    """Group a document's paragraphs into sections, one list per heading.

    Each yielded list starts with the paragraph that opened the section
    (a heading, or the document's very first paragraph whatever its style),
    followed by the non-heading paragraphs up to the next heading.

    :param document: object exposing a ``paragraphs`` sequence.
    :return: generator of paragraph lists; yields nothing for an empty document.
    """
    # Read the attribute once and reuse the same list for indexing and slicing.
    all_paragraphs = document.paragraphs
    if not all_paragraphs:
        # Nothing to group; indexing [0] here would raise IndexError.
        return

    section = [all_paragraphs[0]]
    for paragraph in all_paragraphs[1:]:
        if is_heading(paragraph):
            # A heading closes the section collected so far and opens a new one.
            yield section
            section = [paragraph]
        else:
            section.append(paragraph)
    # Emit the trailing section, which no later heading terminates.
    yield section


def create_document_from_paragraphs(paragraphs, verbose=False):
    """Build a new document containing the text of the given paragraphs.

    Only each paragraph's ``text`` is passed to ``add_paragraph``; the
    document is returned unsaved so the caller chooses the output path.

    :param paragraphs: iterable of objects exposing ``text``.
    :param verbose: when true, also print each paragraph's text. Off by
        default so that splitting a whole document does not flood the output.
    :return: the newly created ``Document``.
    """
    new_doc = Document()
    for paragraph in paragraphs:
        text = paragraph.text
        new_doc.add_paragraph(text)
        if verbose:
            print(text)
    return new_doc
    
# This cell imports only ``Document`` (``from docx import Document``); the bare
# module name ``docx`` is not bound here, so ``docx.Document(...)`` would raise
# NameError on a fresh kernel.
document = Document('/Users/oskar/Python/small_projects/Syndromes_Flash_Cards/Essential genetic syndromes.docx')

# Save every section to its own file, numbered from 1 (new1.docx, new2.docx, ...).
# NOTE(review): the 'separated/' directory presumably has to exist beforehand -- confirm.
for section_number, paragraphs in enumerate(iterate_document_sections(document), start=1):
    create_document_from_paragraphs(paragraphs).save('separated/new' + str(section_number) + '.docx')

In [115]:
from docx import Document
import re
import pandas as pd
import glob
# Parallel lists: syndrome_name[i] is the text of the first paragraph of file i,
# information_all[i] is the list of texts of that file's remaining paragraphs.
syndrome_name = []
information_all = []

# glob returns matches in arbitrary order; sorting the paths makes the row
# order reproducible between runs and machines (lexicographic by file name).
for path in sorted(glob.glob('/Users/oskar/Python/small_projects/Syndromes_Flash_Cards/separated/*.docx')):
    # Read the paragraph list once and reuse it for both the name and the body.
    file_paragraphs = Document(path).paragraphs
    syndrome_name.append(file_paragraphs[0].text)
    information_all.append([para.text for para in file_paragraphs[1:]])

len(syndrome_name)

162

In [None]:
# Sanity check: print how many fields each syndrome's text was split into.
# Reads information_final, so the cell that builds it has to run before this one.
for label, fields in zip(syndrome_name, information_final):
    print(label, len(fields))

In [116]:
# Section headings as they occur in the joined text; each match starts a new field.
# The pattern is unchanged. Note that 'Others,' has no trailing space, unlike the
# other alternatives, which is why each field is stripped below.
section_pattern = re.compile('Genetics, |Clinical findings/Dysmorphic features, |Etiology, |Pathogenesis, |Genetic testing/diagnosis, |Others,')

# Join each file's paragraphs into one string and cut it at the headings.
# Stripping removes the leading space left in the 'Others' field and the
# trailing space left on the others (the trailing comma from the join is kept).
information_final = [
    [field.strip() for field in section_pattern.split(', '.join(element))]
    for element in information_all
]

# Whatever precedes the first heading lands in 'empty' and is discarded.
column_names = ['empty','Genetics','Clinical findings/Dysmorphic features','Etiology', 'Pathogenesis','Genetic testing/diagnosis','Others']
# drop() returns a new frame, so no in-place mutation is needed.
df = pd.DataFrame(information_final, columns=column_names).drop(columns='empty')
df['Syndrome name'] = syndrome_name
syndromes_main = df['Syndrome name']

In [117]:
# Load the per-syndrome category and image paths from the semicolon-separated file,
# keeping only the four columns used downstream.
df_figures = pd.read_csv(
    'syndrome_category_wordcloud_figure.csv',
    sep=';',
    usecols=[
        'Syndrome name',
        'Category',
        'png links',
        'figure',
    ],
)
syndromes_fig = df_figures['Syndrome name']

In [120]:
# Display the loaded figures table for a visual check.
df_figures

Unnamed: 0,Syndrome name,Category,png links,figure
0,Multiple endocrine neoplasia type 1 (MEN1),Oncologic Disorders,wordcloud_genereviews/Multiple endocrine neopl...,figures/MEN1.png
1,Dyskeratosis congenita,Oncologic Disorders,wordcloud_genereviews/Dyskeratosis congenita.png,figures/Dyskeratosis_congenita.jpeg
2,Fragile X,Neurological disorders,wordcloud_genereviews/Fragile X.png,figures/Fragile_X.png
3,Familial Adenomatous Polyposis,Oncologic Disorders,wordcloud_genereviews/Familial Adenomatous Pol...,figures/FAP.png
4,Urea cycle disorders,Biochemical Disorders,wordcloud_genereviews/Urea cycle disorders.png,figures/Urea_Cycle.jpg
...,...,...,...,...
157,Polycystic Kidney Disease (AD and AR),Renal Disorders,wordcloud_genereviews/Polycystic Kidney Diseas...,figures/PKD.png
158,Acute intermittent porphyria (AIP),Hematologic Disorders,wordcloud_genereviews/Acute intermittent porph...,figures/AIP.png
159,Ehlers-Danlos syndrome hypermobility (type III),Connective Tissue Disorders,wordcloud_genereviews/Ehlers-Danlos syndrome h...,figures/hypermobility-beighton-scoring-system-...
160,Tyrosinemia type I,Biochemical Disorders,wordcloud_genereviews/Tyrosinemia type I.png,figures/Tyrosinemia type I.png


In [118]:
# Syndrome names present in the figures table but absent from the main table;
# an empty list means every figures row has a matching name in the main table.
list(set(syndromes_fig) - set(syndromes_main))

[]

In [119]:
# Combine text and figure tables on the syndrome name; the inner join keeps
# only names found in both tables. Persist the result, then show it.
df_final = df.merge(df_figures, how='inner', on='Syndrome name')
df_final.to_csv('df_for_app.csv')
df_final

Unnamed: 0,Genetics,Clinical findings/Dysmorphic features,Etiology,Pathogenesis,Genetic testing/diagnosis,Others,Syndrome name,Category,png links,figure
0,"-Gene: MEN1 (Menin; 11q13), -AD,",-Varying combinations of >20 endocrine and non...,"-Prevalence 1:10,000 to 1:100,000,",-Menin mainly in nucleus; expressed in all tis...,-Diagnosis: identification of one or both of t...,-MEN1 is tumor suppressor that follows Knudso...,Multiple endocrine neoplasia type 1 (MEN1),Oncologic Disorders,wordcloud_genereviews/Multiple endocrine neopl...,figures/MEN1.png
1,"-Genes: MT-TL1 (>80%); MT-ND5 (<10%), -Materna...",-Multisystem disorder; onset between 2 and 40y...,"-Prevalence estimated to be 0.2:100,000 in Jap...",-11 mt-tRNAs (mainly MT-TL1) involved in MELAS...,-Diagnosis based on clinical diagnostic criter...,"-mtDNA encodes 22 tRNAs, -During acute stroke...","MELAS (mitochondrial encephalomyopathy, lactic...",Mitochondrial Disorders,wordcloud_genereviews/MELAS (mitochondrial enc...,figures/MELAS.png
2,"-FMR-1 (FMRP, Fragile X Mental Retardation Pro...","1) Fragile X syndrome:, -FMR1 full mutation or...","-16 to 25:100,000 males affected with fragile ...",->200 repeats lead to silencing by methylation...,- >99% with increased number of CGG trinucleot...,-CGG repeats expand exclusively during transm...,Fragile X,Neurological disorders,wordcloud_genereviews/Fragile X.png,figures/Fragile_X.png
3,-Gene: APC (Adenomatous polyposis coli protein...,"1) FAP:, -Colon cancer predisposition syndrome...","-Prevalence of FAP: 1:7,000 to 1:30,000 live b...",-Pathogenic APC variants produce usually trunc...,-APC-associated polyposis condition should be ...,-Colorectal screening beginning at age 10-12 ...,Familial Adenomatous Polyposis,Oncologic Disorders,wordcloud_genereviews/Familial Adenomatous Pol...,figures/FAP.png
4,-Gene: GCDH (glutaryl-CoA dehydrogenase; 19p13...,-Macrocephaly at birth (75%); acute encephalop...,"-Prevalence: 1 in 100,000, -Prevalence in Amis...",-Deficiency in glutaryl-CoA dehydrogenase: lys...,"-Elevated glutaric acid, 3-hydroxyglutaric aci...","-Sarah’s painting Ruthie’s prayer, -Bleeding ...",Glutaric acidemia Type I,Biochemical Disorders,wordcloud_genereviews/Glutaric acidemia Type I...,figures/GA1.png
...,...,...,...,...,...,...,...,...,...,...
157,-Gene: PKD1/PKD2 (Polycystin-1; 16p13.1/Polycy...,"1) ADPKD:, -Generally late-onset multisystem d...",-ADPKD: most common potentially lethal single-...,-PKD-related proteins are involved with functi...,-ADPKD: PKD1 (78% of cases; 97%/3%); PKD2 (12%...,-PKD2 mutations show later onset and slower r...,Polycystic Kidney Disease (AD and AR),Renal Disorders,wordcloud_genereviews/Polycystic Kidney Diseas...,figures/PKD.png
158,"-HMBS (11q23.3), -AD; only 1% de novo; low pen...","-Onset after puberty, -Life-threatening acute ...","-5 in 10,000 (but penetrance is only ~1%),",-Partial deficiency of porphobilinogen deamina...,-Increased urine ALA and porphobilinogen (PBG)...,-Urine may be reddish-brown or red; color is ...,Acute intermittent porphyria (AIP),Hematologic Disorders,wordcloud_genereviews/Acute intermittent porph...,figures/AIP.png
159,"-Genes unknown, -AD,",-Joint hypermobility; recurrent joint dislocat...,"-Prevalence estimates ranging between 1:5,000 ...","-Abnormal dermal elastic fibers,",-No biochemical or genetic tests clinically av...,-Least severe type of EDS,Ehlers-Danlos syndrome hypermobility (type III),Connective Tissue Disorders,wordcloud_genereviews/Ehlers-Danlos syndrome h...,figures/hypermobility-beighton-scoring-system-...
160,-Gene: ARSA (Arylsulfatase A) or Saposin B (Ac...,"-Late-infantile: onset < 30 mths; weakness, hy...","-Prevalence between 1:40,000 and 1:160,000,",-Lysosomal Sphingolipidosis; arylsulfatase A (...,-Progressive neurologic dysfunction --> MRI ev...,-Pseudodeficiency (5-15% of normal activity) ...,Arylsulfatase A Deficiency (Metachromatic Leuk...,Biochemical Disorders,wordcloud_genereviews/Arylsulfatase A Deficien...,figures/Arylsulfatase.png
