In [None]:
from docx import Document

def is_heading(paragraph):
    """Report whether a paragraph uses the style named ``'Heading'``.

    :param paragraph: object exposing ``style.name`` (e.g. a python-docx paragraph).
    :return: ``True`` when the style name is exactly ``'Heading'``, otherwise ``False``.
    """
    # Return the comparison itself so non-headings give an explicit False
    # instead of falling off the end of the function and returning None.
    # NOTE(review): only a style literally called 'Heading' matches; numbered
    # styles such as 'Heading 1' do not -- confirm this fits the source file.
    return paragraph.style.name == 'Heading'


def iterate_document_sections(document):
    """Yield the document's paragraphs grouped into headed sections.

    A new group starts at every paragraph for which ``is_heading`` is true.
    The first group always starts at the first paragraph, whether or not that
    paragraph is a heading. A document without paragraphs yields nothing.

    :param document: object exposing a ``paragraphs`` sequence (Current Comet Issue).
    :return: generator of lists of paragraphs, one list per section (an article).
    """
    # Read the attribute once so the emptiness check and the loop below work
    # on the same list.
    all_paragraphs = list(document.paragraphs)
    if not all_paragraphs:
        # Nothing to group; indexing [0] here would raise IndexError.
        return

    section = [all_paragraphs[0]]
    for paragraph in all_paragraphs[1:]:
        if is_heading(paragraph):
            # A heading closes the running section and opens the next one.
            yield section
            section = [paragraph]
        else:
            section.append(paragraph)
    # Emit the trailing section, which no later heading closes.
    yield section


def create_document_from_paragraphs(paragraphs):
    """Build a new document holding the text of the given paragraphs.

    Each paragraph's text is also printed as it is copied. The document is
    only built in memory; saving it is left to the caller.

    :param paragraphs: iterable of objects exposing ``text`` (one article).
    :return: New document with a single article, one paragraph per input paragraph.
    """
    new_doc = Document()
    for paragraph in paragraphs:
        new_content = paragraph.text
        # Only the text is passed on; no style argument is given for the copy.
        new_doc.add_paragraph(new_content)
        print(new_content)
    return new_doc
    
# Split the source document into one .docx file per headed section.
# Use the imported `Document` name: the bare module name `docx` is never
# imported in this notebook, so `docx.Document` raises NameError.
document = Document('/Users/oskar/Python/small_projects/Syndromes_Flash_Cards/Essential genetic syndromes.docx')
# `n` numbers the output files from 1 and stays 0 when no section is produced.
# NOTE(review): assumes the relative 'separated' directory already exists -- confirm.
n = 0
for n, paragraphs in enumerate(iterate_document_sections(document), start=1):
    create_document_from_paragraphs(paragraphs).save('separated/new' + str(n) + '.docx')

In [13]:
from docx import Document
import re
import pandas as pd
import glob
# For every separated article file, keep the text of its first paragraph as
# the syndrome name and the texts of all remaining paragraphs as its body.
syndrome_name, information_all = [], []
for docx_path in glob.glob('/Users/oskar/Python/small_projects/Syndromes_Flash_Cards/separated/*.docx'):
    document = Document(docx_path)
    article_paragraphs = document.paragraphs
    syndrome_name.append(article_paragraphs[0].text)
    information_all.append([body.text for body in article_paragraphs[1:]])

# Number of article files that were read.
len(syndrome_name)

163

In [None]:
# Print each syndrome name next to the number of fields its text was split into.
# NOTE: `information_final` is assigned in the cell that follows this one, so
# that cell has to be executed before this loop can run.
for title, fields in zip(syndrome_name, information_final):
    print(title, len(fields))

In [15]:
# Join each article's paragraphs with ', ' and cut the result at the section
# labels. The text before the first label becomes the first piece, which is
# why the first column is a throwaway named 'empty'.
# NOTE(review): the 'Others,' alternative has no trailing space, unlike the
# other labels, so values in that column keep a leading space -- confirm intended.
section_splitter = re.compile('Genetics, |Clinical findings/Dysmorphic features, |Etiology, |Pathogenesis, |Genetic testing/diagnosis, |Others,')
information_final = [
    section_splitter.split(', '.join(article_lines))
    for article_lines in information_all
]

column_names = [
    'empty',
    'Genetics',
    'Clinical findings/Dysmorphic features',
    'Etiology',
    'Pathogenesis',
    'Genetic testing/diagnosis',
    'Others',
]
df = pd.DataFrame(information_final, columns=column_names).drop(columns='empty')
df['Syndrome name'] = syndrome_name
df.to_csv('test.csv')
# Spot-check a single row.
df.iloc[5]

Genetics                                  -Gene: GALC (Galactocerebrocidase, 14q31), -AR, 
Clinical findings/Dysmorphic features    1) Infantile-onset (onset <12 months): progres...
Etiology                                           -1:250,000 in US; 1:100,000 in Europe, 
Pathogenesis                             -Galactocerebrosidase: liposomal hydrolysis of...
Genetic testing/diagnosis                -More than 200 pathogenic variants, 30-kb dele...
Others                                    -On NBS --> HSCT decreases morbidity and mort...
Syndrome name                             Galactocerebrosidase deficiency (Krabbe Disease)
Name: 5, dtype: object

In [16]:
import glob

# Paths of every word-cloud PNG; glob.glob already returns a list, so no
# element-by-element copy is needed.
pngs_link = glob.glob("wordcloud_genereviews/*.png")
pngs_link

['wordcloud_genereviews/Waardenburg syndrome.png',
 'wordcloud_genereviews/Fanconi anemia.png',
 'wordcloud_genereviews/Congenital contractural arachnodactly (Beals syndrome).png',
 'wordcloud_genereviews/Factor V Leiden Thrombophilia.png',
 'wordcloud_genereviews/Smith - Lemli- Opitz.png',
 'wordcloud_genereviews/Huntington Disease.png',
 'wordcloud_genereviews/Propionic Acidemia.png',
 'wordcloud_genereviews/Alpha-1 Antitrypsin Deficiency.png',
 'wordcloud_genereviews/X-linked agammaglobulinemia.png',
 'wordcloud_genereviews/Glutaric acidemia Type I.png',
 'wordcloud_genereviews/Hereditary Neuropathy with Liability to Pressure Palsies.png',
 'wordcloud_genereviews/Friedreich Ataxia.png',
 'wordcloud_genereviews/Diastrophic Dysplasia.png',
 'wordcloud_genereviews/Neurofibromatosis type 2.png',
 'wordcloud_genereviews/Limb-Girdle Muscular Dystrophy.png',
 'wordcloud_genereviews/Familial Mediterranean Fever.png',
 'wordcloud_genereviews/X-linked adrenal hypoplasia congenita.png',
 'word

In [None]:
from pathlib import Path

# Derive the syndrome name from each PNG path.
# Path.name / Path.stem drop the directory and only the final extension, so
# the result does not depend on the path separator and a '.png' occurring
# inside a file name is left untouched (str.replace removed every occurrence).
pngs = [Path(link).name for link in pngs_link]        # file name with extension
pngs_final = [Path(link).stem for link in pngs_link]  # file name without extension
pngs_final

In [18]:
import pandas as pd
# Pair every derived syndrome name with the path of its word-cloud image,
# one row per PNG file.
name_link_rows = list(zip(pngs_final, pngs_link))
df_links = pd.DataFrame(name_link_rows, columns=['Syndrome name', 'png links'])

In [None]:
# Load the syndrome table from the semicolon-delimited CSV file.
# NOTE: this rebinds `df`, a name that the cell building the table from the
# split paragraphs also assigns.
df = pd.read_csv(
    'Syndromes_for_df.csv',
    delimiter=';',
)
df

In [36]:
# Left-join the image paths onto the syndrome table by 'Syndrome name';
# every row of `df` is kept, with or without a matching PNG.
df_merged = pd.merge(df, df_links, how='left', on='Syndrome name')
df_merged.to_csv('syndromes_for_df_png.csv')

In [37]:
# Read the figure table and copy its 'figure' column onto the merged frame.
df_figures = pd.read_csv(
    'category_wordcloud_figure.csv',
    sep=';',
    usecols=['Category', 'png links', 'figure'],
)
figure_links = df_figures.loc[:, 'figure']
# NOTE(review): this assignment lines rows up by index label, not by syndrome
# name -- presumably both tables share the same row order; confirm.
df_merged['figure'] = figure_links
df_merged.to_csv('df_for_app.csv')

Unnamed: 0.1,Unnamed: 0,Genetics,Clinical findings/Dysmorphic features,Etiology,Pathogenesis,Genetic testing/diagnosis,Others,Syndrome name,Category,png links,figure
0,0,"-Gene: MEN1 (Menin; 11q13), -AD,",-Varying combinations of >20 endocrine and non...,"-Prevalence 1:10,000 to 1:100,000,",-Menin mainly in nucleus; expressed in all tis...,-Diagnosis: identification of one or both of t...,-MEN1 is tumor suppressor that follows Knudson...,Multiple endocrine neoplasia type 1 (MEN1),Oncologic Disorders,wordcloud_genereviews/Multiple endocrine neopl...,figures/MEN1.png
1,1,-Genes: DKC1 (XL; 20-25%); TINF2 (AD; 12-20%);...,-1) dysplastic nails; 2) lacy reticular pigmen...,"-Rare, 2015: 400 families,",-TTAGGG nucleotide repeats fold back to create...,-Individuals with DC have abnormally short tel...,,Dyskeratosis congenita,Oncologic Disorders,wordcloud_genereviews/Dyskeratosis congenita.png,
2,2,"-FMR-1 (FMRP, Fragile X Mental Retardation Pro...","1) Fragile X syndrome:, -FMR1 full mutation or...","-16 to 25:100,000 males affected with fragile ...",->200 repeats lead to silencing by methylation...,- 99% with increased number of CGG trinucleoti...,-CGG repeats expand exclusively during transm...,Fragile X,Neurological disorders,wordcloud_genereviews/Fragile X.png,figures/Fragile_X.png
3,3,-Gene: APC (Adenomatous polyposis coli protein...,"1) FAP:, -Colon cancer predisposition syndrome...","-Prevalence of FAP: 1:7,000 to 1:30,000 live b...",-Pathogenic APC variants produce usually trunc...,-APC-associated polyposis condition should be ...,-Colorectal screening beginning at age 10-12 y...,Familial Adenomatous Polyposis,Oncologic Disorders,wordcloud_genereviews/Familial Adenomatous Pol...,figures/FAP.png
4,4,-Five catalytic enzymes: 1) CPS1 (Carbamoylpho...,1) NAGS deficiency: mimic of CPS1 deficiency (...,"-UCDs is estimated to be at least 1:35,000 bir...",-NH3 is detoxificated to glutamine inc. gluta...,1) Plasma NH3 of > 150 μmol/L (with nl anion g...,,Urea cycle disorders,Biochemical Disorders,wordcloud_genereviews/Urea cycle disorders.png,figures/Urea_Cycle.jpg
...,...,...,...,...,...,...,...,...,...,...,...
158,158,-Gene: PKD1/PKD2 (Polycystin-1; 16p13.1/Polycy...,"1) ADPKD:, -Generally late-onset multisystem d...",-ADPKD: most common potentially lethal single-...,-PKD-related proteins are involved with functi...,-ADPKD: PKD1 (78% of cases; 97%/3%); PKD2 (12%...,-PKD2 mutations show later onset and slower ra...,Polycystic Kidney Disease (AD and AR),Renal Disorders,,figures/PKD.png
159,159,"-HMBS (11q23.3), -AD; only 1% de novo; low pen...","-Onset after puberty, -Life-threatening acute ...","-5 in 10,000 (but penetrance is only ~1%),",-Partial deficiency of porphobilinogen deamina...,-Increased urine ALA and porphobilinogen (PBG)...,-Urine may be reddish-brown or red; color is ...,Acute intermittent porphyria (AIP),Hematologic Disorders,wordcloud_genereviews/Acute intermittent porph...,figures/AIP.png
160,160,"-Genes unknown, -AD,",-Joint hypermobility; recurrent joint dislocat...,"-Prevalence estimates ranging between 1:5,000 ...","-Abnormal dermal elastic fibers,",-No biochemical or genetic tests clinically av...,-Least severe type of EDS,Ehlers-Danlos syndrome hypermobility (type III),Connective Tissue Disorders,wordcloud_genereviews/Ehlers-Danlos syndrome h...,figures/hypermobility-beighton-scoring-system-...
161,161,"-Gene: FAH (fumarylacetoacetase), -AR,",-Untreated: young infants with severe liver in...,"-1 in 100,000; in general US population, carri...",-FAH is terminal enzyme in the tyrosine catabo...,-NBS: presence of succinylacetone (MS/MS): pat...,"-Treatment: , 1) Nitisinone/NTBC (blocks p-HPP...",Tyrosinemia type I,Biochemical Disorders,wordcloud_genereviews/Tyrosinemia type I.png,figures/Tyrosinemia type I.png


In [33]:

# Show `df_merged` as this cell's output.
# NOTE(review): the saved output under this cell is a Series named 'figure'
# (length 164), not the frame itself, so it looks stale -- confirm by re-running.
df_merged

0                                          figures/MEN1.png
1                                                       NaN
2                                     figures/Fragile_X.png
3                                           figures/FAP.png
4                                    figures/Urea_Cycle.jpg
                                ...                        
159                                         figures/AIP.png
160       figures/hypermobility-beighton-scoring-system-...
161                          figures/Tyrosinemia type I.png
162                                figures/Cri-du-Chat.jpeg
figure    0                                       figure...
Name: figure, Length: 164, dtype: object

In [32]:
# Show `df_merged` as this cell's output.
# NOTE(review): the saved output under this cell is a Series named 'figure'
# (length 163), not the frame itself, so it looks stale -- confirm by re-running.
df_merged

0                                       figures/MEN1.png
1                                                    NaN
2                                  figures/Fragile_X.png
3                                        figures/FAP.png
4                                 figures/Urea_Cycle.jpg
                             ...                        
158                                      figures/PKD.png
159                                      figures/AIP.png
160    figures/hypermobility-beighton-scoring-system-...
161                       figures/Tyrosinemia type I.png
162                             figures/Cri-du-Chat.jpeg
Name: figure, Length: 163, dtype: object