In [1]:
from docx import Document
import pandas as pd
import numpy as np
import re
import spacy as sp
import nltk
from nltk.stem.snowball import DanishStemmer
import lemmy

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def read_docx(file_path):
    doc = Document(file_path)
    text = []
    for paragraph in doc.paragraphs:
        text.append(paragraph.text)
    return ' '.join(text)

In [3]:
# Path to your .docx file
docx_file_path = '/Users/majastyrkandersen/Desktop/bispebjerg/interviews_merged.docx'

# Read the content of the .docx file
document_text = read_docx(docx_file_path)

In [4]:
# To lower case
lower_cased = document_text.lower()

In [5]:
# Remove punctuation
pattern = r'[^\w\s]'
cleaned = re.sub(pattern, '', lower_cased)

In [6]:
# Remove stopwords using spaCy

nlp = sp.load("da_core_news_sm")

In [7]:
# Rename un-tokenized data
text = cleaned

# Process the text data
doc = nlp(text)

In [8]:
# Remove stopwords

filtered_tokens = [token.text for token in doc if not token.is_stop]

In [9]:
# Remove names Oliver, Arthur, Sonja, and Charlotte
names = ["oliver", "sonja", "arthuer", "charlotte"]

tokens_without_names = list(filter(lambda x: x not in names, filtered_tokens))

In [10]:
# Normalizing the text (stemming and lemmatization) using NLTK

stemmer = DanishStemmer()
stemmed_tokens = [stemmer.stem(token) for token in tokens_without_names]

In [11]:
print(stemmed_tokens)

['regn', 'tag', 'cirka', '40', 'minut', 'plus', 'minus', 'afhæng', 'får', 'fyld', 'opfølg', 'spørgsmål', 'ting', 'bund', 'grund', 'bar', 'godt', 'hør', 'dit', 'arbejd', 'kommunik', 'patient', 'start', 'godt', 'hør', 'fortæl', 'fik', 'radiograf', 'arthur', 'jam', 'hed', 'arthur', '32', 'gang', '24', '23', 'vidst', 'helt', 'arbejd', 'haft', 'lyst', 'arbejd', 'mennesk', 'sid', 'kontor', 'snak', 'patient', 'gaml', 'mennesk', 'bar', 'kombination', 'samtid', 'gad', 'sygeplejersk', 'teknologisk', 'comput', 'åbent', 'hus', 'uddan', 'tænk', 'tja', 'godt', 'fremtid', 'prøved', 'kend', 'når', 'find', 'dit', 'hul', 'samfund', 'passed', 'bar', 'person', 'teknologisk', 'scanning', 'spænd', 'samtid', 'patient', 'sin', 'histori', 'tro', 'ensart', 'arbejd', 'patient', 'anderled', 'ensart', 'sur', 'glad', 'bar', 'normal', 'fik', 'tænk', 'okay', 'lav', 'typ', 'arbejd', 'udfordring', 'radiograf', 'god', 'oplev', 'selvfølg', 'lang', 'tid', 'radiograf', 'arthur', '6', 'år', 'radiograf', 'dit', 'arbejdsområd

In [12]:
# Create an instance of the standalone lemmatizer (without POS (Parts of Speech))
lemmatizer = lemmy.load("da")
lemmatized_tokens = [lemmatizer.lemmatize("", token) for token in stemmed_tokens]

In [13]:
print(lemmatized_tokens)

[['regne', 'regn'], ['tagge', 'tage', 'tag'], ['cirka'], ['40'], ['minut'], ['plusse', 'plus'], ['minusse', 'minus'], ['afhænge'], ['få', 'får'], ['fylde', 'fyld'], ['opfølge'], ['spørgsmål'], ['tinge', 'ting'], ['bunde', 'bund'], ['grunde', 'grund'], ['bar', 'bare', 'bære'], ['godt', 'godte', 'god'], ['hør', 'høre'], ['din', 'dit'], ['arbejde'], ['kommunik'], ['patient'], ['start', 'starte'], ['godt', 'godte', 'god'], ['hør', 'høre'], ['fortælle'], ['få'], ['radiograf'], ['arthur'], ['jamme'], ['hedde', 'hed'], ['arthur'], ['32'], ['gange', 'gang'], ['24'], ['23'], ['vide'], ['hel', 'helt'], ['arbejde'], ['have'], ['lyste', 'lyse', 'lys', 'lyst'], ['arbejde'], ['mennesk'], ['sid', 'sidde'], ['kontor'], ['snakke', 'snak'], ['patient'], ['gamle'], ['mennesk'], ['bar', 'bare', 'bære'], ['kombination'], ['samtid'], ['gide', 'gad'], ['sygeplejersk'], ['teknologisk'], ['comput', 'computte'], ['åben'], ['hus', 'hu', 'huse'], ['uddanne'], ['tænke'], ['tja'], ['godt', 'godte', 'god'], ['fremti

In [14]:
# We got 6800 tokens
number_of_elements = len(lemmatized_tokens)
print(number_of_elements)

6767
