In [30]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer


# substitute this later by data from Ans
texts = [
    "Data Science is the interdisciplinary field that uses scientific methods, algorithms, processes, and systems to extract knowledge and insights from structured and unstructured data.",
    "Knowledge Engineering is a subfield of artificial intelligence (AI) that focuses on developing and maintaining knowledge-based systems, which use domain-specific knowledge to solve complex problems.",
    "In Data Science, machine learning algorithms are often used to build predictive models and make data-driven decisions.",
    "Knowledge Engineering involves the creation and management of knowledge bases, ontologies, and expert systems to facilitate knowledge representation and reasoning.",
    "Data Scientists use programming languages like Python, R, and Julia to analyze data, create visualizations, and build machine learning models.",
    "Knowledge Engineers work on designing knowledge acquisition systems, defining ontologies, and developing inference engines to enable automated reasoning.",
    "Data Science plays a crucial role in various industries, including healthcare, finance, marketing, and technology, by leveraging data to improve decision-making processes.",
    "Knowledge Engineering is fundamental in developing expert systems that can provide expert-level advice and decision support in specialized domains.",
]


# Stopwords list, yknow common words like uhh.. in the and, you get the point
stop_words = set(stopwords.words('english'))

# stemmer and lemmatizer, this reduces everything to their stem by edit/removing parts of the word. choose either stemmer or lemmatizer.. 
# look up wat the difference is if you dunno but they do the same thing, in a different way. Lemmatizer better tho

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    
    # lowercasing
    text = text.lower()
    
    # remove punctuation and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    
    # tokenize with NLTK
    tokens = word_tokenize(text)
    
    # get rid of stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # option 1: stemmer
    # tokens = [stemmer.stem(word) for word in tokens]
    # option 2: lemmatizer
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # there's more that we can do to make it fit our data, which we still need
    return ' '.join(tokens)  # Return as a string for the vectorizer

# preproces line by line

preprocessed_texts = [preprocess_text(sentence) for sentence in texts]

vectorizer = CountVectorizer(binary=True)  # binary=True will ensure the result is one-hot encoded
sparse_matrix = vectorizer.fit_transform(preprocessed_texts)

for textin, textclean in zip(texts, preprocessed_texts):
    print(textin)
    print("clean and tokenized:", textclean)
    print('')
    


Data Science is the interdisciplinary field that uses scientific methods, algorithms, processes, and systems to extract knowledge and insights from structured and unstructured data.
clean and tokenized: data science interdisciplinary field us scientific method algorithm process system extract knowledge insight structured unstructured data

Knowledge Engineering is a subfield of artificial intelligence (AI) that focuses on developing and maintaining knowledge-based systems, which use domain-specific knowledge to solve complex problems.
clean and tokenized: knowledge engineering subfield artificial intelligence ai focus developing maintaining knowledgebased system use domainspecific knowledge solve complex problem

In Data Science, machine learning algorithms are often used to build predictive models and make data-driven decisions.
clean and tokenized: data science machine learning algorithm often used build predictive model make datadriven decision

Knowledge Engineering involves the cr

In [32]:
for sentence in preprocessed_texts:
    word_counts = Counter(sentence.split())
    
    freq_str = ', '.join([f"{word}: {count}" for word, count in word_counts.items()])
    print(freq_str)
    print("")



data: 2, science: 1, interdisciplinary: 1, field: 1, us: 1, scientific: 1, method: 1, algorithm: 1, process: 1, system: 1, extract: 1, knowledge: 1, insight: 1, structured: 1, unstructured: 1

knowledge: 2, engineering: 1, subfield: 1, artificial: 1, intelligence: 1, ai: 1, focus: 1, developing: 1, maintaining: 1, knowledgebased: 1, system: 1, use: 1, domainspecific: 1, solve: 1, complex: 1, problem: 1

data: 1, science: 1, machine: 1, learning: 1, algorithm: 1, often: 1, used: 1, build: 1, predictive: 1, model: 1, make: 1, datadriven: 1, decision: 1

knowledge: 3, engineering: 1, involves: 1, creation: 1, management: 1, base: 1, ontology: 1, expert: 1, system: 1, facilitate: 1, representation: 1, reasoning: 1

data: 2, scientist: 1, use: 1, programming: 1, language: 1, like: 1, python: 1, r: 1, julia: 1, analyze: 1, create: 1, visualization: 1, build: 1, machine: 1, learning: 1, model: 1

knowledge: 2, engineer: 1, work: 1, designing: 1, acquisition: 1, system: 1, defining: 1, ontolog