# VKM Dataset Opschoning
Dit notebook schoont de VKM-dataset op voor AI-training.

In [38]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import ast

# Make sure the Dutch stop-word corpus is available; it only has to be
# downloaded the first time this notebook runs on a machine.
try:
    stopwords.words('dutch')
except LookupError:
    nltk.download('stopwords', quiet=True)

# Read the raw dataset and report its dimensions (rows, columns).
df = pd.read_csv('Uitgebreide_VKM_dataset.csv')
print(f"Dataset geladen: {df.shape[0]} rijen, {df.shape[1]} kolommen")

Dataset geladen: 211 rijen, 20 kolommen


## Vervang lege/nutteloze waarden door 'Nog te bepalen'

In [39]:
# Replace empty / meaningless values in every text column with the
# placeholder 'Nog te bepalen'. The work is done per column with boolean
# masks instead of visiting each cell individually.
lege_tags = ['[]', "['ntb']", '["ntb"]', 'ntb']
lege_teksten = ['ntb', 'nog niet bekend', 'nog te formuleren']

for col in df.columns:
    if df[col].dtype == 'object':
        # Detect missing values on the raw column: converting NaN to a string
        # yields the literal 'nan', which none of the comparisons below match.
        # This also covers missing module_tags, which were previously skipped.
        ontbreekt = df[col].isna()

        # Compare on a trimmed, lower-cased copy so 'NTB', ' ntb ' and 'ntb'
        # are all recognised; the stored values themselves are not altered.
        genormaliseerd = df[col].astype(str).str.strip().str.lower()

        if col == 'module_tags':
            # Tag lists are stored as strings such as "[]" or "['ntb']".
            nutteloos = genormaliseerd.isin(lege_tags)
        else:
            nutteloos = (genormaliseerd == '') | genormaliseerd.isin(lege_teksten)

        df.loc[ontbreekt | nutteloos, col] = 'Nog te bepalen'

# Remove the colour columns; errors='ignore' keeps this cell re-runnable
# when (some of) the columns are already gone.
df.drop(columns=['Rood', 'Groen', 'Blauw', 'Geel'], inplace=True, errors='ignore')

# Report, per text column, how many values are the placeholder.
for col in df.columns:
    if df[col].dtype != 'object':
        continue
    ntb_count = df[col].eq('Nog te bepalen').sum()
    if ntb_count == 0:
        continue
    perc = round(ntb_count / len(df) * 100, 1)
    print(f"{col}: {ntb_count} ({perc}%)")

shortdescription: 30 (14.2%)
description: 4 (1.9%)
content: 4 (1.9%)
learningoutcomes: 55 (26.1%)
module_tags: 30 (14.2%)


## Vul shortdescription met description + content

In [40]:
def _bruikbare_tekst(kolom):
    """Return ``kolom`` as stripped strings, with unusable entries set to ''.

    An entry is unusable when it is missing (NaN) or when it is the placeholder
    'Nog te bepalen'. Without this, the placeholder text of `description` and
    `content` would be copied into `shortdescription` as if it were real text.
    """
    tekst = kolom.astype(str).str.strip()
    onbruikbaar = kolom.isna() | (tekst == 'Nog te bepalen')
    return tekst.mask(onbruikbaar, '')


ntb_voor = (df['shortdescription'] == 'Nog te bepalen').sum()
print(f"Vooraf shortdescription 'Nog te bepalen': {ntb_voor}")

# Build "description content" for every row; the outer strip removes the
# separator when only one of the two parts contains text.
samengevoegd = (
    _bruikbare_tekst(df['description']) + ' ' + _bruikbare_tekst(df['content'])
).str.strip()

# Only overwrite rows that still hold the placeholder AND for which usable text
# exists; rows without any source text keep 'Nog te bepalen' so they remain
# visible in the count below and in later reports.
te_vullen = (df['shortdescription'] == 'Nog te bepalen') & (samengevoegd != '')
df.loc[te_vullen, 'shortdescription'] = samengevoegd[te_vullen]

ntb_na = (df['shortdescription'] == 'Nog te bepalen').sum()
print(f"Na vullen shortdescription 'Nog te bepalen': {ntb_na}")

Vooraf shortdescription 'Nog te bepalen': 30
Na vullen shortdescription 'Nog te bepalen': 0


## Vul module_tags met keywords uit shortdescription

In [41]:
dutch_stops = set(stopwords.words('dutch'))
# Words left over from the placeholder text 'Nog te bepalen' must never
# become tags ('te' is already dropped by the minimum length).
uitgesloten_woorden = dutch_stops | {'nog', 'bepalen'}


def _extract_tags(text, excluded, min_length=3):
    """Return the unique keywords in ``text``, in order of first appearance.

    text: value to tokenise; a missing value (NaN/None) yields an empty list.
    excluded: collection of lower-case words that may not be used as tags.
    min_length: minimum number of characters a word needs to be kept.
    """
    if pd.isna(text):
        # str(NaN) would otherwise produce the bogus tag 'nan'.
        return []
    words = re.findall(r'\b\w+\b', str(text).lower())
    kept = (w for w in words if len(w) >= min_length and w not in excluded)
    # dict.fromkeys drops repeated words while preserving their order.
    return list(dict.fromkeys(kept))


ntb_voor = (df['module_tags'] == 'Nog te bepalen').sum()
print(f"Vooraf module_tags 'Nog te bepalen': {ntb_voor}")

# Remember which rows are modified (index labels of the placeholder rows).
aangepaste_rijen = df.index[df['module_tags'] == 'Nog te bepalen'].tolist()

for i in aangepaste_rijen:
    # Prefer keywords from the short description; fall back to the module name.
    tags = _extract_tags(df.loc[i, 'shortdescription'], uitgesloten_woorden)
    if not tags:
        tags = _extract_tags(df.loc[i, 'name'], uitgesloten_woorden)

    # Tags are stored as the string form of a list, like the existing values.
    df.loc[i, 'module_tags'] = str(tags) if tags else "['algemeen']"

ntb_na = (df['module_tags'] == 'Nog te bepalen').sum()
print(f"Na vullen module_tags 'Nog te bepalen': {ntb_na}")
print(f"Aantal aangepaste rijen: {len(aangepaste_rijen)}")

Vooraf module_tags 'Nog te bepalen': 30
Na vullen module_tags 'Nog te bepalen': 0
Aantal aangepaste rijen: 30


## Export en samenvatting

In [42]:
# Write the cleaned dataset to disk without the index column.
output_file = 'Opgeschoonde_VKM_dataset.csv'
df.to_csv(output_file, encoding='utf-8', index=False)

# Summarise which text columns still contain the placeholder value.
print("Resterende 'Nog te bepalen':")
for col in df.columns:
    if df[col].dtype != 'object':
        continue
    ntb_count = df[col].eq('Nog te bepalen').sum()
    if ntb_count == 0:
        continue
    perc = round(ntb_count / len(df) * 100, 1)
    print(f"  {col}: {ntb_count} ({perc}%)")

Resterende 'Nog te bepalen':
  description: 4 (1.9%)
  content: 4 (1.9%)
  learningoutcomes: 55 (26.1%)
