In [1]:
import os
import re
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import shap

# Helper function to extract data from text files
def load_data(file_pattern, base_path):
    data = []
    record_ids = []
    for root, _, files in os.walk(base_path):
        for file in files:
            if re.match(file_pattern, file):
                with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                    data.append(f.read())
                match = re.search(r'(Process-rec-\d{3})', file)
                if match:
                    record_ids.append(match.group(1))
    return data, record_ids

# Helper function to load labels
def load_labels(label_file):
    df = pd.read_csv(label_file)
    return df.set_index('Record-ID')[['Class', 'Age', 'Converted-MMSE']]

# Feature engineering: Calculate text and timing features
def count_interrupters_and_clean_text(text):
    interrupter_count = len(re.findall(r'\(.*?\)', text))  # Count interruptions in parentheses
    cleaned_text = re.sub(r'\(.*?\)', '', text)  # Remove parentheses content
    return interrupter_count, cleaned_text

def find_repetitions_and_timing(text):
    interrupter_count, cleaned_text = count_interrupters_and_clean_text(text)
    words = re.findall(r'\b\w+\b', cleaned_text)
    stutter_words = len(re.findall(r'\b(er|um|oh|ah|mm)\b', cleaned_text))
    meaningful_words = len(words) - stutter_words
    unique_words = len(set(words))
    word_count = len(words)
    return {
        'word_count': word_count,
        'stutter_words': stutter_words,
        'meaningful_words': meaningful_words,
        'unique_words': unique_words,
        'interrupter_count': interrupter_count,
    }

def extract_features(texts):
    return pd.DataFrame([find_repetitions_and_timing(text) for text in texts])

# Load and combine data
base_path = "../process/PROCESS-V1/"
file_pattern = r"Process-rec-\d+__CTD\.txt"
data, record_ids = load_data(file_pattern, base_path)

label_file = "../process/PROCESS-V1/dem-info-filled-mmse-score.csv"
labels = load_labels(label_file)

combined_data = []
for text, record_id in zip(data, record_ids):
    if record_id in labels.index:
        row = labels.loc[record_id]
        combined_data.append({
            'Record-ID': record_id,
            'Text': text,
            'Class': row['Class'],
            'Age': row['Age'],
            'Converted-MMSE': row['Converted-MMSE'],
        })

df = pd.DataFrame(combined_data)

# Entferne fehlerhafte Daten
df = df[df['Record-ID'] != 'Process-rec-071'].reset_index(drop=True)

# Entferne fehlende Daten
df.dropna(inplace=True)

# Zielvariable und Features
X_text = df['Text']
X_numeric = df[['Age']]
y = df['Converted-MMSE']

# Split data
X_train_text, X_test_text, X_train_numeric, X_test_numeric, y_train, y_test = train_test_split(
    X_text, X_numeric, y, test_size=0.2, random_state=42
)

# Word Embeddings with TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

# Train Word2Vec model
sentences = [text.split() for text in X_train_text]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4, seed=42)

def text_to_embedding(text, model):
    words = text.split()
    embeddings = [model.wv[word] for word in words if word in model.wv]
    if embeddings:
        return pd.Series(embeddings).mean(axis=0)
    else:
        return pd.Series([0] * model.vector_size)

X_train_text_features = pd.DataFrame([text_to_embedding(text, word2vec_model) for text in X_train_text])
X_test_text_features = pd.DataFrame([text_to_embedding(text, word2vec_model) for text in X_test_text])

# Extract calculated features
X_train_calculated_features = extract_features(X_train_text).reset_index(drop=True)
X_test_calculated_features = extract_features(X_test_text).reset_index(drop=True)

# Combine features
X_train_combined = pd.concat([
    X_train_text_features,
    X_train_calculated_features,
    X_train_numeric.reset_index(drop=True)
], axis=1)

X_test_combined = pd.concat([
    X_test_text_features,
    X_test_calculated_features,
    X_test_numeric.reset_index(drop=True)
], axis=1)

# Ensure column names are strings
X_train_combined.columns = X_train_combined.columns.astype(str)
X_test_combined.columns = X_test_combined.columns.astype(str)

# XGBoost with advanced hyperparameters
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('regressor', XGBRegressor(random_state=42))
])

param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [3, 5, 7],
    'regressor__learning_rate': [0.01, 0.1, 0.2],
    'regressor__subsample': [0.8, 1.0],
    'regressor__colsample_bytree': [0.8, 1.0],
    'regressor__gamma': [0, 0.1, 0.2],
    'regressor__min_child_weight': [1, 5, 10],
    'regressor__reg_alpha': [0, 0.1, 1],
    'regressor__reg_lambda': [1, 1.5, 2]
}

grid_search = GridSearchCV(pipeline, param_grid, scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train_combined, y_train)

# Best model evaluation
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Predictions
y_pred = best_model.predict(X_test_combined)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
explained_var = explained_variance_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R²): {r2}")
print(f"Explained Variance Score: {explained_var}")


Fitting 5 folds for each of 8748 candidates, totalling 43740 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'regressor__colsample_bytree': 1.0, 'regressor__gamma': 0, 'regressor__learning_rate': 0.01, 'regressor__max_depth': 3, 'regressor__min_child_weight': 10, 'regressor__n_estimators': 200, 'regressor__reg_alpha': 0.1, 'regressor__reg_lambda': 1, 'regressor__subsample': 0.8}
Mean Squared Error (MSE): 4.729161210423399
Mean Absolute Error (MAE): 1.794454574584961
R-squared (R²): -0.12124590865329021
Explained Variance Score: -0.12107280819519284
