In [23]:
import os
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

# Helper function to extract data from text files
def load_data(file_pattern, base_path):
    data = []
    record_ids = []
    for root, _, files in os.walk(base_path):
        for file in files:
            if re.match(file_pattern, file):
                with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                    data.append(f.read())
                match = re.search(r'(Process-rec-\d{3})', file)
                if match:
                    record_ids.append(match.group(1))
    return data, record_ids

# Helper function to load MMSE labels and demographic information
def load_labels(label_file):
    df = pd.read_csv(label_file)
    df['label'] = df['Class']  # Use 'Class' as the label
    return df.set_index('Record-ID')[['label', 'Age', 'Gender', 'Converted-MMSE']].to_dict(orient='index')

# Load data
base_path = "../process/PROCESS-V1/"
file_pattern = r"Process-rec-\d+__CTD\.txt"
data, record_ids = load_data(file_pattern, base_path)

label_file = "../process/PROCESS-V1/dem-info-filled-mmse-score.csv"
labels = load_labels(label_file)

# Combine text data with demographic data
combined_data = []
for text, record_id in zip(data, record_ids):
    if record_id in labels:
        label_data = labels[record_id]
        combined_data.append({
            'Record-ID': record_id,
            'Text': text,
            'Age': label_data['Age'],
            'Gender': label_data['Gender'],
            'MMSE-Score': label_data['Converted-MMSE'],
            'Class': label_data['label']
        })

df = pd.DataFrame(combined_data)

# Encode categorical variables
df['Gender'] = df['Gender'].map({'male': 0, 'female': 1})

# Drop rows with missing values
df.dropna(subset=['Text', 'Class', 'Age', 'Gender', 'MMSE-Score'], inplace=True)

# Ensure 'Text' column is string
df['Text'] = df['Text'].astype(str)

X = df[['Text', 'Age', 'Gender', 'MMSE-Score']]
y = df['Class']

# Encode the target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(), 'Text'),
        ('numeric', MinMaxScaler(), ['Age', 'MMSE-Score']),
        ('gender', 'passthrough', ['Gender'])
    ]
)

# Transform data outside the loop
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# XGBoost model
model = XGBClassifier(random_state=42, eval_metric='logloss')

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model)
])

# Grid search parameter grid
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [3, 6, 10],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__subsample': [0.8, 1.0]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro', verbose=1)
grid_search.fit(X_train, y_train)

print("Beste Parameter:", grid_search.best_params_)

# Fit the model with best parameters
best_pipeline = grid_search.best_estimator_

# Predict on test data
y_pred = best_pipeline.predict(X_test)
y_pred_original = label_encoder.inverse_transform(y_pred)
y_test_original = label_encoder.inverse_transform(y_test)

print(classification_report(y_test_original, y_pred_original, zero_division=0))

# Confusion matrix for all folds
mean_cm = None
for fold_idx, (train_idx, test_idx) in enumerate(grid_search.cv.split(X_train_transformed, y_train)):
    # Select train and test folds
    X_train_fold, X_test_fold = X_train_transformed[train_idx], X_train_transformed[test_idx]
    y_train_fold, y_test_fold = y_train[train_idx], y_train[test_idx]

    # Fit the model for the current fold
    model.fit(X_train_fold, y_train_fold)
    y_pred_fold = model.predict(X_test_fold)

    cm = confusion_matrix(y_test_fold, y_pred_fold, labels=range(len(label_encoder.classes_)))
    if mean_cm is None:
        mean_cm = cm
    else:
        mean_cm += cm

mean_cm = mean_cm / grid_search.cv

# Plot averaged confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=mean_cm, display_labels=label_encoder.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title("Averaged Confusion Matrix (Across Folds)")
plt.show()

# Save predictions
predictions_df = pd.DataFrame({
    'Record-ID': df.iloc[X_test.index].reset_index(drop=True)['Record-ID'],
    'Text': X_test['Text'].reset_index(drop=True),
    'Age': X_test['Age'].reset_index(drop=True),
    'Gender': X_test['Gender'].reset_index(drop=True),
    'MMSE-Score': X_test['MMSE-Score'].reset_index(drop=True),
    'True Label': label_encoder.inverse_transform(y_test),
    'Predicted Label': label_encoder.inverse_transform(y_pred)
})

predictions_df.to_csv("predictions_with_xgboost.csv", index=False)
print("Predictions saved to 'predictions_with_xgboost.csv'.")

cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results.to_csv("cv_results.csv", index=False)
print("Cross-Validation results saved to 'cv_results.csv'.")


Fitting 5 folds for each of 54 candidates, totalling 270 fits
Beste Parameter: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 100, 'classifier__subsample': 1.0}
              precision    recall  f1-score   support

    Dementia       0.00      0.00      0.00         4
          HC       1.00      0.87      0.93        15
         MCI       0.68      1.00      0.81        13

    accuracy                           0.81        32
   macro avg       0.56      0.62      0.58        32
weighted avg       0.75      0.81      0.77        32



AttributeError: 'int' object has no attribute 'split'