In [6]:
!pip install --upgrade pip

Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 25.0
    Uninstalling pip-25.0:
      Successfully uninstalled pip-25.0
Successfully installed pip-25.0.1


In [8]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp313-cp313-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.2-cp313-cp313-macosx_14_0_arm64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp313-cp313-macosx_12_0_arm64.whl (11.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading scipy-1.15.2-cp313-cp313-macosx_14_0_arm64.whl (22.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.4/22.4 MB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Insta

In [9]:
import pandas as pd
from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier  # Example model, can be changed
import io

In [10]:
def train_and_evaluate(dataframe):
    """
    Trains a machine learning model on the provided dataframe, performs cross-validation,
    and evaluates performance metrics.

    Args:
        dataframe: pandas DataFrame.  The first row is skipped. The 'CLASS' column
                   in the second row indicates the target variable.

    Returns:
        None. Prints the cross-validation results and test set performance metrics.
    """

    # --- Data Preparation ---
    df = dataframe.iloc[1:].copy()  # Skip the first row


    # Identify target and feature columns
    target_column = None
    for col in df.columns:
        if 'CLASS' in str(df[col].iloc[0]).upper(): # convert to string and upper case for robustness
            target_column = col
            break  # Exit the loop once the target column is found

    if target_column is None:
        raise ValueError("No column with 'CLASS' found in the second row for the target variable.")
    
    X = df.drop(columns=target_column)
    y = df[target_column]

    # Convert string labels in y to numeric labels
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)


    # --- Model Training and Cross-Validation ---
    model = RandomForestClassifier(random_state=42)  # Use RandomForest, can be any scikit-learn model. random_state for reproducibility

    # Define cross-validation strategy (KFold is a good general choice)
    cv = KFold(n_splits=5, shuffle=True, random_state=42) # 5 splits, shuffle data, random_state for reproducibility

    # Define scoring metrics.  Use make_scorer for custom weighting if needed.
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score, average='weighted', zero_division=0), # Handle potential zero division
        'recall': make_scorer(recall_score, average='weighted', zero_division=0),
        'f1': make_scorer(f1_score, average='weighted', zero_division=0),
    }


    # Perform cross-validation.  Return training scores as well for analysis.
    cv_results = cross_validate(model, X, y, cv=cv, scoring=scoring, return_train_score=True)

    # --- Cross-Validation Results ---
    print("Cross-Validation Results:")
    for metric in scoring.keys():
        print(f"  {metric}:")
        print(f"    Train: {cv_results[f'train_{metric}'].mean():.4f} +/- {cv_results[f'train_{metric}'].std():.4f}")  # Print mean and std dev
        print(f"    Test:  {cv_results[f'test_{metric}'].mean():.4f} +/- {cv_results[f'test_{metric}'].std():.4f}")

In [11]:
df = pd.read_csv('Datasets/combined_dataset.csv')

In [13]:
df

Unnamed: 0,NHY,CLASS,NP1COG,NP1HALL,NP1DPRS,NP1ANXS,NP1APAT,NP1DDS,NP1SLPN,NP1SLPD,...,STAIAD31,STAIAD32,STAIAD33,STAIAD34,STAIAD35,STAIAD36,STAIAD37,STAIAD38,STAIAD39,STAIAD40
0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,...,2.0,2.0,4.0,1.0,2.0,3.0,2.0,2.0,4.0,2.0
1,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,...,2.0,2.0,4.0,3.0,2.0,4.0,1.0,2.0,4.0,1.0
2,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,2.0,2.0,4.0,2.0,2.0,3.0,2.0,2.0,4.0,1.0
3,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,2.0,2.0,4.0,2.0,2.0,3.0,2.0,2.0,4.0,1.0
4,2.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,2.0,2.0,3.0,2.0,2.0,3.0,2.0,1.0,3.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,4.0,4.0,4.0,1.0,4.0,1.0,1.0,4.0,1.0
10010,2.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,4.0,3.0,2.0,3.0,1.0,3.0,3.0,1.0
10011,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,...,1.0,1.0,4.0,4.0,1.0,4.0,1.0,1.0,4.0,1.0
10012,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,...,2.0,2.0,3.0,2.0,2.0,3.0,2.0,2.0,3.0,2.0


In [14]:
train_and_evaluate(df)

ValueError: No column with 'CLASS' found in the second row for the target variable.