# Saudi Arabia Dataset Analysis with Cross-Dataset Validation

This notebook performs comprehensive machine learning analysis on the Saudi Arabia autism screening dataset and validates the best model on:
1. **Polish Dataset** - selecting 10 out of 25 Q-CHAT questions (group labels: 1/7 for ASD)
2. **Bangladesh Dataset** - using the existing SVM child model for validation

## Objectives
- Train multiple ML models on Saudi Arabia toddler dataset
- Identify the best performing model
- Validate best model on Polish dataset with feature mapping
- Validate SVM child model on Bangladesh/Real World Child dataset
- Compare cross-dataset performance and generalization capabilities

In [1]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (classification_report, confusion_matrix, accuracy_score, 
                             precision_score, recall_score, roc_auc_score, roc_curve,
                             f1_score)
from sklearn.impute import SimpleImputer
import joblib
import warnings
import time
from pathlib import Path

# Configure settings
warnings.filterwarnings('ignore')
np.random.seed(42)
plt.style.use('default')

# Set display options for pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

print("All libraries imported successfully!")
print(f"Working directory: {Path.cwd()}")
print(f"Current time: {pd.Timestamp.now()}")

All libraries imported successfully!
Working directory: e:\Users\Prajj\Documents\7th Sem\RM\Codes
Current time: 2025-09-23 17:51:42.946675


## Load and Explore Saudi Arabia Dataset

Load the Saudi Arabia toddler autism screening dataset and examine its structure.

In [2]:
# Load Saudi Arabia Dataset
saudi_dataset_path = r'e:\Users\Prajj\Documents\7th Sem\RM\Datasets\Saudi Dataset Toddlers\Autism Spectrum Disorder Screening Data for Toddlers in Saudi Arabia Data Set.csv'

try:
    # Load the dataset
    df_saudi = pd.read_csv(saudi_dataset_path)
    print("‚úÖ Saudi Arabia dataset loaded successfully!")
    print(f"Dataset shape: {df_saudi.shape}")
    
    # Display basic information
    print("\n" + "="*80)
    print("SAUDI ARABIA DATASET OVERVIEW")
    print("="*80)
    
    print(f"\nDataset dimensions: {df_saudi.shape[0]} rows √ó {df_saudi.shape[1]} columns")
    
    print("\nColumn names:")
    for i, col in enumerate(df_saudi.columns, 1):
        print(f"{i:2d}. {col}")
    
    print("\nFirst few rows:")
    display(df_saudi.head())
    
    print("\nDataset info:")
    print(df_saudi.info())
    
    print("\nBasic statistics:")
    display(df_saudi.describe())
    
    # Check for missing values
    print("\nMissing values:")
    missing_values = df_saudi.isnull().sum()
    if missing_values.sum() > 0:
        print(missing_values[missing_values > 0])
    else:
        print("No missing values found!")
    
    # Analyze target variable
    target_candidates = ['Class/ASD', 'ASD', 'autism', 'classification', 'target']
    target_col = None
    
    for col in df_saudi.columns:
        if any(candidate.lower() in col.lower() for candidate in target_candidates):
            target_col = col
            break
    
    if target_col:
        print(f"\nTarget variable '{target_col}' distribution:")
        target_counts = df_saudi[target_col].value_counts()
        print(target_counts)
        
        print(f"\nTarget variable percentages:")
        target_percentages = df_saudi[target_col].value_counts(normalize=True) * 100
        for value, percentage in target_percentages.items():
            print(f"{value}: {percentage:.2f}%")
    else:
        print("\n‚ö†Ô∏è Warning: Target column not clearly identified")
        print("Last few columns (potential targets):", list(df_saudi.columns[-3:]))
    
    # Identify feature columns (A1-A10 scores)
    feature_cols = [col for col in df_saudi.columns if 'A' in col and 'Score' in col]
    if not feature_cols:
        # Alternative patterns
        feature_cols = [col for col in df_saudi.columns if col.startswith('A') and any(char.isdigit() for char in col)]
    
    print(f"\nIdentified feature columns ({len(feature_cols)}):")
    for col in feature_cols:
        print(f"  - {col}")
        if len(feature_cols) <= 10:  # Show distribution for small number of features
            print(f"    Distribution: {df_saudi[col].value_counts().to_dict()}")
    
except FileNotFoundError:
    print(f"‚ùå Error: Dataset file not found at {saudi_dataset_path}")
    print("Please check the file path.")
except Exception as e:
    print(f"‚ùå Error loading dataset: {e}")

‚úÖ Saudi Arabia dataset loaded successfully!
Dataset shape: (506, 17)

SAUDI ARABIA DATASET OVERVIEW

Dataset dimensions: 506 rows √ó 17 columns

Column names:
 1. A10
 2. A9
 3. A8
 4. A7
 5. A6
 6. A5
 7. A4
 8. A3
 9. A2
10. A1
11. Region
12. Family member with ASD history
13. Who is completing the test
14. Age
15. Gender
16. Screening Score
17. Class

First few rows:


Unnamed: 0,A10,A9,A8,A7,A6,A5,A4,A3,A2,A1,Region,Family member with ASD history,Who is completing the test,Age,Gender,Screening Score,Class
0,0,0,1,1,1,0,0,0,0,0,Makkah Province,No,Family member,32,Female,3,0
1,0,0,1,0,0,1,0,1,0,0,Makkah Province,No,Family member,30,Female,3,0
2,0,0,0,1,0,0,0,0,0,0,Makkah Province,No,Family member,36,Male,1,0
3,0,0,0,0,0,0,0,0,0,0,Makkah Province,Yes,Family member,36,Female,0,0
4,0,0,0,0,0,0,0,0,0,0,Eastern Province,No,Family member,36,Female,0,0



Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   A10                             506 non-null    int64 
 1   A9                              506 non-null    int64 
 2   A8                              506 non-null    int64 
 3   A7                              506 non-null    int64 
 4   A6                              506 non-null    int64 
 5   A5                              506 non-null    int64 
 6   A4                              506 non-null    int64 
 7   A3                              506 non-null    int64 
 8   A2                              506 non-null    int64 
 9   A1                              506 non-null    int64 
 10  Region                          506 non-null    object
 11  Family member with ASD history  506 non-null    object
 12  Who is completing the test      506

Unnamed: 0,A10,A9,A8,A7,A6,A5,A4,A3,A2,A1,Age,Screening Score,Class
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,0.620553,0.543478,0.575099,0.55336,0.549407,0.559289,0.525692,0.511858,0.488142,0.563241,24.448617,5.490119,0.673913
std,0.48573,0.498599,0.494817,0.497637,0.498045,0.496964,0.499834,0.500354,0.500354,0.496475,8.344461,3.181771,0.469243
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,3.0,0.0
50%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,24.0,6.0,1.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,33.0,8.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,36.0,10.0,1.0



Missing values:
No missing values found!

Target variable 'Family member with ASD history' distribution:
Family member with ASD history
No     384
Yes    122
Name: count, dtype: int64

Target variable percentages:
No: 75.89%
Yes: 24.11%

Identified feature columns (10):
  - A10
    Distribution: {1: 314, 0: 192}
  - A9
    Distribution: {1: 275, 0: 231}
  - A8
    Distribution: {1: 291, 0: 215}
  - A7
    Distribution: {1: 280, 0: 226}
  - A6
    Distribution: {1: 278, 0: 228}
  - A5
    Distribution: {1: 283, 0: 223}
  - A4
    Distribution: {1: 266, 0: 240}
  - A3
    Distribution: {1: 259, 0: 247}
  - A2
    Distribution: {0: 259, 1: 247}
  - A1
    Distribution: {1: 285, 0: 221}


In [3]:
# Preprocess Saudi Arabia Dataset
def preprocess_saudi_dataset(df):
    """
    Comprehensive preprocessing for the Saudi Arabia autism dataset
    """
    print("="*60)
    print("PREPROCESSING SAUDI ARABIA DATASET")
    print("="*60)
    
    df_processed = df.copy()
    print(f"Original dataset shape: {df_processed.shape}")
    
    # Handle missing values
    print(f"\nHandling missing values...")
    missing_before = df_processed.isnull().sum().sum()
    print(f"Missing values before preprocessing: {missing_before}")
    
    # Fill missing values for numerical columns
    numerical_cols = df_processed.select_dtypes(include=[np.number]).columns.tolist()
    for col in numerical_cols:
        if df_processed[col].isnull().sum() > 0:
            if 'age' in col.lower():
                # For age, use median
                median_value = df_processed[col].median()
                df_processed[col].fillna(median_value, inplace=True)
                print(f"  Filled {col} missing values with median: {median_value}")
            else:
                # For other numerical columns, use mode or median
                mode_val = df_processed[col].mode()
                if len(mode_val) > 0:
                    df_processed[col].fillna(mode_val[0], inplace=True)
                    print(f"  Filled {col} missing values with mode: {mode_val[0]}")
    
    # Fill missing values for categorical columns
    categorical_cols = df_processed.select_dtypes(include=['object']).columns.tolist()
    for col in categorical_cols:
        if df_processed[col].isnull().sum() > 0:
            mode_value = df_processed[col].mode()[0] if not df_processed[col].mode().empty else 'Unknown'
            df_processed[col].fillna(mode_value, inplace=True)
            print(f"  Filled {col} missing values with mode: {mode_value}")
    
    # Encode categorical variables
    label_encoders = {}
    for col in categorical_cols:
        if col not in ['Class/ASD', 'ASD']:  # Don't encode target yet
            le = LabelEncoder()
            df_processed[col] = le.fit_transform(df_processed[col].astype(str))
            label_encoders[col] = le
            print(f"  Encoded {col}: {len(le.classes_)} unique values")
    
    # Handle target variable
    target_col = None
    for col in df_processed.columns:
        if any(target.lower() in col.lower() for target in ['class', 'asd', 'autism']):
            target_col = col
            break
    
    if target_col:
        print(f"\nTarget variable preprocessing...")
        print(f"Original target '{target_col}' values: {sorted(df_processed[target_col].unique())}")
        
        # Encode target variable
        target_encoder = LabelEncoder()
        df_processed[target_col] = target_encoder.fit_transform(df_processed[target_col])
        print(f"Encoded target values: {sorted(df_processed[target_col].unique())}")
        print(f"Mapping: {dict(zip(target_encoder.classes_, target_encoder.transform(target_encoder.classes_)))}")
    else:
        print("‚ö†Ô∏è Warning: Target column not found!")
        target_encoder = None
        # Assume last column is target
        target_col = df_processed.columns[-1]
        print(f"Using last column as target: {target_col}")
    
    missing_after = df_processed.isnull().sum().sum()
    print(f"Missing values after preprocessing: {missing_after}")
    print(f"Processed dataset shape: {df_processed.shape}")
    
    return df_processed, label_encoders, target_encoder, target_col

# Apply preprocessing
if 'df_saudi' in locals():
    df_saudi_processed, saudi_label_encoders, saudi_target_encoder, saudi_target_col = preprocess_saudi_dataset(df_saudi)
    
    # Display processed dataset info
    print(f"\n" + "="*60)
    print("PROCESSED SAUDI DATASET SUMMARY")
    print("="*60)
    
    print(f"\nProcessed dataset info:")
    print(df_saudi_processed.info())
    
    print(f"\nTarget distribution after preprocessing:")
    if saudi_target_col:
        target_dist = df_saudi_processed[saudi_target_col].value_counts()
        print(target_dist)
    
else:
    print("‚ö†Ô∏è Please run the data loading cell first!")

PREPROCESSING SAUDI ARABIA DATASET
Original dataset shape: (506, 17)

Handling missing values...
Missing values before preprocessing: 0
  Encoded Region: 13 unique values
  Encoded Family member with ASD history: 2 unique values
  Encoded Who is completing the test: 2 unique values
  Encoded Gender: 2 unique values

Target variable preprocessing...
Original target 'Family member with ASD history' values: [np.int64(0), np.int64(1)]
Encoded target values: [np.int64(0), np.int64(1)]
Mapping: {np.int64(0): np.int64(0), np.int64(1): np.int64(1)}
Missing values after preprocessing: 0
Processed dataset shape: (506, 17)

PROCESSED SAUDI DATASET SUMMARY

Processed dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype
---  ------                          --------------  -----
 0   A10                             506 non-null    int64
 1   A9                              506

## Train Multiple Models on Saudi Arabia Dataset

Train various machine learning models on the Saudi Arabia dataset to find the best performing classifier.

In [4]:
# Define machine learning models for Saudi Arabia dataset
models_dict = {
    'SVM (RBF)': SVC(kernel='rbf', random_state=42, probability=True, C=1.0),
    'SVM (Linear)': SVC(kernel='linear', random_state=42, probability=True, C=1.0),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42, n_estimators=100),
    'Naive Bayes': GaussianNB(),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    'Decision Tree': DecisionTreeClassifier(random_state=42, max_depth=10)
}

# Utility functions for model evaluation
def calculate_specificity(y_true, y_pred):
    """Calculate specificity (True Negative Rate)"""
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn + fp) if (tn + fp) > 0 else 0.0

def evaluate_model_comprehensive(model, X_train, X_test, y_train, y_test, model_name):
    """
    Comprehensive model evaluation with all metrics
    """
    start_time = time.time()
    
    # Train the model
    model.fit(X_train, y_train)
    training_time = time.time() - start_time
    
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Calculate metrics
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred, average='binary', zero_division=0)
    recall = recall_score(y_test, y_test_pred, average='binary', zero_division=0)
    specificity = calculate_specificity(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred, average='binary', zero_division=0)
    
    # Calculate AUC if possible
    try:
        y_test_proba = model.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_test_proba)
    except:
        auc = 0.0
    
    # Cross-validation
    try:
        cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
        cv_mean = cv_scores.mean()
        cv_std = cv_scores.std()
    except:
        cv_mean = 0.0
        cv_std = 0.0
    
    return {
        'Model': model_name,
        'Train_Accuracy': train_accuracy,
        'Test_Accuracy': test_accuracy,
        'Precision': precision,
        'Recall': recall,
        'Specificity': specificity,
        'F1_Score': f1,
        'AUC': auc,
        'CV_Mean': cv_mean,
        'CV_Std': cv_std,
        'Training_Time': training_time
    }

print("="*60)
print("MACHINE LEARNING MODELS CONFIGURATION")
print("="*60)

print(f"Total models to train: {len(models_dict)}")
print("\nModels configured:")
for i, (name, model) in enumerate(models_dict.items(), 1):
    print(f"{i:2d}. {name}")

print(f"\n‚úÖ Models and evaluation functions defined successfully!")

MACHINE LEARNING MODELS CONFIGURATION
Total models to train: 8

Models configured:
 1. SVM (RBF)
 2. SVM (Linear)
 3. Random Forest
 4. Logistic Regression
 5. Gradient Boosting
 6. Naive Bayes
 7. K-Nearest Neighbors
 8. Decision Tree

‚úÖ Models and evaluation functions defined successfully!


In [5]:
# Prepare Saudi Arabia dataset for training
if 'df_saudi_processed' in locals() and saudi_target_col:
    print("="*80)
    print("PREPARING SAUDI ARABIA DATASET FOR TRAINING")
    print("="*80)
    
    # Identify feature columns
    exclude_cols = [saudi_target_col, 'Case_No', 'ID', 'case_no', 'id']
    feature_columns = [col for col in df_saudi_processed.columns 
                      if col not in exclude_cols and not any(ex.lower() in col.lower() for ex in exclude_cols)]
    
    print(f"Total features available: {len(feature_columns)}")
    print(f"Target column: {saudi_target_col}")
    print(f"Feature columns: {feature_columns}")
    
    # Prepare X (features) and y (target)
    X_saudi = df_saudi_processed[feature_columns]
    y_saudi = df_saudi_processed[saudi_target_col]
    
    print(f"\nDataset preparation:")
    print(f"  Feature matrix shape: {X_saudi.shape}")
    print(f"  Target vector shape: {y_saudi.shape}")
    print(f"  Target distribution: {np.bincount(y_saudi)}")
    
    # Split the data
    X_train_saudi, X_test_saudi, y_train_saudi, y_test_saudi = train_test_split(
        X_saudi, y_saudi, test_size=0.2, random_state=42, stratify=y_saudi
    )
    
    print(f"\nData split:")
    print(f"  Training set: {X_train_saudi.shape[0]} samples")
    print(f"  Test set: {X_test_saudi.shape[0]} samples")
    print(f"  Training target distribution: {np.bincount(y_train_saudi)}")
    print(f"  Test target distribution: {np.bincount(y_test_saudi)}")
    
    # Scale the features
    scaler_saudi = StandardScaler()
    X_train_saudi_scaled = scaler_saudi.fit_transform(X_train_saudi)
    X_test_saudi_scaled = scaler_saudi.transform(X_test_saudi)
    
    print(f"\nFeature scaling completed:")
    print(f"  Training features shape: {X_train_saudi_scaled.shape}")
    print(f"  Test features shape: {X_test_saudi_scaled.shape}")
    
    print(f"\n‚úÖ Saudi Arabia dataset preparation completed!")
    
else:
    print("‚ö†Ô∏è Please run the preprocessing step first!")

PREPARING SAUDI ARABIA DATASET FOR TRAINING
Total features available: 16
Target column: Family member with ASD history
Feature columns: ['A10', 'A9', 'A8', 'A7', 'A6', 'A5', 'A4', 'A3', 'A2', 'A1', 'Region', 'Who is completing the test', 'Age', 'Gender', 'Screening Score', 'Class']

Dataset preparation:
  Feature matrix shape: (506, 16)
  Target vector shape: (506,)
  Target distribution: [384 122]

Data split:
  Training set: 404 samples
  Test set: 102 samples
  Training target distribution: [307  97]
  Test target distribution: [77 25]

Feature scaling completed:
  Training features shape: (404, 16)
  Test features shape: (102, 16)

‚úÖ Saudi Arabia dataset preparation completed!


In [6]:
# Train all models on Saudi Arabia dataset
if 'X_train_saudi_scaled' in locals():
    print("="*80)
    print("TRAINING MODELS ON SAUDI ARABIA DATASET")
    print("="*80)
    
    saudi_results = []
    trained_saudi_models = {}
    
    print(f"Dataset: Saudi Arabia Toddler Autism Screening")
    print(f"Training samples: {len(X_train_saudi_scaled)}")
    print(f"Test samples: {len(X_test_saudi_scaled)}")
    print(f"Features: {X_train_saudi_scaled.shape[1]}")
    
    print(f"\n{'Model':<20} {'Train_Acc':<10} {'Test_Acc':<10} {'Precision':<10} {'Recall':<10} {'F1':<8} {'AUC':<8} {'Time':<8}")
    print("-" * 90)
    
    for model_name, model in models_dict.items():
        try:
            print(f"{model_name:<20}", end="", flush=True)
            
            # Evaluate the model
            result = evaluate_model_comprehensive(
                model, X_train_saudi_scaled, X_test_saudi_scaled, 
                y_train_saudi, y_test_saudi, model_name
            )
            
            # Store results
            saudi_results.append(result)
            trained_saudi_models[model_name] = model
            
            # Print summary
            print(f" {result['Train_Accuracy']:<9.4f} {result['Test_Accuracy']:<9.4f} "
                  f"{result['Precision']:<9.4f} {result['Recall']:<9.4f} "
                  f"{result['F1_Score']:<7.4f} {result['AUC']:<7.4f} {result['Training_Time']:<7.2f}s")
            
        except Exception as e:
            print(f" ERROR: {e}")
    
    # Create results DataFrame
    saudi_results_df = pd.DataFrame(saudi_results)
    
    print(f"\n{'='*80}")
    print("SAUDI ARABIA DATASET - DETAILED RESULTS")
    print(f"{'='*80}")
    
    if not saudi_results_df.empty:
        display(saudi_results_df.round(4))
        
        # Find best model
        best_saudi_model_idx = saudi_results_df['Test_Accuracy'].idxmax()
        best_saudi_model_result = saudi_results_df.loc[best_saudi_model_idx]
        best_saudi_model_name = best_saudi_model_result['Model']
        best_saudi_model = trained_saudi_models[best_saudi_model_name]
        
        print(f"\nüèÜ BEST MODEL FOR SAUDI ARABIA DATASET:")
        print(f"   Model: {best_saudi_model_name}")
        print(f"   Test Accuracy: {best_saudi_model_result['Test_Accuracy']:.4f}")
        print(f"   Precision: {best_saudi_model_result['Precision']:.4f}")
        print(f"   Recall: {best_saudi_model_result['Recall']:.4f}")
        print(f"   F1-Score: {best_saudi_model_result['F1_Score']:.4f}")
        print(f"   AUC: {best_saudi_model_result['AUC']:.4f}")
        print(f"   Cross-Val Accuracy: {best_saudi_model_result['CV_Mean']:.4f} ¬± {best_saudi_model_result['CV_Std']:.4f}")
        
        # Save the best model
        best_model_path = r'e:\Users\Prajj\Documents\7th Sem\RM\Codes\best_saudi_model.pkl'
        scaler_path = r'e:\Users\Prajj\Documents\7th Sem\RM\Codes\saudi_scaler.pkl'
        
        joblib.dump(best_saudi_model, best_model_path)
        joblib.dump(scaler_saudi, scaler_path)
        
        print(f"\n‚úÖ Best model saved to: {best_model_path}")
        print(f"‚úÖ Scaler saved to: {scaler_path}")
        
    else:
        print("No results to display")
        
else:
    print("‚ö†Ô∏è Please run the data preparation step first!")

TRAINING MODELS ON SAUDI ARABIA DATASET
Dataset: Saudi Arabia Toddler Autism Screening
Training samples: 404
Test samples: 102
Features: 16

Model                Train_Acc  Test_Acc   Precision  Recall     F1       AUC      Time    
------------------------------------------------------------------------------------------
SVM (RBF)            0.7896    0.7549    0.0000    0.0000    0.0000  0.5455  0.03   s
SVM (Linear)         0.7599    0.7549    0.0000    0.0000    0.0000  0.6000  0.01   s
Random Forest        0.9653    0.7353    0.4000    0.1600    0.2286  0.6229  0.11   s
Logistic Regression  0.7649    0.7157    0.0000    0.0000    0.0000  0.5668  0.01   s
Gradient Boosting    0.9059    0.6961    0.2500    0.1200    0.1622  0.5956  0.08   s
Naive Bayes          0.6262    0.5686    0.3208    0.6800    0.4359  0.6390  0.00   s
K-Nearest Neighbors  0.7921    0.7451    0.4444    0.1600    0.2353  0.5974  0.00   s
Decision Tree        0.9505    0.6471    0.3226    0.4000    0.3571  0.550

Unnamed: 0,Model,Train_Accuracy,Test_Accuracy,Precision,Recall,Specificity,F1_Score,AUC,CV_Mean,CV_Std,Training_Time
0,SVM (RBF),0.7896,0.7549,0.0,0.0,1.0,0.0,0.5455,0.7649,0.0105,0.0255
1,SVM (Linear),0.7599,0.7549,0.0,0.0,1.0,0.0,0.6,0.7599,0.0057,0.0108
2,Random Forest,0.9653,0.7353,0.4,0.16,0.9221,0.2286,0.6229,0.7426,0.0054,0.1123
3,Logistic Regression,0.7649,0.7157,0.0,0.0,0.9481,0.0,0.5668,0.7599,0.0057,0.0144
4,Gradient Boosting,0.9059,0.6961,0.25,0.12,0.8831,0.1622,0.5956,0.6906,0.0377,0.082
5,Naive Bayes,0.6262,0.5686,0.3208,0.68,0.5325,0.4359,0.639,0.6238,0.0199,0.001
6,K-Nearest Neighbors,0.7921,0.7451,0.4444,0.16,0.9351,0.2353,0.5974,0.7055,0.022,0.001
7,Decision Tree,0.9505,0.6471,0.3226,0.4,0.7273,0.3571,0.5501,0.6733,0.0183,0.002



üèÜ BEST MODEL FOR SAUDI ARABIA DATASET:
   Model: SVM (RBF)
   Test Accuracy: 0.7549
   Precision: 0.0000
   Recall: 0.0000
   F1-Score: 0.0000
   AUC: 0.5455
   Cross-Val Accuracy: 0.7649 ¬± 0.0105

‚úÖ Best model saved to: e:\Users\Prajj\Documents\7th Sem\RM\Codes\best_saudi_model.pkl
‚úÖ Scaler saved to: e:\Users\Prajj\Documents\7th Sem\RM\Codes\saudi_scaler.pkl


## Validate Best Model on Polish Dataset

Load the Polish dataset, select 10 most relevant Q-CHAT questions, and validate the best Saudi model.

In [7]:
# Load and preprocess Polish dataset for validation
polish_dataset_path = r'e:\Users\Prajj\Documents\7th Sem\RM\Datasets\Polish Dataset\Polish Dataset.csv'

try:
    # Load Polish dataset
    df_polish = pd.read_csv(polish_dataset_path)
    print("‚úÖ Polish dataset loaded successfully!")
    print(f"Polish dataset shape: {df_polish.shape}")
    
    print("\n" + "="*60)
    print("POLISH DATASET ANALYSIS FOR VALIDATION")
    print("="*60)
    
    print(f"\nPolish dataset columns:")
    for i, col in enumerate(df_polish.columns, 1):
        print(f"{i:2d}. {col}")
    
    # Identify Q-CHAT columns (should be 25 questions)
    qchat_columns = [col for col in df_polish.columns if 'qchat' in col.lower()]
    print(f"\nFound {len(qchat_columns)} Q-CHAT columns:")
    for i, col in enumerate(qchat_columns, 1):
        print(f"{i:2d}. {col}")
    
    # Analyze target variable (group: 1 or 7)
    if 'group' in df_polish.columns:
        print(f"\nTarget variable 'group' distribution:")
        group_counts = df_polish['group'].value_counts().sort_index()
        print(group_counts)
        print(f"Group 1: {group_counts.get(1, 0)} samples, Group 7: {group_counts.get(7, 0)} samples")
        
        # Map group to binary (1=non-ASD, 7=ASD -> 0=non-ASD, 1=ASD)
        df_polish['binary_target'] = (df_polish['group'] == 7).astype(int)
        print(f"Binary target distribution: {df_polish['binary_target'].value_counts().to_dict()}")
    else:
        print("‚ö†Ô∏è Warning: 'group' column not found!")
    
    # Select 10 most relevant Q-CHAT questions for validation
    # We'll select based on correlation with target and clinical relevance
    if len(qchat_columns) >= 10 and 'binary_target' in df_polish.columns:
        print(f"\nSelecting 10 most relevant Q-CHAT questions...")
        
        # Calculate correlation of each Q-CHAT question with target
        correlations = {}
        for col in qchat_columns:
            try:
                corr = abs(np.corrcoef(df_polish[col].fillna(0), df_polish['binary_target'])[0, 1])
                correlations[col] = corr
            except:
                correlations[col] = 0.0
        
        # Sort by correlation and select top 10
        sorted_qchat = sorted(correlations.items(), key=lambda x: x[1], reverse=True)
        selected_qchat_10 = [col for col, corr in sorted_qchat[:10]]
        
        print(f"Selected 10 Q-CHAT questions (by correlation with target):")
        for i, (col, corr) in enumerate(sorted_qchat[:10], 1):
            print(f"{i:2d}. {col}: correlation = {corr:.4f}")
        
        # Prepare Polish validation data
        X_polish_validation = df_polish[selected_qchat_10].fillna(0)  # Fill missing with 0
        y_polish_validation = df_polish['binary_target']
        
        print(f"\nPolish validation data prepared:")
        print(f"  Features shape: {X_polish_validation.shape}")
        print(f"  Target shape: {y_polish_validation.shape}")
        print(f"  Target distribution: {np.bincount(y_polish_validation)}")
        
        # Scale Polish features using Saudi scaler (if available)
        if 'scaler_saudi' in locals():
            # We need to match feature dimensions - pad or truncate as needed
            saudi_features = X_train_saudi_scaled.shape[1]
            polish_features = X_polish_validation.shape[1]
            
            if polish_features < saudi_features:
                # Pad with zeros if Polish has fewer features
                padding = np.zeros((X_polish_validation.shape[0], saudi_features - polish_features))
                X_polish_padded = np.concatenate([X_polish_validation.values, padding], axis=1)
                print(f"  Padded Polish features from {polish_features} to {saudi_features}")
            elif polish_features > saudi_features:
                # Truncate if Polish has more features
                X_polish_padded = X_polish_validation.values[:, :saudi_features]
                print(f"  Truncated Polish features from {polish_features} to {saudi_features}")
            else:
                X_polish_padded = X_polish_validation.values
                print(f"  Feature dimensions match: {polish_features}")
            
            # Scale using Saudi scaler
            X_polish_scaled = scaler_saudi.transform(X_polish_padded)
            print(f"  Polish features scaled using Saudi scaler")
            
        else:
            # Use separate scaler for Polish data
            polish_scaler = StandardScaler()
            X_polish_scaled = polish_scaler.fit_transform(X_polish_validation)
            print(f"  Polish features scaled using separate scaler")
        
        print(f"\n‚úÖ Polish dataset preparation for validation completed!")
        
    else:
        print(f"‚ö†Ô∏è Insufficient Q-CHAT columns or missing target. Found {len(qchat_columns)} Q-CHAT columns.")
        
except FileNotFoundError:
    print(f"‚ùå Error: Polish dataset file not found at {polish_dataset_path}")
except Exception as e:
    print(f"‚ùå Error processing Polish dataset: {e}")

‚úÖ Polish dataset loaded successfully!
Polish dataset shape: (252, 36)

POLISH DATASET ANALYSIS FOR VALIDATION

Polish dataset columns:
 1. child_id
 2. age
 3. sex
 4. group
 5. preterm
 6. birthweight
 7. siblings_yesno
 8. siblings_number
 9. mothers_education
10. sibling_withASD
11. Sum_QCHAT
12. qchat1recode
13. qchat2recode
14. qchat3recode
15. qchat4recode
16. qchat5recode
17. qchat6recode
18. qchat7recode
19. qchat8recode
20. qchat9recode
21. qchat10recode
22. qchat11recode
23. qchat12recode
24. qchat13recode
25. qchat14recode
26. qchat15recode
27. qchat16recode
28. qchat17recode
29. qchat18recode
30. qchat19recode
31. qchat20recode
32. qchat21recode
33. qchat22recode
34. qchat23recode
35. qchat24recode
36. qchat25recode

Found 26 Q-CHAT columns:
 1. Sum_QCHAT
 2. qchat1recode
 3. qchat2recode
 4. qchat3recode
 5. qchat4recode
 6. qchat5recode
 7. qchat6recode
 8. qchat7recode
 9. qchat8recode
10. qchat9recode
11. qchat10recode
12. qchat11recode
13. qchat12recode
14. qchat13re

In [8]:
# Validate best Saudi model on Polish dataset
if 'best_saudi_model' in locals() and 'X_polish_scaled' in locals():
    print("="*80)
    print("VALIDATING BEST SAUDI MODEL ON POLISH DATASET")
    print("="*80)
    
    print(f"Best Saudi model: {best_saudi_model_name}")
    print(f"Polish validation samples: {len(y_polish_validation)}")
    print(f"Polish target distribution: {np.bincount(y_polish_validation)}")
    
    try:
        # Make predictions on Polish dataset
        y_polish_pred = best_saudi_model.predict(X_polish_scaled)
        
        # Calculate performance metrics
        polish_accuracy = accuracy_score(y_polish_validation, y_polish_pred)
        polish_precision = precision_score(y_polish_validation, y_polish_pred, average='binary', zero_division=0)
        polish_recall = recall_score(y_polish_validation, y_polish_pred, average='binary', zero_division=0)
        polish_specificity = calculate_specificity(y_polish_validation, y_polish_pred)
        polish_f1 = f1_score(y_polish_validation, y_polish_pred, average='binary', zero_division=0)
        
        # Calculate AUC if possible
        try:
            y_polish_proba = best_saudi_model.predict_proba(X_polish_scaled)[:, 1]
            polish_auc = roc_auc_score(y_polish_validation, y_polish_proba)
        except:
            polish_auc = 0.0
        
        print(f"\n{'='*60}")
        print("CROSS-DATASET VALIDATION RESULTS")
        print(f"{'='*60}")
        
        print(f"\nüîç Saudi Model Performance on Polish Dataset:")
        print(f"   Accuracy: {polish_accuracy:.4f}")
        print(f"   Precision: {polish_precision:.4f}")
        print(f"   Recall: {polish_recall:.4f}")
        print(f"   Specificity: {polish_specificity:.4f}")
        print(f"   F1-Score: {polish_f1:.4f}")
        print(f"   AUC: {polish_auc:.4f}")
        
        # Confusion Matrix
        cm_polish = confusion_matrix(y_polish_validation, y_polish_pred)
        print(f"\nüìä Confusion Matrix (Polish Dataset):")
        print(f"   True Negatives:  {cm_polish[0,0]}")
        print(f"   False Positives: {cm_polish[0,1]}")
        print(f"   False Negatives: {cm_polish[1,0]}")
        print(f"   True Positives:  {cm_polish[1,1]}")
        
        # Compare with original Saudi performance
        print(f"\nüìà Performance Comparison:")
        print(f"   {'Metric':<15} {'Saudi (Train)':<15} {'Polish (Val)':<15} {'Difference':<15}")
        print(f"   {'-'*60}")
        print(f"   {'Accuracy':<15} {best_saudi_model_result['Test_Accuracy']:<15.4f} {polish_accuracy:<15.4f} {polish_accuracy - best_saudi_model_result['Test_Accuracy']:<15.4f}")
        print(f"   {'Precision':<15} {best_saudi_model_result['Precision']:<15.4f} {polish_precision:<15.4f} {polish_precision - best_saudi_model_result['Precision']:<15.4f}")
        print(f"   {'Recall':<15} {best_saudi_model_result['Recall']:<15.4f} {polish_recall:<15.4f} {polish_recall - best_saudi_model_result['Recall']:<15.4f}")
        print(f"   {'F1-Score':<15} {best_saudi_model_result['F1_Score']:<15.4f} {polish_f1:<15.4f} {polish_f1 - best_saudi_model_result['F1_Score']:<15.4f}")
        print(f"   {'AUC':<15} {best_saudi_model_result['AUC']:<15.4f} {polish_auc:<15.4f} {polish_auc - best_saudi_model_result['AUC']:<15.4f}")
        
        # Store Polish validation results
        polish_validation_results = {
            'Dataset': 'Polish',
            'Model': best_saudi_model_name,
            'Accuracy': polish_accuracy,
            'Precision': polish_precision,
            'Recall': polish_recall,
            'Specificity': polish_specificity,
            'F1_Score': polish_f1,
            'AUC': polish_auc,
            'Sample_Size': len(y_polish_validation),
            'Selected_Features': len(selected_qchat_10)
        }
        
        print(f"\n‚úÖ Polish dataset validation completed!")
        
    except Exception as e:
        print(f"‚ùå Error during Polish validation: {e}")
        polish_validation_results = None
        
else:
    print("‚ö†Ô∏è Please run the Saudi model training and Polish data preparation steps first!")
    polish_validation_results = None

VALIDATING BEST SAUDI MODEL ON POLISH DATASET
Best Saudi model: SVM (RBF)
Polish validation samples: 252
Polish target distribution: [135 117]

CROSS-DATASET VALIDATION RESULTS

üîç Saudi Model Performance on Polish Dataset:
   Accuracy: 0.5357
   Precision: 0.0000
   Recall: 0.0000
   Specificity: 1.0000
   F1-Score: 0.0000
   AUC: 0.4615

üìä Confusion Matrix (Polish Dataset):
   True Negatives:  135
   False Positives: 0
   False Negatives: 117
   True Positives:  0

üìà Performance Comparison:
   Metric          Saudi (Train)   Polish (Val)    Difference     
   ------------------------------------------------------------
   Accuracy        0.7549          0.5357          -0.2192        
   Precision       0.0000          0.0000          0.0000         
   Recall          0.0000          0.0000          0.0000         
   F1-Score        0.0000          0.0000          0.0000         
   AUC             0.5455          0.4615          -0.0839        

‚úÖ Polish dataset validati

## Validate SVM Child Model on Bangladesh Dataset

Load the existing SVM child model and validate it on the Bangladesh/Real World Child dataset.

In [11]:
# Load existing SVM child model and Bangladesh dataset
bangladesh_dataset_path = r'e:\Users\Prajj\Documents\7th Sem\RM\Datasets\Real World Child Dataset\child collected from real world.csv'
svm_child_model_path = r'e:\Users\Prajj\Documents\7th Sem\RM\Codes\svm_child_asd_model.pkl'

try:
    # Load Bangladesh dataset
    df_bangladesh = pd.read_csv(bangladesh_dataset_path)
    print("‚úÖ Bangladesh dataset loaded successfully!")
    print(f"Bangladesh dataset shape: {df_bangladesh.shape}")
    
    print("\n" + "="*60)
    print("BANGLADESH DATASET ANALYSIS")
    print("="*60)
    
    print(f"\nBangladesh dataset columns:")
    for i, col in enumerate(df_bangladesh.columns, 1):
        print(f"{i:2d}. {col}")
    
    print(f"\nFirst few rows:")
    display(df_bangladesh.head())
    
    # Load SVM child model
    try:
        svm_child_model = joblib.load(svm_child_model_path)
        print(f"\n‚úÖ SVM child model loaded from: {svm_child_model_path}")
        print(f"Model type: {type(svm_child_model)}")
    except FileNotFoundError:
        print(f"‚ö†Ô∏è SVM child model not found at: {svm_child_model_path}")
        print("Will create a simple SVM model for demonstration...")
        svm_child_model = SVC(kernel='rbf', random_state=42, probability=True)
    
    # Preprocess Bangladesh dataset
    print(f"\n" + "="*60)
    print("PREPROCESSING BANGLADESH DATASET")
    print("="*60)
    
    df_bangladesh_processed = df_bangladesh.copy()
    
    # Handle missing values
    missing_before = df_bangladesh_processed.isnull().sum().sum()
    print(f"Missing values before preprocessing: {missing_before}")
    
    # Fill missing values
    for col in df_bangladesh_processed.columns:
        if df_bangladesh_processed[col].dtype in ['object']:
            # Categorical columns
            mode_val = df_bangladesh_processed[col].mode()
            if len(mode_val) > 0:
                df_bangladesh_processed[col].fillna(mode_val[0], inplace=True)
        else:
            # Numerical columns
            if 'age' in col.lower():
                median_val = df_bangladesh_processed[col].median()
                df_bangladesh_processed[col].fillna(median_val, inplace=True)
            else:
                mode_val = df_bangladesh_processed[col].mode()
                if len(mode_val) > 0:
                    df_bangladesh_processed[col].fillna(mode_val[0], inplace=True)
    
    # Encode categorical variables
    bangladesh_label_encoders = {}
    for col in df_bangladesh_processed.columns:
        if df_bangladesh_processed[col].dtype == 'object':
            le = LabelEncoder()
            df_bangladesh_processed[col] = le.fit_transform(df_bangladesh_processed[col].astype(str))
            bangladesh_label_encoders[col] = le
            print(f"Encoded {col}: {len(le.classes_)} unique values")
    
    # Identify target column
    target_candidates = ['Class/ASD', 'ASD', 'autism', 'classification', 'target', 'result']
    bangladesh_target_col = None
    
    for col in df_bangladesh_processed.columns:
        if any(candidate.lower() in col.lower() for candidate in target_candidates):
            bangladesh_target_col = col
            break
    
    if not bangladesh_target_col:
        # Assume last column is target
        bangladesh_target_col = df_bangladesh_processed.columns[-1]
        print(f"Assuming last column as target: {bangladesh_target_col}")
    
    print(f"\nTarget column identified: {bangladesh_target_col}")
    print(f"Target distribution: {df_bangladesh_processed[bangladesh_target_col].value_counts().to_dict()}")
    
    # Prepare features and target
    exclude_cols = [bangladesh_target_col, 'Case_No', 'ID', 'case_no', 'id']
    bangladesh_feature_cols = [col for col in df_bangladesh_processed.columns 
                              if col not in exclude_cols and not any(ex.lower() in col.lower() for ex in exclude_cols)]
    
    X_bangladesh = df_bangladesh_processed[bangladesh_feature_cols]
    y_bangladesh = df_bangladesh_processed[bangladesh_target_col]
    
    print(f"\nBangladesh validation data:")
    print(f"  Features shape: {X_bangladesh.shape}")
    print(f"  Target shape: {y_bangladesh.shape}")
    print(f"  Feature columns: {bangladesh_feature_cols}")
    print(f"  Target distribution: {np.bincount(y_bangladesh)}")
    
    # Scale features
    bangladesh_scaler = StandardScaler()
    X_bangladesh_scaled = bangladesh_scaler.fit_transform(X_bangladesh)
    
    print(f"\n‚úÖ Bangladesh dataset preprocessing completed!")
    
except FileNotFoundError:
    print(f"‚ùå Error: Bangladesh dataset file not found at {bangladesh_dataset_path}")
    df_bangladesh = None
except Exception as e:
    print(f"‚ùå Error processing Bangladesh dataset: {e}")
    df_bangladesh = None

‚úÖ Bangladesh dataset loaded successfully!
Bangladesh dataset shape: (252, 21)

BANGLADESH DATASET ANALYSIS

Bangladesh dataset columns:
 1. A1_Score
 2. A2_Score
 3. A3_Score
 4. A4_Score
 5. A5_Score
 6. A6_Score
 7. A7_Score
 8. A8_Score
 9. A9_Score
10. A10_Score
11. age
12. gender
13. ethnicity
14. jundice
15. austim
16. contry_of_res
17. used_app_before
18. result
19. age_desc
20. relation
21. Class/ASD

First few rows:


Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,0,0,0,1,0,0,1,0,1,5,f,City-people,no,no,Barishal,no,4,4-11 years,Mother,NO
1,0,1,1,0,0,0,1,0,0,1,11,f,City-people,no,no,Barishal,no,4,4-11 years,MOTHER,NO
2,1,1,1,1,1,1,1,1,1,1,4,f,City-people,no,yes,Barishal,no,10,4-11 years,Mother,YES
3,1,1,1,1,1,0,1,1,0,1,6,f,City-people,no,yes,Barishal,no,8,4-11 years,Mother,YES
4,0,1,0,0,1,0,1,1,0,0,9,m,City-people,no,no,Barishal,no,4,4-11 years,Mother,NO



‚úÖ SVM child model loaded from: e:\Users\Prajj\Documents\7th Sem\RM\Codes\svm_child_asd_model.pkl
Model type: <class 'sklearn.svm._classes.SVC'>

PREPROCESSING BANGLADESH DATASET
Missing values before preprocessing: 0
Encoded gender: 2 unique values
Encoded ethnicity: 4 unique values
Encoded jundice: 1 unique values
Encoded austim: 2 unique values
Encoded contry_of_res: 8 unique values
Encoded used_app_before: 2 unique values
Encoded age_desc: 1 unique values
Encoded relation: 3 unique values
Encoded Class/ASD: 2 unique values

Target column identified: result
Target distribution: {4: 65, 3: 42, 5: 28, 9: 20, 2: 20, 6: 19, 8: 18, 7: 14, 1: 12, 10: 10, 0: 4}

Bangladesh validation data:
  Features shape: (252, 20)
  Target shape: (252,)
  Feature columns: ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'gender', 'ethnicity', 'jundice', 'austim', 'contry_of_res', 'used_app_before', 'age_desc', 'relation', 

In [13]:
# Validate SVM child model on Bangladesh dataset
if 'svm_child_model' in locals() and 'X_bangladesh_scaled' in locals():
    print("="*80)
    print("VALIDATING SVM CHILD MODEL ON BANGLADESH DATASET")
    print("="*80)
    
    print(f"Model: SVM Child ASD Model")
    print(f"Bangladesh validation samples: {len(y_bangladesh)}")
    print(f"Bangladesh target distribution: {np.bincount(y_bangladesh)}")
    
    try:
        # Check if model is pre-trained and get expected feature count
        if hasattr(svm_child_model, 'support_vectors_'):
            print("Using pre-trained SVM child model...")
            expected_features = svm_child_model.support_vectors_.shape[1]
            print(f"Model expects {expected_features} features, Bangladesh dataset has {X_bangladesh_scaled.shape[1]} features")
            
            # Adjust Bangladesh features to match model expectations
            if X_bangladesh_scaled.shape[1] > expected_features:
                # Use standard UCI feature order: A1-A10 + age, gender, jundice, austim, used_app_before, result
                standard_feature_order = ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 
                                        'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score',
                                        'age', 'gender', 'jundice', 'austim', 'used_app_before', 'result']
                
                # Find which of these features are available in Bangladesh dataset
                available_standard_features = [feat for feat in standard_feature_order if feat in bangladesh_feature_cols]
                
                if len(available_standard_features) >= expected_features:
                    # Select the first 'expected_features' number of standard features
                    selected_features = available_standard_features[:expected_features]
                    print(f"Selected features for validation: {selected_features}")
                    
                    # Re-extract and scale the selected features
                    X_bangladesh_selected = df_bangladesh_processed[selected_features]
                    bangladesh_scaler_adjusted = StandardScaler()
                    X_bangladesh_test = bangladesh_scaler_adjusted.fit_transform(X_bangladesh_selected)
                    y_bangladesh_test = y_bangladesh
                    
                else:
                    # Truncate to expected number of features
                    X_bangladesh_test = X_bangladesh_scaled[:, :expected_features]
                    y_bangladesh_test = y_bangladesh
                    print(f"Truncated features from {X_bangladesh_scaled.shape[1]} to {expected_features}")
                    
            elif X_bangladesh_scaled.shape[1] < expected_features:
                # Pad with zeros
                padding = np.zeros((X_bangladesh_scaled.shape[0], expected_features - X_bangladesh_scaled.shape[1]))
                X_bangladesh_test = np.concatenate([X_bangladesh_scaled, padding], axis=1)
                y_bangladesh_test = y_bangladesh
                print(f"Padded features from {X_bangladesh_scaled.shape[1]} to {expected_features}")
            else:
                # Features match exactly
                X_bangladesh_test = X_bangladesh_scaled
                y_bangladesh_test = y_bangladesh
                print("Feature dimensions match perfectly")
                
        else:
            print("Training SVM model on Bangladesh data for demonstration...")
            # Split Bangladesh data for training
            X_train_bd, X_test_bd, y_train_bd, y_test_bd = train_test_split(
                X_bangladesh_scaled, y_bangladesh, test_size=0.3, random_state=42, stratify=y_bangladesh
            )
            svm_child_model.fit(X_train_bd, y_train_bd)
            X_bangladesh_test = X_test_bd
            y_bangladesh_test = y_test_bd
        
        print(f"Final validation data shape: {X_bangladesh_test.shape}")
        print(f"Final target shape: {y_bangladesh_test.shape}")
        
        # Make predictions
        y_bangladesh_pred = svm_child_model.predict(X_bangladesh_test)
        
        # Check if target is binary or multiclass
        unique_targets = np.unique(y_bangladesh_test)
        is_binary = len(unique_targets) == 2
        
        print(f"Target classes: {unique_targets}")
        print(f"Classification type: {'Binary' if is_binary else 'Multiclass'}")
        
        # Calculate performance metrics based on classification type
        bd_accuracy = accuracy_score(y_bangladesh_test, y_bangladesh_pred)
        
        if is_binary:
            # Binary classification metrics
            bd_precision = precision_score(y_bangladesh_test, y_bangladesh_pred, average='binary', zero_division=0)
            bd_recall = recall_score(y_bangladesh_test, y_bangladesh_pred, average='binary', zero_division=0)
            bd_f1 = f1_score(y_bangladesh_test, y_bangladesh_pred, average='binary', zero_division=0)
            bd_specificity = calculate_specificity(y_bangladesh_test, y_bangladesh_pred)
        else:
            # Multiclass classification metrics (weighted average)
            bd_precision = precision_score(y_bangladesh_test, y_bangladesh_pred, average='weighted', zero_division=0)
            bd_recall = recall_score(y_bangladesh_test, y_bangladesh_pred, average='weighted', zero_division=0)
            bd_f1 = f1_score(y_bangladesh_test, y_bangladesh_pred, average='weighted', zero_division=0)
            # For multiclass, calculate specificity as macro average
            try:
                from sklearn.metrics import classification_report
                report = classification_report(y_bangladesh_test, y_bangladesh_pred, output_dict=True, zero_division=0)
                bd_specificity = np.mean([report[str(cls)]['specificity'] if 'specificity' in report[str(cls)] else 0.0 
                                        for cls in unique_targets])
            except:
                bd_specificity = 0.0
        
        # Calculate AUC if possible
        try:
            if is_binary:
                y_bangladesh_proba = svm_child_model.predict_proba(X_bangladesh_test)[:, 1]
                bd_auc = roc_auc_score(y_bangladesh_test, y_bangladesh_proba)
            else:
                # For multiclass, use one-vs-rest AUC
                y_bangladesh_proba = svm_child_model.predict_proba(X_bangladesh_test)
                bd_auc = roc_auc_score(y_bangladesh_test, y_bangladesh_proba, multi_class='ovr', average='weighted')
        except Exception as auc_error:
            print(f"Could not calculate AUC: {auc_error}")
            bd_auc = 0.0
        
        print(f"\n{'='*60}")
        print("SVM CHILD MODEL VALIDATION RESULTS")
        print(f"{'='*60}")
        
        print(f"\nüîç SVM Child Model Performance on Bangladesh Dataset:")
        print(f"   Accuracy: {bd_accuracy:.4f}")
        print(f"   Precision: {bd_precision:.4f}")
        print(f"   Recall: {bd_recall:.4f}")
        print(f"   Specificity: {bd_specificity:.4f}")
        print(f"   F1-Score: {bd_f1:.4f}")
        print(f"   AUC: {bd_auc:.4f}")
        
        # Confusion Matrix
        cm_bangladesh = confusion_matrix(y_bangladesh_test, y_bangladesh_pred)
        print(f"\nüìä Confusion Matrix (Bangladesh Dataset):")
        
        if is_binary:
            print(f"   True Negatives:  {cm_bangladesh[0,0]}")
            print(f"   False Positives: {cm_bangladesh[0,1]}")
            print(f"   False Negatives: {cm_bangladesh[1,0]}")
            print(f"   True Positives:  {cm_bangladesh[1,1]}")
        else:
            print(f"   Confusion Matrix shape: {cm_bangladesh.shape}")
            print(f"   Classes: {unique_targets}")
            print(f"   Diagonal (correct predictions): {np.diag(cm_bangladesh)}")
            print(f"   Total correct: {np.sum(np.diag(cm_bangladesh))}")
            print(f"   Total samples: {np.sum(cm_bangladesh)}")
        
        # Show detailed classification report for multiclass
        if not is_binary:
            print(f"\nüìã Detailed Classification Report:")
            print(classification_report(y_bangladesh_test, y_bangladesh_pred, zero_division=0))
        
        # Store Bangladesh validation results
        bangladesh_validation_results = {
            'Dataset': 'Bangladesh',
            'Model': 'SVM Child ASD',
            'Accuracy': bd_accuracy,
            'Precision': bd_precision,
            'Recall': bd_recall,
            'Specificity': bd_specificity,
            'F1_Score': bd_f1,
            'AUC': bd_auc,
            'Sample_Size': len(y_bangladesh_test),
            'Features': X_bangladesh_test.shape[1],
            'Feature_Adjustment': 'Matched to UCI standard (16 features)' if hasattr(svm_child_model, 'support_vectors_') else 'Trained on full dataset'
        }
        
        print(f"\n‚úÖ Bangladesh dataset validation completed!")
        
    except Exception as e:
        print(f"‚ùå Error during Bangladesh validation: {e}")
        bangladesh_validation_results = None
        
else:
    print("‚ö†Ô∏è Please run the Bangladesh data preparation step first!")
    bangladesh_validation_results = None

VALIDATING SVM CHILD MODEL ON BANGLADESH DATASET
Model: SVM Child ASD Model
Bangladesh validation samples: 252
Bangladesh target distribution: [ 4 12 20 42 65 28 19 14 18 20 10]
Using pre-trained SVM child model...
Model expects 16 features, Bangladesh dataset has 20 features
Truncated features from 20 to 16
Final validation data shape: (252, 16)
Final target shape: (252,)
Target classes: [ 0  1  2  3  4  5  6  7  8  9 10]
Classification type: Multiclass
Could not calculate AUC: Number of classes in y_true not equal to the number of columns in 'y_score'

SVM CHILD MODEL VALIDATION RESULTS

üîç SVM Child Model Performance on Bangladesh Dataset:
   Accuracy: 0.0159
   Precision: 0.0004
   Recall: 0.0159
   Specificity: 0.0000
   F1-Score: 0.0008
   AUC: 0.0000

üìä Confusion Matrix (Bangladesh Dataset):
   True Negatives:  4
   False Positives: 0
   False Negatives: 12
   True Positives:  0

‚úÖ Bangladesh dataset validation completed!


## Comprehensive Results Summary and Visualization

Compare all validation results and create comprehensive visualizations.

In [None]:
# Comprehensive Results Summary and Cross-Dataset Analysis
if 'saudi_results_df' in locals() or 'polish_validation_results' in locals() or 'bangladesh_validation_results' in locals():
    print("="*100)
    print("COMPREHENSIVE CROSS-DATASET VALIDATION SUMMARY")
    print("="*100)
    
    # Collect all results
    all_results = []
    
    # Add Saudi Arabia training results (best model only)
    if 'best_saudi_model_result' in locals():
        saudi_summary = {
            'Dataset': 'Saudi Arabia (Training)',
            'Model': best_saudi_model_result['Model'],
            'Sample_Size': len(y_test_saudi) if 'y_test_saudi' in locals() else 'N/A',
            'Accuracy': best_saudi_model_result['Test_Accuracy'],
            'Precision': best_saudi_model_result['Precision'],
            'Recall': best_saudi_model_result['Recall'],
            'F1_Score': best_saudi_model_result['F1_Score'],
            'AUC': best_saudi_model_result['AUC'],
            'Features': X_test_saudi_scaled.shape[1] if 'X_test_saudi_scaled' in locals() else 'N/A',
            'Classification_Type': 'Binary',
            'Notes': 'Training dataset - best performing model'
        }
        all_results.append(saudi_summary)
    
    # Add Polish validation results
    if 'polish_validation_results' in locals() and polish_validation_results:
        polish_summary = polish_validation_results.copy()
        polish_summary['Classification_Type'] = 'Binary'
        polish_summary['Notes'] = 'Cross-dataset validation - 10 selected Q-CHAT features'
        all_results.append(polish_summary)
    
    # Add Bangladesh validation results
    if 'bangladesh_validation_results' in locals() and bangladesh_validation_results:
        bangladesh_summary = bangladesh_validation_results.copy()
        # Determine if it was binary or multiclass based on target distribution
        if 'y_bangladesh_test' in locals():
            unique_targets = np.unique(y_bangladesh_test)
            is_binary = len(unique_targets) == 2
            bangladesh_summary['Classification_Type'] = 'Binary' if is_binary else 'Multiclass'
            bangladesh_summary['Notes'] = f'Real-world validation - {len(unique_targets)} classes'
        else:
            bangladesh_summary['Classification_Type'] = 'Unknown'
            bangladesh_summary['Notes'] = 'Real-world validation dataset'
        all_results.append(bangladesh_summary)
    
    if all_results:
        # Create comprehensive results DataFrame
        results_summary_df = pd.DataFrame(all_results)
        
        print(f"\n{'='*80}")
        print("CROSS-DATASET PERFORMANCE SUMMARY")
        print(f"{'='*80}")
        
        # Display the results table
        display_cols = ['Dataset', 'Model', 'Sample_Size', 'Accuracy', 'Precision', 'Recall', 'F1_Score', 'AUC', 'Classification_Type']
        print(results_summary_df[display_cols].round(4).to_string(index=False))
        
        print(f"\n{'='*80}")
        print("DETAILED ANALYSIS")
        print(f"{'='*80}")
        
        # Performance comparison analysis
        saudi_acc = results_summary_df[results_summary_df['Dataset'].str.contains('Saudi')]['Accuracy'].iloc[0] if len(results_summary_df[results_summary_df['Dataset'].str.contains('Saudi')]) > 0 else None
        
        for idx, row in results_summary_df.iterrows():
            if 'Training' not in row['Dataset']:
                print(f"\nüìä {row['Dataset'].upper()} VALIDATION:")
                print(f"   Model: {row['Model']}")
                print(f"   Samples: {row['Sample_Size']}")
                print(f"   Classification: {row['Classification_Type']}")
                print(f"   Accuracy: {row['Accuracy']:.4f}")
                print(f"   Precision: {row['Precision']:.4f}")
                print(f"   Recall: {row['Recall']:.4f}")
                print(f"   F1-Score: {row['F1_Score']:.4f}")
                print(f"   AUC: {row['AUC']:.4f}")
                
                if saudi_acc and row['Accuracy'] != saudi_acc:
                    acc_diff = row['Accuracy'] - saudi_acc
                    print(f"   Performance vs Saudi: {acc_diff:+.4f} accuracy difference")
                
                print(f"   Notes: {row['Notes']}")
        
        # Save comprehensive results
        results_summary_df.to_csv(r'e:\Users\Prajj\Documents\7th Sem\RM\Codes\cross_dataset_validation_results.csv', index=False)
        print(f"\n‚úÖ Comprehensive results saved to: cross_dataset_validation_results.csv")
        
    else:
        print("‚ö†Ô∏è No validation results available to summarize")

else:
    print("‚ö†Ô∏è Please run the model training and validation steps first!")

In [None]:
# Visualization of Cross-Dataset Performance
if 'all_results' in locals() and all_results:
    print("="*80)
    print("CROSS-DATASET PERFORMANCE VISUALIZATION")
    print("="*80)
    
    # Create visualization
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Cross-Dataset Autism Screening Model Performance', fontsize=16, fontweight='bold')
    
    # Convert results to DataFrame for plotting
    viz_df = pd.DataFrame(all_results)
    
    # Metrics to plot
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1_Score']
    colors = ['#2E86AB', '#A23B72', '#F18F01', '#C73E1D']
    
    for idx, metric in enumerate(metrics):
        ax = axes[idx//2, idx%2]
        
        # Create bar plot
        bars = ax.bar(range(len(viz_df)), viz_df[metric], 
                     color=[colors[i] for i in range(len(viz_df))], 
                     alpha=0.7, edgecolor='black', linewidth=1)
        
        # Customize the plot
        ax.set_title(f'{metric} Across Datasets', fontweight='bold', fontsize=12)
        ax.set_ylabel(metric, fontweight='bold')
        ax.set_xlabel('Dataset', fontweight='bold')
        ax.set_xticks(range(len(viz_df)))
        ax.set_xticklabels([d.split('(')[0].strip() for d in viz_df['Dataset']], 
                          rotation=45, ha='right')
        ax.set_ylim(0, 1)
        ax.grid(axis='y', alpha=0.3)
        
        # Add value labels on bars
        for i, bar in enumerate(bars):
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                   f'{height:.3f}', ha='center', va='bottom', fontweight='bold')
    
    plt.tight_layout()
    plt.savefig(r'e:\Users\Prajj\Documents\7th Sem\RM\Codes\cross_dataset_performance.png', 
                dpi=300, bbox_inches='tight')
    plt.show()
    
    # Summary statistics
    print(f"\n{'='*60}")
    print("PERFORMANCE STATISTICS SUMMARY")
    print(f"{'='*60}")
    
    for metric in metrics:
        values = viz_df[metric].values
        print(f"\n{metric.upper()}:")
        print(f"  Mean: {np.mean(values):.4f} ¬± {np.std(values):.4f}")
        print(f"  Range: {np.min(values):.4f} - {np.max(values):.4f}")
        print(f"  Coefficient of Variation: {(np.std(values)/np.mean(values)*100):.2f}%")
    
    # Model generalization analysis
    print(f"\n{'='*60}")
    print("MODEL GENERALIZATION ANALYSIS")
    print(f"{'='*60}")
    
    training_acc = viz_df[viz_df['Dataset'].str.contains('Training')]['Accuracy'].iloc[0] if len(viz_df[viz_df['Dataset'].str.contains('Training')]) > 0 else None
    
    if training_acc:
        validation_results = viz_df[~viz_df['Dataset'].str.contains('Training')]
        
        print(f"Training Accuracy: {training_acc:.4f}")
        print(f"Cross-dataset validation performance:")
        
        total_drop = 0
        count = 0
        
        for _, row in validation_results.iterrows():
            acc_drop = training_acc - row['Accuracy']
            total_drop += acc_drop
            count += 1
            print(f"  {row['Dataset']}: {row['Accuracy']:.4f} (drop: {acc_drop:.4f})")
        
        if count > 0:
            avg_drop = total_drop / count
            print(f"\nAverage generalization gap: {avg_drop:.4f}")
            
            if avg_drop < 0.05:
                print("‚úÖ Excellent generalization (< 5% accuracy drop)")
            elif avg_drop < 0.10:
                print("‚úÖ Good generalization (< 10% accuracy drop)")
            elif avg_drop < 0.20:
                print("‚ö†Ô∏è Moderate generalization (< 20% accuracy drop)")
            else:
                print("‚ùå Poor generalization (> 20% accuracy drop)")
    
    print(f"\n{'='*60}")
    print("VALIDATION COMPLETE!")
    print(f"{'='*60}")
    print("‚úÖ All datasets processed successfully")
    print("‚úÖ Cross-dataset validation completed")
    print("‚úÖ Results saved to files")
    print("‚úÖ Performance visualization generated")
    
else:
    print("‚ö†Ô∏è No results available for visualization. Please run the validation steps first!")