In [4]:
# model_train.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import joblib

# Load data
df = pd.read_csv('D:\oasis_longitudinal.csv')

# Preprocessing
df = df.drop(['Subject ID', 'MRI ID'], axis=1)

# Convert categorical variables
df['M/F'] = df['M/F'].map({'M': 1, 'F': 0})
df['Hand'] = df['Hand'].map({'R': 1, 'L': 0})

# Handle missing values
df = df.dropna(subset=['Group'])  # Remove rows with missing target

# Prepare features and target
X = df.drop('Group', axis=1)
y = df['Group']

# Encode target variable
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Impute missing values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_imputed, y_train)

# Evaluate
y_pred = model.predict(X_test_imputed)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save artifacts
joblib.dump(model, 'model.pkl')
joblib.dump(imputer, 'imputer.pkl')
joblib.dump(le, 'label_encoder.pkl')
joblib.dump(X.columns.tolist(), 'feature_columns.pkl')

Accuracy: 0.87

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.27      0.40        11
           1       0.94      0.97      0.95        32
           2       0.82      0.97      0.89        32

    accuracy                           0.87        75
   macro avg       0.84      0.74      0.75        75
weighted avg       0.86      0.87      0.84        75



['feature_columns.pkl']