In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,  GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [4]:
# Absolute location of the VOC-ALS Excel workbook.
# NOTE(review): machine-specific path; the notebook cannot run elsewhere without editing it.
PATH = r"C:\Users\udmul\OneDrive\Desktop\MINI\VOC-ALS.xlsx"

# header=1 takes the column labels from the second spreadsheet row (0-indexed row 1).
df = pd.read_excel(io=PATH, header=1)

In [6]:
# Substrings that disqualify a column from the feature set.
exclude_keywords = ['monologue', 'reading', 'weekdays']
# Substrings identifying the acoustic measures that are kept.
acoustic_markers = ('meanF0Hz_', 'stdevF0Hz_', 'HNR_', 'localJitter_', 'localShimmer_')


def _is_acoustic_feature(column_name):
    """Return True when the name contains a wanted marker and no excluded keyword."""
    if not any(marker in column_name for marker in acoustic_markers):
        return False
    return not any(keyword in column_name for keyword in exclude_keywords)


# 'Age (years)' and 'Sex' first, then every matching acoustic column in df's own order.
feature_cols = ['Age (years)', 'Sex'] + list(filter(_is_acoustic_feature, df.columns))

# Independent copy, so later edits to X leave df untouched.
X = df[feature_cols].copy()

In [8]:
# Replace the 'Sex' labels with the integer codes of a freshly fitted LabelEncoder.
sex_encoder = LabelEncoder()
X['Sex'] = sex_encoder.fit_transform(X['Sex'])

# Binary target: 1 where 'Category' equals 'ALS', 0 for every other value.
y = df['Category'].eq('ALS').astype('int64')

In [10]:
# Seeded 80/20 hold-out split of the real data, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [12]:
# Table the synthesizer learns from: training features plus their labels.
# Only the training partition is used, so the held-out rows produced by the
# split above never influence the synthetic-data generator (the previous
# version copied all of X / y and leaked the test rows into it).
# Relies on X_train / y_train still being the pandas objects from that split.
train_data = X_train.copy()
train_data['Category'] = y_train.values

In [14]:
from sdv.metadata import Metadata
from sdv.single_table import CTGANSynthesizer

from pathlib import Path

# Persisted SDV metadata for the training table.
METADATA_PATH = 'my_metadata_v2.json'

if Path(METADATA_PATH).exists():
    # Reuse the saved description so repeated runs work from the same metadata.
    metadata = Metadata.load_from_json(filepath=METADATA_PATH)
else:
    # First run: infer the metadata from the data and persist it. Saving is
    # done only in this branch so an existing file is never written over.
    metadata = Metadata.detect_from_dataframe(data=train_data)
    metadata.save_to_json(filepath=METADATA_PATH)

# CTGAN synthesizer configured with the table metadata (not yet fitted).
synth = CTGANSynthesizer(metadata)

In [18]:
# Train the synthesizer on the real table, then draw 2000 synthetic rows from it.
synth.fit(data=train_data)
synthetic_data = synth.sample(
    num_rows=2000,
)

In [20]:
# Persist the fitted synthesizer so it can be reloaded without retraining.
synth.save(filepath='ALS_synth_3.0.pkl')

In [22]:
from sdv.single_table import CTGANSynthesizer

# Rebind `synth` to the synthesizer stored on disk (replaces the in-memory one).
synth = CTGANSynthesizer.load(filepath='ALS_synth_3.0.pkl')

In [24]:
from sdv.evaluation.single_table import run_diagnostic, evaluate_quality
from sdv.evaluation.single_table import get_column_plot

# 1. Validity / structure checks of the synthetic table against the real one.
diagnostic = run_diagnostic(
    real_data=train_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
)

# 2. Statistical similarity (column shapes and column-pair trends).
quality_report = evaluate_quality(
    real_data=train_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
)

# 3. Real-vs-synthetic comparison plot for the 'Category' column.
fig = get_column_plot(
    real_data=train_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
    column_name='Category',
)
fig.show()

Generating report ...

(1/2) Evaluating Data Validity: |███████████████████████████████████████████████████| 43/43 [00:00<00:00, 2689.22it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |█████████████████████████████████████████████████████| 1/1 [00:00<00:00, 111.11it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%

Generating report ...

(1/2) Evaluating Column Shapes: |████████████████████████████████████████████████████| 43/43 [00:00<00:00, 623.18it/s]|
Column Shapes Score: 71.39%

(2/2) Evaluating Column Pair Trends: |█████████████████████████████████████████████| 903/903 [00:02<00:00, 362.44it/s]|
Column Pair Trends Score: 81.73%

Overall Score (Average): 76.56%



In [26]:
# Split the synthetic table into features and labels without touching
# synthetic_data itself: copy it, then pop the label column off the copy.
X_synth = synthetic_data.copy()
y_synth = X_synth.pop('Category')

In [28]:
# Stack the real rows on top of the synthetic rows (row-wise concatenation).
# Index labels are kept as-is, so labels from the two sources may repeat.
X_combined = pd.concat(objs=[X, X_synth])
y_combined = pd.concat(objs=[y, y_synth])

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectFromModel

from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

In [32]:
# Seeded 80/20 split of the combined (real + synthetic) data; rebinds the
# X_train / X_test / y_train / y_test names. Because X_combined contains
# synthetic rows, the resulting test partition can contain them too.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined,
    y_combined,
    test_size=0.2,
    random_state=42,
)

In [34]:
# 5. Feature scaling: learn the standardisation statistics from the training
# partition only, then apply the same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [36]:
import pickle

# Serialise the fitted scaler so the same transformation can be reapplied later.
with open("als_scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

In [38]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score,accuracy_score

In [89]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# XGBoost classifier with library defaults, fitted on the current training split.
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)

# Score on the current held-out split (whatever X_test / y_test the preceding
# split cell produced; it is not restricted to real rows).
y_pred = xgb_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test set:", test_accuracy)

Accuracy on test set: 0.7461832061068703


In [91]:
# Seeded random forest with otherwise default settings.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out split.
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.73      0.72      0.73       252
           1       0.74      0.76      0.75       272

    accuracy                           0.74       524
   macro avg       0.74      0.74      0.74       524
weighted avg       0.74      0.74      0.74       524



In [93]:
# f1_score is imported here because no earlier cell imports it; without this
# the cell raises NameError when the notebook is run top to bottom.
from sklearn.metrics import accuracy_score, f1_score

# Probability of class 1 for each test row (column 1 of predict_proba).
y_proba = xgb_model.predict_proba(X_test)[:, 1]

# Custom decision rule: predict class 1 whenever its probability exceeds 0.4.
y_pred_custom = (y_proba > 0.4).astype(int)

print("F1 Score:", f1_score(y_test, y_pred_custom))
print("Accuracy:", accuracy_score(y_test, y_pred_custom))

F1 Score: 0.7759562841530054
Accuracy: 0.7652671755725191


In [95]:
import joblib
# Persist both fitted classifiers: `model` and `xgb_model`.
joblib.dump(value=model, filename='model_rem.pkl')
joblib.dump(value=xgb_model, filename='xgb_model_rem.pkl')

['xgb_model_rem.pkl']

In [46]:
# Draw a new batch of 2000 synthetic rows, replacing the earlier sample.
synthetic_data = synth.sample(
    num_rows=2000,
)

In [48]:
# Labels are the 'Category' column; features are every other column.
y_synth = synthetic_data['Category']
X_synth = synthetic_data.drop(columns=['Category'])

In [50]:
# Append the synthetic rows beneath the real ones; original index labels are kept.
X_combined = pd.concat(objs=[X, X_synth])
y_combined = pd.concat(objs=[y, y_synth])

In [56]:
from imblearn.over_sampling import SMOTE
# Seeded SMOTE oversampling of the combined data set.
# NOTE(review): resampling runs on the full combined data, before the
# train/test split in the next cell, so interpolated samples can end up in
# the test partition. Consider splitting first and resampling only the
# training part.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X=X_combined, y=y_combined)


`BaseEstimator._validate_data` is deprecated in 1.6 and will be removed in 1.7. Use `sklearn.utils.validation.validate_data` instead. This function becomes public and is part of the scikit-learn developer API.



In [58]:
# Seeded 80/20 split of the SMOTE-resampled data; rebinds the split variables.
X_train, X_test, y_train, y_test = train_test_split(
    X_res,
    y_res,
    test_size=0.2,
    random_state=42,
)

In [61]:
# 5. Feature scaling: a new scaler fitted on this training partition only and
# then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [63]:
import pickle

# Serialise the scaler fitted on the resampled training data.
with open("als_scaler2.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

In [65]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score,accuracy_score

In [67]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Seeded logistic regression with otherwise default settings, fitted on the
# training partition.
log_reg = LogisticRegression(random_state=42).fit(X_train, y_train)

# Accuracy of its predictions on the held-out partition.
y_pred = log_reg.predict(X_test)
print("Accuracy on test set:", accuracy_score(y_test, y_pred))

Accuracy on test set: 0.6412213740458015


In [69]:
from sklearn.linear_model import LogisticRegression
# Same model family, this time with class_weight='balanced'; rebinds log_reg.
log_reg = LogisticRegression(class_weight='balanced').fit(X_train, y_train)

# Accuracy of its predictions on the held-out partition.
y_pred = log_reg.predict(X_test)
print("Accuracy on test set:", accuracy_score(y_test, y_pred))

Accuracy on test set: 0.6450381679389313


In [71]:
from xgboost import XGBClassifier
# Unfitted XGBoost classifier with scale_pos_weight set to 1.5; rebinds `model`.
model = XGBClassifier(
    scale_pos_weight=1.5,
)

In [73]:
from sklearn.model_selection import GridSearchCV

# Search space: three regularisation strengths crossed with L1/L2 penalties,
# all using the liblinear solver.
params = dict(
    C=[0.1, 1, 10],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)

# Grid search over logistic regression, selecting on the F1 score.
grid = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=params,
    scoring='f1',
)
grid.fit(X_train, y_train)

In [75]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
)

# Winning hyper-parameter combination from the grid search.
print("Best Parameters:", grid.best_params_)

# Evaluate the best estimator on the held-out partition.
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

print("F1 Score:", f1_score(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

Best Parameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Confusion Matrix:
 [[155  97]
 [ 92 180]]

Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.62      0.62       252
           1       0.65      0.66      0.66       272

    accuracy                           0.64       524
   macro avg       0.64      0.64      0.64       524
weighted avg       0.64      0.64      0.64       524

F1 Score: 0.6557377049180327
Accuracy: 0.6393129770992366


In [77]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of `model` on X / y (not the split/scaled arrays),
# scored with macro-averaged F1; the mean over the folds is reported.
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    cv=5,
    scoring='f1_macro',
)
print("Cross-validated F1:", scores.mean())

Cross-validated F1: 0.4735910949854353


In [79]:
from xgboost import XGBClassifier

# XGBoost classifier with library defaults, fitted on the current training split.
model = XGBClassifier()
model.fit(X_train, y_train)

# Probability of class 1 for every test row (column 1 of predict_proba).
# Computed once; the earlier version repeated this identical call.
y_proba = model.predict_proba(X_test)[:, 1]

# A 0.4 cut-off (below the usual 0.5) labels more rows as class 1.
y_pred_custom = (y_proba > 0.4).astype(int)

In [81]:
# Report F1 and accuracy of the custom-threshold predictions, in that order.
for metric_label, metric_fn in (("F1 Score:", f1_score), ("Accuracy:", accuracy_score)):
    print(metric_label, metric_fn(y_test, y_pred_custom))

F1 Score: 0.7759562841530054
Accuracy: 0.7652671755725191


In [83]:
# 3-nearest-neighbours classifier; n_neighbors is the value to vary when tuning.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Predictions on the held-out partition.
y_pred_knn = knn.predict(X_test)

# Headline accuracy first, then the per-class report and the confusion matrix.
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print("KNN Accuracy:", accuracy_knn)
print("\nClassification Report:\n", classification_report(y_test, y_pred_knn))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))

KNN Accuracy: 0.5896946564885496

Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.81      0.65       252
           1       0.69      0.39      0.49       272

    accuracy                           0.59       524
   macro avg       0.62      0.60      0.57       524
weighted avg       0.62      0.59      0.57       524


Confusion Matrix:
 [[204  48]
 [167 105]]


In [87]:
import joblib
# Persist the XGBoost classifier trained in this (SMOTE) pipeline, which is
# bound to `model`. `xgb_model` is the earlier pipeline's classifier and is
# already saved as 'xgb_model_rem.pkl', so dumping it here would store a
# model that does not match this pipeline's scaler ('als_scaler2.pkl').
joblib.dump(model, 'xgb_model2.pkl')

['xgb_model2.pkl']