<a href="https://colab.research.google.com/github/Santhu489/autismm/blob/main/autsim_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.over_sampling import RandomOverSampler

# Load the SFARI gene table.
# NOTE(review): the relative path presumably points into a mounted Google
# Drive (Colab) -- confirm the drive is mounted before this cell runs.
genes = pd.read_csv("drive/MyDrive/Gene/sfari_genes.csv")

# Drop the columns that are used neither as features nor as the label.
columns_to_drop = ['status', 'chromosome', 'number-of-reports', 'gene-name', 'ensembl-id', 'gene-score', 'genetic-category']
genes = genes.drop(columns=columns_to_drop)

# Encode gene symbols as dummy (one-hot) variables.
# NOTE(review): if 'gene-symbol' is the only remaining feature column and each
# symbol occurs once, every feature column identifies exactly one row, so a
# model cannot learn anything that transfers to unseen genes -- confirm that
# this is the intended feature set.
genes_encoded = pd.get_dummies(genes, columns=['gene-symbol'])

# Features (X): everything except the 'syndromic' label column.
X = genes_encoded.drop(columns='syndromic')

# Labels (y).
y = genes_encoded['syndromic']

# Convert to binary classification (1 for syndromic, 0 for everything else).
y_binary = (y == 1).astype(int)

# Split BEFORE resampling. RandomOverSampler balances classes by duplicating
# minority-class rows; if it runs on the full dataset first, copies of the same
# row end up in both the training and the test partition and the test metrics
# are inflated by leakage. Stratifying keeps the original class ratio in the
# (untouched) test set.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y_binary, test_size=0.2, random_state=42, stratify=y_binary
)

# Oversample the training partition only; the test set keeps its natural
# class distribution.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train_raw, y_train_raw)

# Downstream cells train on X_train / y_train.
X_train, y_train = X_resampled, y_resampled

# Initialize the classifiers to compare, keyed by display name (the name is
# also used later when the fitted models are written to disk).
# Seeds are pinned so that a Restart & Run All reproduces the same metrics and
# the same saved models. SVC is left unseeded: its random_state only affects
# probability estimates, which are not enabled here.
classifiers = {
    'XGBoost': XGBClassifier(random_state=42),
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(random_state=42)
}

# Train and evaluate each classifier on the resampled data
for clf_name, clf in classifiers.items():
    # Train the classifier (fits the instance stored in `classifiers` in place)
    clf.fit(X_train, y_train)

    # Make predictions on the held-out split
    y_pred = clf.predict(X_test)

    # Calculate evaluation metrics; precision/recall/F1 use the sklearn
    # defaults, i.e. they are reported for the positive class (label 1)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Print results
    print(f"\nResults for {clf_name} on resampled data:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Per-class breakdown of the same predictions
    report = classification_report(y_test, y_pred)
    print(f"Classification Report for {clf_name} on resampled data:\n{report}")

# Print the gene symbols together with their 'syndromic' flag
# (pandas abbreviates the frame when it is long).
print(f"\nAutism Syndromic Genes:\n{genes[['gene-symbol', 'syndromic']]}")

# Get user input for a gene symbol; surrounding whitespace is never part of a
# symbol, so strip it before matching.
gene_symbol = input("Enter a gene symbol: ").strip()

# Exact matches are handled exactly as before. Only when the exact lookup
# fails, fall back to a case-insensitive comparison (so "shank3" finds
# "SHANK3") and continue with the symbol as it is spelled in the table.
known_symbols = genes['gene-symbol'].dropna().astype(str)
if gene_symbol and gene_symbol not in known_symbols.values:
    case_matches = known_symbols[known_symbols.str.upper() == gene_symbol.upper()]
    if not case_matches.empty:
        gene_symbol = case_matches.iloc[0]

# Check if the gene symbol exists in the data
if gene_symbol in genes['gene-symbol'].values:
    # Extract the corresponding row(s) from the dataframe
    gene_info = genes[genes['gene-symbol'] == gene_symbol]

    # Check if the gene is syndromic or not (first matching row decides).
    # NOTE(review): the messages say "associated with autism", but the value
    # tested is the 'syndromic' flag -- confirm the wording is what is meant.
    if gene_info['syndromic'].values[0] == 1:
        print(f"The gene {gene_symbol} is associated with autism.")
    else:
        print(f"The gene {gene_symbol} is not associated with autism.")
else:
    print("The gene symbol does not exist in the data.")


Results for XGBoost on resampled data:
Accuracy: 0.5521
Precision: 1.0000
Recall: 0.0897
F1 Score: 0.1647
Classification Report for XGBoost on resampled data:
              precision    recall  f1-score   support

           0       0.53      1.00      0.69       161
           1       1.00      0.09      0.16       156

    accuracy                           0.55       317
   macro avg       0.77      0.54      0.43       317
weighted avg       0.76      0.55      0.43       317


Results for SVM on resampled data:
Accuracy: 0.9685
Precision: 1.0000
Recall: 0.9359
F1 Score: 0.9669
Classification Report for SVM on resampled data:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       161
           1       1.00      0.94      0.97       156

    accuracy                           0.97       317
   macro avg       0.97      0.97      0.97       317
weighted avg       0.97      0.97      0.97       317


Results for Random Forest on resa

In [None]:
import pickle

# Persist every fitted estimator from `classifiers` as "<display name>_model.pkl"
# in the current working directory, reporting each file as it is written.
for clf_name, clf in classifiers.items():
    filename = clf_name + "_model.pkl"
    # Serialize to bytes first, then write them out in a single call.
    with open(filename, mode='wb') as file:
        file.write(pickle.dumps(clf))
    print(f"{clf_name} model saved to {filename}")

XGBoost model saved to XGBoost_model.pkl
SVM model saved to SVM_model.pkl
Random Forest model saved to Random Forest_model.pkl
