<a href="https://colab.research.google.com/github/Srayoshi-Mirza/Genetic-Disorder-Prediction-System-with-Streamlit/blob/main/Genetic_Disorder_Prediction_System_with_Streamlit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install streamlit

In [None]:
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Fetch the training and test CSVs straight from the project repository
TRAIN_CSV_URL = 'https://raw.githubusercontent.com/Srayoshi-Mirza/Genetic-Disorder-Prediction-System-with-Streamlit/main/train_genetic_disorders.csv'
TEST_CSV_URL = 'https://raw.githubusercontent.com/Srayoshi-Mirza/Genetic-Disorder-Prediction-System-with-Streamlit/main/test_genetic_disorders.csv'
train_data, test_data = (pd.read_csv(url) for url in (TRAIN_CSV_URL, TEST_CSV_URL))

# Stack both splits into a single frame with a fresh 0..n-1 index so the
# preprocessing steps below operate on every row at once
combined_data = pd.concat((train_data, test_data), ignore_index=True)

# Define a function to handle missing values
def impute_missing_values(data):
    """Return a copy of ``data`` with its missing values filled in.

    Numeric columns are filled with their own column mean; every other
    column is filled with the placeholder string ``'Unknown'``. The frame
    passed in is not modified.

    Args:
        data: pandas DataFrame to impute.

    Returns:
        A new DataFrame with the same index and columns as ``data``.
        A numeric column that is entirely missing has no mean and is
        therefore left as NaN.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect
    imputed = data.copy()

    # Impute missing values in numerical columns with the column mean
    numerical_cols = imputed.select_dtypes(include=[np.number]).columns
    if len(numerical_cols) > 0:
        column_means = imputed[numerical_cols].mean()
        imputed[numerical_cols] = imputed[numerical_cols].fillna(column_means)

    # Impute missing values in non-numerical columns with a placeholder value
    categorical_cols = imputed.select_dtypes(exclude=[np.number]).columns
    if len(categorical_cols) > 0:
        imputed[categorical_cols] = imputed[categorical_cols].fillna('Unknown')

    return imputed

# Handle missing values
# NOTE(review): imputation runs over every column, including the
# 'Birth defects' target used below; if that column is non-numeric, rows with
# a missing target end up labelled 'Unknown' — confirm this is intended.
combined_data = impute_missing_values(combined_data)

# Identify categorical columns (one-hot encoded below)
categorical_columns = [
    'Genes in mother\'s side',
    'Inherited from father',
    'Maternal gene',
    'Paternal gene',
    'Gender',
    'Birth asphyxia',
    'Autopsy shows birth defect (if applicable)',
    'Place of birth',
    'Folic acid details (peri-conceptional)',
    'H/O serious maternal illness',
    'H/O radiation exposure (x-ray)',
    'H/O substance abuse',
    'Assisted conception IVF/ART',
    'History of anomalies in previous pregnancies',
    'Blood test result',
]

# Identify symptom columns (treated as numerical features and standardized)
symptom_columns = ['Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5']

# Create separate dataframes for categorical and symptom columns
combined_data_categorical = combined_data[categorical_columns]
combined_data_symptoms = combined_data[symptom_columns]

# Encode categorical variables using one-hot encoding. Categories unseen at
# fit time are encoded as all zeros instead of raising.
# The encoder's `sparse=False` keyword was deprecated in scikit-learn 1.2 and
# later removed, so it is not passed here; dense output is requested through
# the ColumnTransformer instead, which works on old and new releases alike.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

# Create a column transformer to apply one-hot encoding to categorical columns
column_transformer = ColumnTransformer(
    transformers=[('one_hot', one_hot_encoder, categorical_columns)],
    remainder='passthrough',
    sparse_threshold=0,  # always return a dense array
)

# Build the feature blocks: one-hot encoded categoricals + standardized symptoms
X_categorical = column_transformer.fit_transform(combined_data_categorical)

# Standardize only the numerical (symptom) features
scaler = StandardScaler()
X_numerical = scaler.fit_transform(combined_data_symptoms)

# Assemble features (X) and the target variable (y). Both blocks are wrapped
# in fresh DataFrames so they share a 0..n-1 index and line up row by row.
X = pd.concat([pd.DataFrame(X_categorical), pd.DataFrame(X_numerical)], axis=1)
y = combined_data['Birth defects']

# Build a logistic regression classifier with default settings and fit it on
# the combined feature matrix and target in one step; fit() hands back the
# estimator itself, so model_lr is the trained model.
model_lr = LogisticRegression().fit(X, y)

# Create a Streamlit app
st.title("Genetic Disorder Prediction App")

user_input = {}

# User input for symptoms
st.subheader("Input Symptoms for Prediction")
for feature in symptom_columns:
    user_input[feature] = st.number_input(f"Enter {feature}", value=0.0)

# User input for the categorical features. The model was trained on these
# columns as well, so each one must be present in the input row; offering
# exactly the categories the fitted encoder learned keeps the input aligned
# with the training-time encoding.
st.subheader("Input Patient Details for Prediction")
fitted_encoder = column_transformer.named_transformers_['one_hot']
for feature, categories in zip(categorical_columns, fitted_encoder.categories_):
    user_input[feature] = st.selectbox(f"Select {feature}", list(categories))

# Prepare the user input data as a single-row frame
user_input_data = pd.DataFrame([user_input])
user_input_data = impute_missing_values(user_input_data)  # Handle missing values

# Apply the already-fitted one-hot encoding and scaling to the user input
user_input_data_categorical = column_transformer.transform(user_input_data[categorical_columns])
user_input_data_symptoms = scaler.transform(user_input_data[symptom_columns])

# Concatenate in the same order used for training: encoded categoricals first,
# then the standardized symptoms
user_input_data_encoded = pd.concat(
    [pd.DataFrame(user_input_data_categorical), pd.DataFrame(user_input_data_symptoms)],
    axis=1,
)

# Make a prediction using user input
prediction = model_lr.predict(user_input_data_encoded)

# Display the prediction to the user
st.subheader("Prediction Result")
st.write("Predicted Birth Defects:")
st.write(prediction[0])




ValueError: ignored