In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt
from xgboost import XGBRegressor

In [2]:
dataset = pd.read_csv("asthma_disease_data.csv", index_col=None)
df2 = dataset
df2 = pd.get_dummies(df2, drop_first=True)

indep_X = df2.drop(['PatientID','Ethnicity','EducationLevel','PhysicalActivity','GastroesophagealReflux','PollenExposure'], axis=1)
dep_Y = df2['Diagnosis']

In [3]:
dataset

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,PhysicalActivity,DietQuality,SleepQuality,...,LungFunctionFEV1,LungFunctionFVC,Wheezing,ShortnessOfBreath,ChestTightness,Coughing,NighttimeSymptoms,ExerciseInduced,Diagnosis,DoctorInCharge
0,5034,63,0,1,0,15.848744,0,0.894448,5.488696,8.701003,...,1.369051,4.941206,0,0,1,0,0,1,0,Dr_Confid
1,5035,26,1,2,2,22.757042,0,5.897329,6.341014,5.153966,...,2.197767,1.702393,1,0,0,1,1,1,0,Dr_Confid
2,5036,57,0,2,1,18.395396,0,6.739367,9.196237,6.840647,...,1.698011,5.022553,1,1,1,0,1,1,0,Dr_Confid
3,5037,40,1,2,1,38.515278,0,1.404503,5.826532,4.253036,...,3.032037,2.300159,1,0,1,1,1,0,0,Dr_Confid
4,5038,61,0,0,3,19.283802,0,4.604493,3.127048,9.625799,...,3.470589,3.067944,1,1,1,0,0,1,0,Dr_Confid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,7421,43,1,0,2,29.059613,0,3.019854,6.119637,8.300960,...,3.125249,5.166032,0,1,0,0,0,1,1,Dr_Confid
2388,7422,18,1,0,1,20.740850,0,5.805180,4.386992,7.731192,...,1.132977,5.509502,0,0,0,1,1,0,1,Dr_Confid
2389,7423,54,0,3,2,37.079560,0,4.735169,8.214064,7.483521,...,1.685962,3.346877,1,0,1,1,0,1,1,Dr_Confid
2390,7424,46,1,0,2,23.444712,0,9.672637,7.362861,6.717272,...,3.481549,1.713274,0,1,1,0,1,1,0,Dr_Confid


In [4]:
dataset.columns

Index(['PatientID', 'Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI',
       'Smoking', 'PhysicalActivity', 'DietQuality', 'SleepQuality',
       'PollutionExposure', 'PollenExposure', 'DustExposure', 'PetAllergy',
       'FamilyHistoryAsthma', 'HistoryOfAllergies', 'Eczema', 'HayFever',
       'GastroesophagealReflux', 'LungFunctionFEV1', 'LungFunctionFVC',
       'Wheezing', 'ShortnessOfBreath', 'ChestTightness', 'Coughing',
       'NighttimeSymptoms', 'ExerciseInduced', 'Diagnosis', 'DoctorInCharge'],
      dtype='object')

In [5]:
# Display the initial number of rows
print(f"Initial number of rows: {dataset.shape[0]}")

# Remove duplicate rows
dataset_cleaned = dataset.drop_duplicates()

Initial number of rows: 2392


In [6]:
# Display the number of rows after removing duplicates
print(f"Number of rows after removing duplicates: {dataset_cleaned.shape[0]}")

# Optionally, reset the index if needed
dataset_cleaned.reset_index(drop=True, inplace=True)

Number of rows after removing duplicates: 2392


In [7]:
columns_to_drop=['PatientID','Ethnicity','EducationLevel','PhysicalActivity','GastroesophagealReflux','PhysicalActivity',
                 'DietQuality','SleepQuality','PollenExposure','HistoryOfAllergies','NighttimeSymptoms','ExerciseInduced',
                 'Eczema','HayFever','LungFunctionFVC','DoctorInCharge']
dataset=dataset.drop(columns=columns_to_drop)

In [8]:
indep_X = df2.drop(['PatientID','Ethnicity','EducationLevel','PhysicalActivity','GastroesophagealReflux','PhysicalActivity',
                    'DietQuality','SleepQuality','PollenExposure','HistoryOfAllergies','NighttimeSymptoms','ExerciseInduced',
                   'Eczema','HayFever','LungFunctionFVC'],axis=1)
dep_Y = df2['Diagnosis']
dataset

Unnamed: 0,Age,Gender,BMI,Smoking,PollutionExposure,DustExposure,PetAllergy,FamilyHistoryAsthma,LungFunctionFEV1,Wheezing,ShortnessOfBreath,ChestTightness,Coughing,Diagnosis
0,63,0,15.848744,0,7.388481,0.974339,1,1,1.369051,0,0,1,0,0
1,26,1,22.757042,0,1.969838,6.584631,0,0,2.197767,1,0,0,1,0
2,57,0,18.395396,0,1.460593,5.445799,0,1,1.698011,1,1,1,0,0
3,40,1,38.515278,0,0.581905,3.965316,0,0,3.032037,1,0,1,1,0
4,61,0,19.283802,0,0.980875,8.260605,0,0,3.470589,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,43,1,29.059613,0,2.483829,3.425445,1,0,3.125249,0,1,0,0,1
2388,18,1,20.740850,0,7.733983,6.467701,0,0,1.132977,0,0,0,1,1
2389,54,0,37.079560,0,2.794847,9.484013,0,0,1.685962,1,0,1,1,1
2390,46,1,23.444712,0,9.448862,5.051405,0,1,3.481549,0,1,1,0,0


In [9]:
def split_scalar(indep_X,dep_Y):
    X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size = 0.25, random_state = 0)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)    
    return X_train, X_test, y_train, y_test

In [10]:
def r2_prediction(regressor,X_test,y_test):
    y_pred = regressor.predict(X_test)
    from sklearn.metrics import r2_score
    r2=r2_score(y_test,y_pred)
    return r2

In [11]:
def Linear(X_train,y_train,X_test):       
    from sklearn.linear_model import LinearRegression
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    r2=r2_prediction(regressor,X_test,y_test)
    return  r2 

In [12]:
def Decision(X_train,y_train,X_test):
    from sklearn.tree import DecisionTreeRegressor
    regressor = DecisionTreeRegressor(random_state = 0)
    regressor.fit(X_train, y_train)
    r2=r2_prediction(regressor,X_test,y_test)
    return  r2 

In [13]:
def random(X_train,y_train,X_test):       
    from sklearn.ensemble import RandomForestRegressor
    regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
    regressor.fit(X_train, y_train)
    r2=r2_prediction(regressor,X_test,y_test)
    return  r2

In [14]:
def xgboost(X_train,y_train,X_test):       
    from xgboost import XGBRegressor
    regressor = XGBRegressor(n_jobs=5,learning_rate=0.1,max_depth=10,random_state=1)
    regressor.fit(X_train, y_train)
    r2=r2_prediction(regressor,X_test,y_test)
    return r2

In [15]:
def rfeFeature(indep_X, dep_Y, n):
    rfelist = []
    colnames_list = []  
    # List to store column names for each model
    r2_values = []  
    # List to store R2 values for each model

    from sklearn.linear_model import LinearRegression
    lin = LinearRegression()

    from sklearn.tree import DecisionTreeRegressor
    dec = DecisionTreeRegressor(random_state=0)

    from sklearn.ensemble import RandomForestRegressor
    rf = RandomForestRegressor(n_estimators=10, random_state=0)

    from xgboost import XGBRegressor
    xgb = XGBRegressor(n_jobs=5, learning_rate=0.1, max_depth=10, random_state=1)

    rfemodellist = [lin, dec, rf, xgb]

    for model in rfemodellist:
        log_rfe = RFE(estimator=model, n_features_to_select=n)
        log_fit = log_rfe.fit(indep_X, dep_Y)
        log_rfe_feature = log_fit.transform(indep_X)
        rfelist.append(log_rfe_feature)

        # Get the column names selected by RFE
        selected_columns = [col for col, selected in zip(indep_X.columns, log_rfe.support_) if selected]
        colnames_list.append(selected_columns)

        # Fit the model and calculate and store the R2 value
        X_train, X_test, y_train, y_test = split_scalar(pd.DataFrame(log_rfe_feature), dep_Y)
        model.fit(X_train, y_train)  # Fit the model
        r2 = r2_prediction(model, X_test, y_test)
        r2_values.append(r2)

    return rfelist, colnames_list, r2_values

# Call the function with your data
rfelist, colnames_list, r2_values = rfeFeature(indep_X, dep_Y, 5)

# Print the selected column names and R2 values for each model
for model_name, selected_columns, r2_value in zip(["Linear", "Decision", "Random", "XGBoost"], colnames_list, r2_values):
    print(f"Model: {model_name}")
    print("Selected Columns:", selected_columns)
    print(f"R2 Value: {r2_value}\n")

Model: Linear
Selected Columns: ['PollutionExposure', 'FamilyHistoryAsthma', 'Wheezing', 'ChestTightness', 'Diagnosis']
R2 Value: 1.0

Model: Decision
Selected Columns: ['Wheezing', 'ShortnessOfBreath', 'ChestTightness', 'Coughing', 'Diagnosis']
R2 Value: 1.0

Model: Random
Selected Columns: ['Wheezing', 'ShortnessOfBreath', 'ChestTightness', 'Coughing', 'Diagnosis']
R2 Value: 1.0

Model: XGBoost
Selected Columns: ['Wheezing', 'ShortnessOfBreath', 'ChestTightness', 'Coughing', 'Diagnosis']
R2 Value: 0.999999986390663



# Model creation

In [16]:
import pickle
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# Assuming indep_X and dep_Y are predefined
selected_features = indep_X
target_variable = dep_Y

# Assuming indep_X and dep_Y are defined as your features and target variable
X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)

param_grid = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],'C': [0.1, 1, 10, 100]}

grid = GridSearchCV(SVR(), param_grid, refit=True, verbose=3, n_jobs=1)
grid.fit(X_train, y_train)

y_predict = grid.predict(X_test)

# Evaluate using classification metrics
#f1_macro = f1_score(y_test, y_predict, average='weighted')
#clf_report = classification_report(y_test, y_predict)

#print("Best parameters found by GridSearchCV:", grid.best_params_)
## Evaluate using regression metrics
mse = mean_squared_error(y_test, y_predict)
r2 = r2_score(y_test, y_predict)

print("Best parameters found by GridSearchCV:", grid.best_params_)
print("Mean Squared Error (MSE) on test set:", mse)
print("R-squared (R2) on test set:", r2)
#print("R-squared (R2) on test set:", r2)
#print("F1 macro score on test set:", f1_macro)
#print("Confusion Matrix:\n", cm)
#print("Classification Report:\n", clf_report)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END ..............C=0.1, kernel=linear;, score=0.840 total time=   0.0s
[CV 2/5] END ..............C=0.1, kernel=linear;, score=0.734 total time=   0.0s
[CV 3/5] END ..............C=0.1, kernel=linear;, score=0.819 total time=   0.0s
[CV 4/5] END ..............C=0.1, kernel=linear;, score=0.791 total time=   0.0s
[CV 5/5] END ..............C=0.1, kernel=linear;, score=0.811 total time=   0.0s
[CV 1/5] END ................C=0.1, kernel=poly;, score=0.002 total time=   0.0s
[CV 2/5] END ...............C=0.1, kernel=poly;, score=-0.069 total time=   0.0s
[CV 3/5] END ...............C=0.1, kernel=poly;, score=-0.011 total time=   0.0s
[CV 4/5] END ...............C=0.1, kernel=poly;, score=-0.036 total time=   0.0s
[CV 5/5] END ...............C=0.1, kernel=poly;, score=-0.009 total time=   0.0s
[CV 1/5] END ................C=0.1, kernel=rbf;, score=-0.010 total time=   0.0s
[CV 2/5] END ................C=0.1, kernel=rbf;,

Results:
Best parameters found by GridSearchCV: {'C': 100, 'kernel': 'rbf'}
Mean Squared Error (MSE) on test set: 0.006625519027861686
R-squared (R2) on test set: 0.8463182132425602

In [17]:
# Define your results as dictionaries
results = {
    "Parameter": ["C", "Kernel", "Mean Squared Error (MSE)", "R-squared (R2)"],
    "Value": [100, 'rbf', 0.006625519027861686, 0.8463182132425602]
}

# Create a DataFrame
results_df = pd.DataFrame(results)

# Print the DataFrame
print(results_df)

                  Parameter     Value
0                         C       100
1                    Kernel       rbf
2  Mean Squared Error (MSE)  0.006626
3            R-squared (R2)  0.846318


In [18]:
# Save the best model
filename = "SVR_Asthma_finalmodel.sav"
pickle.dump(grid, open(filename, 'wb'))

In [19]:
# Load the saved model
loaded_model = pickle.load(open(filename, 'rb'))

In [21]:
def get_reg_prediction(prediction):
    return "Yes" if prediction == 1 else "No"

# Example input values (replace these with your actual input mechanism)
Age= float(input("Age: "))
Gender = float(input("Gender (0 for Female, 1 for Male): "))
#Ethnicity=float(input("Ethnicity: "))
#EducationLevel=float(input("EducationLevel: "))
BMI=float(input("BMI: "))
Smoking = float(input("Smoking (0 or 1): "))
#PhysicalActivity=float(input("PhysicalActivity: "))
#DietQuality=float(input("DietQuality: "))
#SleepQuality=float(input("SleepQuality: "))
PollutionExposure=float(input("PollutionExposure: "))
#PollenExposure=float(input("PollenExposure: "))
DustExposure=float(input("DustExposure: "))
PetAllergy=float(input("PetAllergy: "))
FamilyHistoryAsthma = float(input("FamilyHistoryAsthma: "))
#HistoryOfAllergies=float(input("HistoryOfAllergies: "))
#Eczema=float(input("Eczema: "))
#HayFever=float(input("HayFever: "))
#GastroesophagealReflux=float(input("GastroesophagealReflux: "))
LungFunctionFEV1 = float(input("LungFunctionFEV1 (LF): "))
#LungFunctionFVC=float(input("LungFunctionFVC: "))
Wheezing=float(input("Wheezing:"))
ShortnessOfBreath=float(input("ShortnessOfBreath:"))
ChestTightness=float(input("ChestTightness: "))
Coughing=float(input("Coughing: "))
#NighttimeSymptoms=float(input("NighttimeSymptoms: "))
#ExerciseInduced=float(input("ExerciseInduced: "))
Diagnosis=float(input("Diagnosis: "))
    
# Assume you have loaded_model and it's already trained and loaded

# Predict the numerical outcome related to asthma
#future_prediction = loaded_model.predict([[Age,Gender,BMI,Smoking,PhysicalActivity,DietQuality,
                                           #SleepQuality,PollutionExposure,PollenExposure,DustExposure,PetAllergy,
                                           #FamilyHistoryAsthma,HistoryOfAllergies,Eczema,HayFever,GastroesophagealReflux,
                                           #LungFunctionFEV1,LungFunctionFVC,Wheezing,ShortnessOfBreath,ChestTightness, 
                                           #Coughing,NighttimeSymptoms,ExerciseInduced,Diagnosis]])
future_prediction = loaded_model.predict([[Age,Gender,BMI,Smoking,
                                           PollutionExposure,DustExposure,PetAllergy,FamilyHistoryAsthma,
                                           LungFunctionFEV1,Wheezing,ShortnessOfBreath,ChestTightness,Coughing,Diagnosis]])


Age: 55
Gender (0 for Female, 1 for Male): 1
BMI: 35.4
Smoking (0 or 1): 1
PollutionExposure: 2.8
DustExposure: 1.6
PetAllergy: 0
FamilyHistoryAsthma: 1
LungFunctionFEV1 (LF): 1.5
Wheezing:1
ShortnessOfBreath:0
ChestTightness: 1
Coughing: 1
Diagnosis: 1


  "X does not have valid feature names, but"


In [22]:
# Print or use the numerical prediction directly
print("Predicted outcome related to asthma:", (future_prediction[0])

Predicted outcome related to asthma: 0.9287762794369162


In [24]:
# Print the categorical prediction
future_prediction_reg=get_reg_prediction(future_prediction[0])
print("Future Prediction (Treatment Required):",future_prediction_reg)

Future Prediction (Treatment Required): No
