In [37]:
# import statements
import pandas as pd
from datetime import time
import numpy as np

In [38]:
# Load the patient metadata table and the three smartwatch exports.
patient_info = pd.read_csv('AsthmaFiles/patient_info.csv')
smartwatch1, smartwatch2, smartwatch3 = [
    pd.read_csv(csv_path)
    for csv_path in (
        'AsthmaFiles/smartwatch1.csv',
        'AsthmaFiles/smartwatch2.csv',
        'AsthmaFiles/smartwatch3.csv',
    )
]

In [39]:
# Select the users with enough data to be worth modelling.
# A user is kept when both conditions hold:
#   * the peak-flow recording window (pef_end_date - pef_start_date) spans at
#     least MIN_PEF_SPAN units -- presumably days; TODO confirm the unit of the
#     date columns in patient_info.csv
#   * a smartwatch end date is present (miband_end_date is not missing)
MIN_PEF_SPAN = 50

pef_span = patient_info["pef_end_date"] - patient_info["pef_start_date"]
# Missing CSV cells are loaded by pandas as float NaN, so comparing the value
# against the string "NaN" is always True and filters nothing; notna() is the
# check that actually detects a missing end date.
has_smartwatch_data = patient_info["miband_end_date"].notna()

# holds the list of user keys that satisfy both criteria (later cells only use
# each key for equality filtering and via int(key) in file names)
list_of_user_keys = patient_info.loc[
    (pef_span >= MIN_PEF_SPAN) & has_smartwatch_data, "user_key"
].tolist()
list_of_user_keys

[190.0, 294.0, 343.0, 447.0, 473.0, 514.0, 625.0, 702.0, 808.0, 939.0]

In [40]:
#Loads the peak-flow log and returns the readings that belong to one user
def get_peakflow_data(user_key):
    """Return the ``date``, ``hour`` and ``pef_max`` columns for ``user_key``.

    The whole of ``AsthmaFiles/peakflow.csv`` is read on every call and then
    filtered to the rows whose ``user_key`` column equals the argument.
    """
    all_readings = pd.read_csv("AsthmaFiles/peakflow.csv")
    belongs_to_user = all_readings["user_key"]==user_key
    return all_readings[belongs_to_user][["date","hour","pef_max"]]


In [41]:
#Gets environmental data for a user key, restricted to the dates on which their peak flow was recorded
def pair_weather(id,dates):
    """Return the environment rows for user ``id`` whose ``date`` is in ``dates``.

    ``AsthmaFiles/environment.csv`` is read in full on every call; the rows are
    then narrowed by ``user_key`` and by membership of ``date`` in ``dates``.
    """
    environment = pd.read_csv("AsthmaFiles/environment.csv")
    is_this_user = environment['user_key'] == id
    is_wanted_date = environment['date'].isin(dates)
    return environment.loc[is_this_user & is_wanted_date]

In [42]:
#Safe lookup of one weather value for a row's date; a default is returned when no weather row matches
def try_catch(row,default,weather,x):
    """Return column ``x`` of the first ``weather`` row dated ``row["date"]``.

    When no weather row has that date, the positional lookup raises
    ``IndexError``; it is caught and ``default`` is returned instead.
    """
    try:
        same_date = weather['date'] == row["date"]
        first_match = weather.loc[same_date].iloc[0]
        return first_match[x]
    except IndexError:
        return default

In [43]:
#Separates out one user's heart-rate rows from the 3 smartwatch files (scanning ~2.5 million datapoints is why it takes a while)
def seperate_for_key(user_key,smartwatch1,smartwatch2,smartwatch3):
    """Return the ``date``, ``time`` and ``hr`` columns of every row for ``user_key``.

    Each of the three smartwatch frames is filtered on its ``user_key`` column
    and the three results are concatenated in argument order, keeping each
    frame's original index labels.
    """
    wanted_columns = ["date","time","hr"]
    per_file = [
        watch.loc[watch["user_key"]==user_key][wanted_columns]
        for watch in (smartwatch1, smartwatch2, smartwatch3)
    ]
    return pd.concat(per_file)

#Narrows a user's heart-rate rows down to one specific date
def seperate_for_date(date,seperate_for_key):
    """Return the ``time`` and ``hr`` columns of the rows whose ``date`` equals ``date``."""
    on_requested_date = seperate_for_key["date"]==date
    rows_for_date = seperate_for_key.loc[on_requested_date]
    return rows_for_date[["time","hr"]]

#Finds the highest heart rate among the rows whose time string starts with the given hour
def seperate_for_time(time,seperate_for_date):
    """Return the maximum ``hr`` of the rows whose ``time`` begins with ``time``.

    ``time`` is converted to text and, when that text is a single character,
    prefixed with ``"0"`` before being used as the ``startswith`` prefix.
    """
    prefix = str(time)
    if len(prefix) == 1:
        prefix = "0" + prefix
    in_requested_hour = seperate_for_date["time"].str.startswith(prefix)
    return seperate_for_date.loc[in_requested_hour]["hr"].max()

#Row callback for DataFrame.apply: looks up the heart-rate value for a peak-flow row's date and hour
def lambda_method(row,key,sm1,sm2,sm3):
    """Return the maximum heart rate for ``key`` at ``row["date"]`` / ``row["hour"]``.

    The three smartwatch frames are filtered to the user, then to the row's
    date, and finally to the row's hour, by chaining the helper functions.
    """
    rows_for_user = seperate_for_key(key,sm1,sm2,sm3)
    return seperate_for_time(
        row["hour"],
        seperate_for_date(row["date"],rows_for_user),
    )



In [44]:
# placeholder stored in a weather column when no environment row matches the date
default = "NA"
# traverse through all viable users
for user_key in list_of_user_keys:
    # gets peak flow data for user; .copy() makes the frame independent of the
    # filtered selection it came from before new columns are assigned to it
    peak_flow_data = get_peakflow_data(user_key).copy()
    # pairs weather data for user based on dates peak flow was recorded
    weather = pair_weather(user_key,peak_flow_data["date"])
    # adds weather data to peak flow data, one column at a time
    for column in list(weather.columns):
        # skips columns the peak flow data already has (e.g. "date")
        if column not in peak_flow_data.columns:
            # the lambda is evaluated immediately by apply, so it always sees
            # the current value of `column`
            peak_flow_data[column] = peak_flow_data.apply(lambda row: try_catch(row,default,weather,column), axis = 1)
    # Filter the three smartwatch tables down to this user ONCE. Doing this
    # inside the per-row callback (as lambda_method does) repeats the same full
    # scan for every peak-flow row even though the result only depends on the user.
    hr_for_user = seperate_for_key(user_key,smartwatch1,smartwatch2,smartwatch3)
    # adds heart rate data to peak flow data: max hr within the row's date and hour
    peak_flow_data["hr"] = peak_flow_data.apply(
        lambda row: seperate_for_time(row["hour"], seperate_for_date(row["date"], hr_for_user)),
        axis = 1,
    )
    # saves data to csv file according to user key
    peak_flow_data.to_csv(f"AsthmaFiles/{int(user_key)}.csv",index=False)

In [45]:
# cleans data by removing rows with NaN or infinite values
def clean_dataset(df):
    """Return a float64 copy of ``df`` without rows containing NaN or +/-inf.

    The input frame is left untouched: rows are dropped on a new frame rather
    than in place, so a caller that still holds ``df`` keeps its original rows.

    Raises ``AssertionError`` when ``df`` is not a ``pd.DataFrame``; the
    ``astype`` conversion raises if a remaining column cannot be cast to float.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    without_missing = df.dropna()
    # dropna() does not treat infinities as missing, so they are screened separately
    has_bad_value = without_missing.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return without_missing[~has_bad_value].astype(np.float64)

# reads every per-user csv file and merges them into one modelling table
def read_all_and_merge():
    """Return the cleaned, per-user-normalised rows of all users in one frame.

    For each key in the module-level ``list_of_user_keys`` the matching
    ``AsthmaFiles/<key>.csv`` is read, the pollen columns are removed, the rows
    are cleaned with ``clean_dataset`` and ``pef_max`` is divided by that user's
    own maximum so readings are comparable between users. Columns not used as
    model inputs are then dropped.

    A user whose file has no rows left after cleaning is skipped instead of
    failing on the conversion of a NaN maximum. An empty ``DataFrame`` is
    returned when no user contributes any rows.
    """
    pollen_columns = ["weed_pollen","tree_pollen","grass_pollen"]
    unused_columns = ["date","hour","user_key","co","no","no2","o3","so2","nh3","pressure","pm2_5","pm10"]

    per_user_frames = []
    for i in list_of_user_keys:
        dataset = pd.read_csv(f"AsthmaFiles/{int(i)}.csv")
        # the pollen columns are removed before cleaning, so missing values in
        # them cannot cause rows to be discarded
        dataset = clean_dataset(dataset.drop(pollen_columns,axis=1))
        if dataset.empty:
            # nothing usable for this user; max() would be NaN and int(NaN) raises
            continue
        max_for_user = dataset["pef_max"].max()
        # NOTE(review): the maximum is truncated to an integer before dividing,
        # as in the original code -- confirm whether truncation is intended
        dataset = dataset.assign(pef_max=dataset["pef_max"] / int(max_for_user))
        per_user_frames.append(dataset.drop(unused_columns,axis=1))

    if not per_user_frames:
        return pd.DataFrame()
    # one concat at the end instead of re-copying the accumulated frame per user
    return pd.concat(per_user_frames)

# Build the merged table and write it to disk (target column included),
# then move the target column out of the table so that y holds the
# normalised pef_max values and x holds only the feature columns.
x = read_all_and_merge()
x.to_csv("AsthmaFiles/merged.csv",index=False)
y = x.pop("pef_max")

In [46]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=83,
)


In [47]:
# Display the held-out feature rows produced by the train/test split.
x_test

Unnamed: 0,temperature,temperature_min,temperature_max,humidity,wind_speed,wind_deg,aqi,hr
88,11.72,10.44,13.21,81.0,9.70,193.0,1.0,87.0
161,8.24,6.62,9.84,87.0,6.69,34.0,2.0,87.0
14,15.89,13.98,18.60,78.0,0.01,243.0,1.0,74.0
61,6.64,3.05,11.55,93.0,3.65,272.0,1.0,99.0
122,13.69,12.72,15.48,86.0,2.74,207.0,1.0,95.0
...,...,...,...,...,...,...,...,...
4,20.93,18.17,23.06,67.0,2.06,320.0,2.0,93.0
44,8.77,7.78,9.87,96.0,1.30,201.0,1.0,101.0
22,17.12,16.07,18.05,85.0,3.26,88.0,2.0,115.0
6,15.27,14.14,15.90,85.0,2.72,127.0,1.0,90.0


In [48]:
# Fit a linear regression that predicts the normalised pef_max (y_train)
# from the feature columns (x_train).
from sklearn.linear_model import LinearRegression
model = LinearRegression()
# fit() is left as the cell's last expression, so its return value is displayed
model.fit(x_train,y_train)


In [49]:

# Predict the normalised pef_max for the held-out rows and display the array.
y_pred = model.predict(x_test)
y_pred

array([0.75033577, 0.75501431, 0.84988594, 0.75709702, 0.81256831,
       0.76165962, 0.73083954, 0.82909326, 0.87245343, 0.79584571,
       0.83628913, 0.81728749, 0.74872127, 0.77585257, 0.85019063,
       0.76855115, 0.8166401 , 0.77159752, 0.80107322, 0.78801668,
       0.78104155, 0.76917888, 0.78804431, 0.80564301, 0.80714207,
       0.79977934, 0.75226341, 0.76401842, 0.80285174, 0.81558725,
       0.7543581 , 0.75995545, 0.79983422, 0.69433883, 0.81481897,
       0.80784204, 0.78932795, 0.80896833, 0.82710697, 0.77057405,
       0.79157231, 0.78154018, 0.79461668, 0.69085096, 0.80561584,
       0.89642323, 0.77191662, 0.80260885, 0.7844903 , 0.75420123,
       0.73499081, 0.83570187, 0.81688941, 0.79536335, 0.76213223,
       0.84340178, 0.72128486, 0.83596348, 0.77584495, 0.72905893,
       0.77416042, 0.84459697, 0.78332476, 0.82510163, 0.81101646,
       0.68929903, 0.74718432, 0.83013177, 0.831612  , 0.83085147,
       0.76372248, 0.82699493, 0.82368653, 0.81372596, 0.77384

In [50]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the held-out predictions (y_pred) against the true values (y_test).
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above
r2 = r2_score(y_test, y_pred)

# Report every metric on its own line.
for metric_label, metric_value in (
    ("Mean Absolute Error", mae),
    ("Mean Squared Error", mse),
    ("Root Mean Squared Error", rmse),
    ("R-squared (R^2) Score", r2),
):
    print(f'{metric_label}: {metric_value}')



Mean Absolute Error: 0.07652298001461354
Mean Squared Error: 0.009208251469389743
Root Mean Squared Error: 0.09595963458345255
R-squared (R^2) Score: 0.09525702956925175


In [51]:
# Serialise the fitted model to 'asthma_model.joblib' so it can be loaded
# later without retraining; dump() returns the list of written file names,
# which is what the cell displays.
import joblib
joblib.dump(model, 'asthma_model.joblib')

['asthma_model.joblib']