In [2]:
# import statements
import pandas as pd
from datetime import time
import numpy as np

In [3]:
# defining data for each csv file
patient_info=pd.read_csv('AsthmaFiles/patient_info.csv')
smartwatch1=pd.read_csv('AsthmaFiles/smartwatch1.csv')
smartwatch2=pd.read_csv('AsthmaFiles/smartwatch2.csv')
smartwatch3=pd.read_csv('AsthmaFiles/smartwatch3.csv')

In [4]:
#holds the list of user keys
list_of_user_keys=[]
#Cleaning data by removing patients with insignificant data
for x in patient_info[["user_key"]+list(patient_info)[-10:-2]].iterrows():
    if x[1]["pef_end_date"] - x[1]["pef_start_date"] >= 50 and x[1]["miband_end_date"]!="NaN":
        # if user fits significant data criteria, add user key to list
        list_of_user_keys.append(x[1]["user_key"])
list_of_user_keys

[190.0, 294.0, 343.0, 447.0, 473.0, 514.0, 625.0, 702.0, 808.0, 939.0]

In [5]:
#Gets peakflow data related to user key
def get_peakflow_data(user_key):
    peakflow_data = pd.read_csv("AsthmaFiles/peakflow.csv")
    peakflow_data = peakflow_data[peakflow_data["user_key"]==user_key]
    peakflow_data = peakflow_data[["date","hour","pef_max"]]
    return peakflow_data


In [6]:
#Gets Enviornmental data related to user key and dates their peakflow was recorded
def pair_weather(id,dates):
    weather = pd.read_csv("AsthmaFiles/environment.csv")
    for_id = weather.loc[weather['user_key'] == id]
    weather = for_id.loc[for_id['date'].isin(dates)]

    return weather

In [7]:
#try_catch to ensure no index errors crash the program and when they did a default value will be returned
def try_catch(row,default,weather,x):
    try:
        return weather.loc[weather['date'] == row["date"]].iloc[0][x]
    except IndexError:
        return default

In [8]:
#sepereates data for each user key among the 3 smartwatch files (sorting through 2.5 million datapoints is why it takes a while)
def seperate_for_key(user_key,smartwatch1,smartwatch2,smartwatch3):
    seperate_for_key1 = smartwatch1.loc[smartwatch1["user_key"]==user_key][["date","time","hr"]]
    seperate_for_key2 = smartwatch2.loc[smartwatch2["user_key"]==user_key][["date","time","hr"]]
    seperate_for_key3 = smartwatch3.loc[smartwatch3["user_key"]==user_key][["date","time","hr"]]
    frames = [seperate_for_key1, seperate_for_key2, seperate_for_key3]
    return pd.concat(frames)

#Parses through data for specific date given
def seperate_for_date(date,seperate_for_key):
    return seperate_for_key.loc[seperate_for_key["date"]==date][["time","hr"]]

#parse through data for specific time given
def seperate_for_time(time,seperate_for_date):
    if len(str(time)) == 1:
        time = "0" + str(time)
    return seperate_for_date.loc[seperate_for_date["time"].str.startswith(str(time))]["hr"].max()

#lambda method to be used in apply method that will set the value of rows in the peakflow dataframe based on date and time ranges
def lambda_method(row,key,sm1,sm2,sm3):
    sk = seperate_for_key(key,sm1,sm2,sm3)
    sd = seperate_for_date(row["date"],sk)
    return seperate_for_time(row["hour"],sd)



In [9]:
# default value is set to NA or empty in other words
default = "NA"
# traverse through all viable users
for i in list_of_user_keys:
    # gets peak flow data for user
    peak_flow_data = get_peakflow_data(i)
    # pairs weather data for user based on dates peak flow was recorded
    weather = pair_weather(i,peak_flow_data["date"])
    # adds weather data to peak flow data
    for x in list(weather.columns):
        #skips repeating columns
        if x not in peak_flow_data.columns:
            # appends rows to peak flow data
            peak_flow_data[x] = peak_flow_data.apply(lambda row: try_catch(row,default,weather,x), axis = 1)
    # adds heart rate data to peak flow data
    peak_flow_data["hr"] = peak_flow_data.apply(lambda row: lambda_method(row,i,smartwatch1,smartwatch2,smartwatch3), axis = 1)
    # saves data to csv file according to user key
    peak_flow_data.to_csv(f"AsthmaFiles/{int(i)}.csv",index=False)

In [10]:
# cleans data by removing rows with NaN values
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[indices_to_keep].astype(np.float64)

# reads all csv files and merges them into one to parse through
def read_all_and_merge():
    x = pd.DataFrame()
    for i in list_of_user_keys:
        user_info= pd.read_csv(f"AsthmaFiles/patient_info.csv")
        dataset = pd.read_csv(f"AsthmaFiles/{int(i)}.csv")
        dataset = dataset.drop(["weed_pollen","tree_pollen","grass_pollen"],axis=1)
        dataset = clean_dataset(dataset)
        ex_for_user = user_info.loc[user_info["user_key"]==i]["max_pef_expected"].iloc[0]
        max_for_user = dataset["pef_max"].max()
        dataset["pef_max"] = dataset["pef_max"].apply(lambda x: x/int(ex_for_user))
        dataset = dataset.drop(["date","user_key","no"],axis=1)
        x = pd.concat([x,dataset])

    return x

# saves merged data to csv file
x = read_all_and_merge()
x.to_csv("AsthmaFiles/merged.csv",index=False)
y = x["pef_max"]
x = x.drop(["pef_max"],axis=1)


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
count = 0
sum=0
rmsesum=0
for i in range(0,1000):
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=i)
    model = LinearRegression()
    model.fit(x_train,y_train)
    y_pred = model.predict(x_test)
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    # Assuming you have y_pred and y_test as NumPy arrays or Pandas Series
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # print(f'Mean Absolute Error: {mae}')
    # print(f'Mean Squared Error: {mse}')
    # print(f'Root Mean Squared Error: {rmse}')
    if r2 > .4:
        count+=1
    sum+=r2
    rmsesum+=mse
    # print(f'R-squared (R^2) Score: {r2}')
print(count)
print(sum/1000)
print(rmsesum/1000)



4
0.32020797938095946
0.020879608675146183


In [12]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x_train,y_train)


In [13]:

y_pred = model.predict(x_test)
y_pred

array([0.61762476, 0.80302619, 0.62527852, 0.77320301, 0.76113626,
       0.74229623, 0.83859294, 0.6779068 , 0.71968808, 0.83048583,
       0.79876329, 0.77564022, 0.71726375, 0.83573965, 0.64811305,
       0.71433108, 0.90764788, 0.7790809 , 0.69204145, 0.91870132,
       0.70692195, 0.66134942, 0.89732907, 0.90031403, 0.73853525,
       0.79903315, 0.96511499, 0.73331631, 0.82178252, 0.65385588,
       0.97809578, 0.86642223, 0.80721661, 0.68548559, 0.68239394,
       0.5982147 , 0.60904087, 0.86967792, 0.82042763, 0.68382233,
       0.79955133, 0.80627298, 0.8626572 , 0.58488416, 0.84766703,
       0.70057144, 0.69062053, 0.8377277 , 1.07266012, 0.83716335,
       0.89631097, 0.692477  , 0.68034572, 0.83359794, 0.88670702,
       0.64945643, 0.75115092, 0.73624341, 0.78669263, 0.84295113,
       0.72941212, 0.65682314, 0.88679725, 0.78611866, 0.61335294,
       0.72776422, 0.72522248, 0.68003229, 0.77825202, 0.94448613,
       0.64139709, 0.82756546, 0.70547824, 0.92128168, 0.62376

In [14]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Assuming you have y_pred and y_test as NumPy arrays or Pandas Series
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared (R^2) Score: {r2}')



Mean Absolute Error: 0.11665949089277951
Mean Squared Error: 0.0197223705482071
Root Mean Squared Error: 0.14043635764362125
R-squared (R^2) Score: 0.3184409457841254


In [15]:
import joblib
joblib.dump(model, 'asthma_model.joblib')

['asthma_model.joblib']