# Data Collection

In [7]:
import pandas as pd

# Read the labelled training set from its CSV file into a DataFrame
train_csv_path = '/content/drive/MyDrive/Colab Notebooks/Fiverr/edcogan/training_data.csv'
train_data = pd.read_csv(train_csv_path)

# Bare last expression so the notebook renders the loaded frame
train_data

Unnamed: 0.1,Unnamed: 0,gender,age,bmi,systolic_bp,diastolic_bp,exercise_frequency,smoker,family_history,diet_quality,us_state,shoe_size,cholesterol_level,has_disease
0,746,Female,63,26.051019,96.865801,74.879872,Regularly,0.0,0.0,Poor,PA,5.560992,208.921743,1
1,131,Male,51,28.999501,115.878612,113.660525,Regularly,1.0,0.0,Average,PA,12.098365,246.817639,1
2,1711,Female,76,21.135693,144.846268,132.404670,Rarely,0.0,0.0,Poor,FL,6.751833,232.486516,1
3,1534,Male,77,21.610457,,128.789122,Rarely,0.0,0.0,,MI,11.893455,273.670761,1
4,390,Female,68,22.311965,163.945914,84.559669,Frequently,1.0,0.0,Average,NY,6.983366,208.985119,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1269,Male,75,16.856116,,88.113863,Frequently,0.0,0.0,Average,NY,8.210186,124.637539,0
1996,1058,Female,25,26.081882,145.763525,107.253516,Never,1.0,0.0,Good,CA,11.920220,141.207955,0
1997,832,Male,30,23.937841,200.322362,,Regularly,1.0,0.0,Good,GA,10.852045,298.462605,0
1998,1129,Female,39,17.024249,,64.773888,Rarely,0.0,0.0,Average,GA,9.362450,168.020536,0


In [8]:
# Drop the first column ('Unnamed: 0.1'), which is irrelevant to the model.
# It is dropped by name rather than by position, and a missing label is
# ignored, so re-running this cell is a no-op instead of silently removing
# the next column (a real feature) each time.
# NOTE(review): 'Unnamed: 0' is still kept and therefore ends up as a model
# feature, while the test-data cell drops two leading columns - confirm the
# train and test feature sets are meant to line up.
train_data = train_data.drop(columns=['Unnamed: 0.1'], errors='ignore')
train_data

Unnamed: 0,gender,age,bmi,systolic_bp,diastolic_bp,exercise_frequency,smoker,family_history,diet_quality,us_state,shoe_size,cholesterol_level,has_disease
0,Female,63,26.051019,96.865801,74.879872,Regularly,0.0,0.0,Poor,PA,5.560992,208.921743,1
1,Male,51,28.999501,115.878612,113.660525,Regularly,1.0,0.0,Average,PA,12.098365,246.817639,1
2,Female,76,21.135693,144.846268,132.404670,Rarely,0.0,0.0,Poor,FL,6.751833,232.486516,1
3,Male,77,21.610457,,128.789122,Rarely,0.0,0.0,,MI,11.893455,273.670761,1
4,Female,68,22.311965,163.945914,84.559669,Frequently,1.0,0.0,Average,NY,6.983366,208.985119,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,Male,75,16.856116,,88.113863,Frequently,0.0,0.0,Average,NY,8.210186,124.637539,0
1996,Female,25,26.081882,145.763525,107.253516,Never,1.0,0.0,Good,CA,11.920220,141.207955,0
1997,Male,30,23.937841,200.322362,,Regularly,1.0,0.0,Good,GA,10.852045,298.462605,0
1998,Female,39,17.024249,,64.773888,Rarely,0.0,0.0,Average,GA,9.362450,168.020536,0


# Preprocessing Function

In [9]:
from sklearn.preprocessing import StandardScaler

def preprocess_data(data, target_col='has_disease'):
    """Impute, standardize and one-hot encode a raw feature table.

    Steps, in order:
      1. Numerical feature columns (float64/int64): missing values are
         replaced with the column mean.
      2. Categorical feature columns (object dtype): missing values are
         replaced with the column mode.
      3. Numerical feature columns are standardized with ``StandardScaler``
         (fitted on the frame passed in).
      4. Categorical feature columns are one-hot encoded with
         ``pd.get_dummies``.

    The input frame is left untouched; a processed copy is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table. It may or may not contain ``target_col``.
    target_col : str or None, default 'has_disease'
        Label column to pass through unchanged. Without this exclusion an
        integer 0/1 label would be mean-imputed and standardized like any
        other numeric column, turning class labels into arbitrary floats.
        Pass ``None`` to treat every column as a feature.

    Returns
    -------
    pandas.DataFrame
        Processed copy of ``data``.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    data = data.copy()

    numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
    categorical_cols = data.select_dtypes(include=['object']).columns
    if target_col is not None:
        # The label is not a feature: keep it out of imputation, scaling
        # and encoding. errors='ignore' covers frames without a label column.
        numerical_cols = numerical_cols.drop(target_col, errors='ignore')
        categorical_cols = categorical_cols.drop(target_col, errors='ignore')

    # Replace missing numerical values with the mean
    if len(numerical_cols) > 0:
        data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].mean())

    # Replace missing categorical values with the mode
    if len(categorical_cols) > 0:
        modes = data[categorical_cols].mode()
        # mode() has no rows when there is nothing to count (e.g. empty frame),
        # in which case there is nothing to fill with.
        if not modes.empty:
            data[categorical_cols] = data[categorical_cols].fillna(modes.iloc[0])

    # Standardize numerical features (skipped when there is nothing to scale,
    # because the scaler rejects input with zero columns or zero rows)
    if len(numerical_cols) > 0 and len(data) > 0:
        scaler = StandardScaler()
        data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

    # Perform one-hot encoding for categorical features
    data = pd.get_dummies(data, columns=categorical_cols)

    return data


In [10]:
# Run the preprocessing function over the training frame and rebind the name
train_data = train_data.pipe(preprocess_data)

# Preview the first five rows of the processed frame
train_data.head(5)

Unnamed: 0,age,bmi,systolic_bp,diastolic_bp,smoker,family_history,shoe_size,cholesterol_level,has_disease,gender_Female,...,us_state_CA,us_state_FL,us_state_GA,us_state_IL,us_state_MI,us_state_NC,us_state_NY,us_state_OH,us_state_PA,us_state_TX
0,0.778453,-0.05035,-1.24212,-0.737216,-1.038043,-1.045957,-1.420799,0.14494,1.0,1,...,0,0,0,0,0,0,0,0,1,0
1,0.117837,0.455689,-0.625599,1.162718,1.074569,-1.045957,1.213111,0.773448,1.0,0,...,0,0,0,0,0,0,0,0,1,0
2,1.49412,-0.893953,0.313724,2.081027,-1.038043,-1.045957,-0.941009,0.535765,1.0,1,...,0,1,0,0,0,0,0,0,0,0
3,1.549171,-0.812471,0.0,1.903895,-1.038043,-1.045957,1.130552,1.218811,1.0,0,...,0,0,0,0,1,0,0,0,0,0
4,1.053709,-0.692073,0.933061,-0.262985,1.074569,-1.045957,-0.847724,0.145991,1.0,1,...,0,0,0,0,0,0,1,0,0,0




**Logistic Regression**

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Split the data into features and target
X = train_data.drop('has_disease', axis=1)
# Binarize the target defensively: has_disease may arrive as floats if it was
# standardized together with the numeric features, and classifiers need
# discrete class labels. Thresholding at 0 maps both a standardized label
# (negative / positive) and a raw 0/1 label to integer 0/1.
y = (train_data['has_disease'] > 0).astype(int)

# Split the data into training and validation sets (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression classifier
clf = LogisticRegression()

# Fit the model on the training data
clf.fit(X_train, y_train)

# Make predictions on the validation set
val_predictions = clf.predict(X_val)

# Calculate the F1 score (positive class is label 1)
f1 = f1_score(y_val, val_predictions)

print("Logistic Regression F1 Score:", f1)


Logistic Regression F1 Score: 0.8379052369077307


**SGDClassifier**

In [17]:
# Train an SGDClassifier on the same train/validation split as the Logistic Regression model above

from sklearn.linear_model import SGDClassifier

# Initialize the SGDClassifier model.
# SGD shuffles the training data between epochs, so without a fixed seed the
# reported F1 score changes from run to run; 42 matches the split seed.
clf = SGDClassifier(random_state=42)

# Fit the model on the training data
clf.fit(X_train, y_train)

# Make predictions on the validation set
val_predictions = clf.predict(X_val)

# Calculate the F1 score
f1 = f1_score(y_val, val_predictions)

print("SGDClassifier F1 Score:", f1)


SGDClassifier F1 Score: 0.7939698492462312


**Random Forest**

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest classifier.
# Bootstrapping and feature sub-sampling are random, so a fixed seed is needed
# for the reported F1 score to be reproducible; 42 matches the split seed.
clf = RandomForestClassifier(random_state=42)

# Fit the model on the training data
clf.fit(X_train, y_train)

# Make predictions on the validation set
val_predictions = clf.predict(X_val)

# Calculate the F1 score
f1 = f1_score(y_val, val_predictions)

print("Random Forest F1 Score:", f1)

Random ForestF1 Score: 0.8701923076923077


In [16]:
# Load the test data
test_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Fiverr/edcogan/test_data.csv')

# Drop the two leading columns (by position) without mutating in place
test_data = test_data.drop(columns=test_data.columns[[0, 1]])

# Preprocess the test data
# NOTE(review): preprocess_data fits a new scaler on whatever frame it is
# given, so the test set is scaled with its own statistics rather than the
# training ones - confirm this is acceptable.
test_data = preprocess_data(test_data)

# Align the test columns with the training feature matrix X. One-hot encoding
# only creates columns for category levels present in the frame it sees, so the
# test frame can lack columns the model was trained on or have them in a
# different order. Missing columns are reported and filled with 0.
missing_cols = X.columns.difference(test_data.columns)
if len(missing_cols) > 0:
    print("Test columns missing after preprocessing (filled with 0):", list(missing_cols))
test_data = test_data.reindex(columns=X.columns, fill_value=0)

# Make predictions on the test data.
# NOTE(review): `clf` is whichever classifier was fitted most recently in this
# kernel; in top-to-bottom order that is the Random Forest - confirm intent.
test_predictions = clf.predict(test_data)

# Write integer 0/1 labels. If the model was trained on a standardized target
# its predictions are floats such as -1.0 / 1.0; thresholding at 0 maps both
# those and plain 0/1 predictions to 0/1.
test_predictions = (test_predictions > 0).astype(int)

# Save the predictions to a CSV file
pd.DataFrame(test_predictions, columns=['has_disease']).to_csv('/content/drive/MyDrive/Colab Notebooks/Fiverr/edcogan/answers.csv', index=False)
