In [12]:
# Question 1

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.metrics import roc_auc_score
from scipy.special import expit
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
diabetes_data = pd.read_csv('diabetes.csv')


X = diabetes_data.drop('Diabetes', axis=1)
y = diabetes_data['Diabetes']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

perceptron = Perceptron(tol=1e-3, random_state=42)
perceptron.fit(X_train_scaled, y_train)

y_scores = expit(perceptron.decision_function(X_test_scaled))
auc_score = roc_auc_score(y_test, y_scores)

print("AUC Score:", auc_score)

AUC Score: 0.701358524956705


In [3]:
# Question 2

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd


data = pd.read_csv('diabetes.csv')

X = data.drop('Diabetes', axis=1) 
y = data['Diabetes']


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


configurations = [
    (1, 'identity'),   
    (1, 'relu'),       
    (1, 'logistic'),   
    (2, 'identity'),   
    (2, 'relu'),       
    (2, 'logistic'),   
    (3, 'identity'),  
    (3, 'relu'),      
    (3, 'logistic')    
]

results = {}

for layers, activation in configurations:
 
    hidden_layer_sizes = tuple([50] * layers)  
    mlp = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, max_iter=500, random_state=42)
    mlp.fit(X_train, y_train)
    
   
    if activation == 'identity':  
        try:
            y_scores = mlp.decision_function(X_test)
        except AttributeError:
            y_scores = mlp.predict_proba(X_test)[:, 1]
    else:
        y_scores = mlp.predict_proba(X_test)[:, 1]
    
    auc = roc_auc_score(y_test, y_scores)
    results[(layers, activation)] = auc

print("AUC scores by configuration:")
for config, auc in results.items():
    print(f"Layers: {config[0]}, Activation: {config[1]}, AUC: {auc:.3f}")

AUC scores by configuration:
Layers: 1, Activation: identity, AUC: 0.824
Layers: 1, Activation: relu, AUC: 0.832
Layers: 1, Activation: logistic, AUC: 0.834
Layers: 2, Activation: identity, AUC: 0.825
Layers: 2, Activation: relu, AUC: 0.820
Layers: 2, Activation: logistic, AUC: 0.833
Layers: 3, Activation: identity, AUC: 0.825
Layers: 3, Activation: relu, AUC: 0.801
Layers: 3, Activation: logistic, AUC: 0.832


In [17]:
# Question 3

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import roc_auc_score


data = pd.read_csv('diabetes.csv')
X = data.drop('Diabetes', axis=1)
y = data['Diabetes']


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

model = Sequential([
    Dense(128, input_shape=(X_train.shape[1],), activation='relu'),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),  
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=50, batch_size=10, verbose=1)


y_pred_proba = model.predict(X_test).ravel()
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f'The AUC score of the deep network is: {auc_score}')


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m20295/20295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 320us/step - accuracy: 0.8614 - loss: 0.3256
Epoch 2/50
[1m20295/20295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 314us/step - accuracy: 0.8665 - loss: 0.3164
Epoch 3/50
[1m20295/20295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 314us/step - accuracy: 0.8675 - loss: 0.3132
Epoch 4/50
[1m20295/20295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 315us/step - accuracy: 0.8673 - loss: 0.3131
Epoch 5/50
[1m20295/20295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 318us/step - accuracy: 0.8671 - loss: 0.3126
Epoch 6/50
[1m20295/20295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 315us/step - accuracy: 0.8683 - loss: 0.3115
Epoch 7/50
[1m20295/20295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 323us/step - accuracy: 0.8679 - loss: 0.3130
Epoch 8/50
[1m20295/20295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 320us/step - accuracy: 0.8672 - loss: 0.3144
Epo

In [2]:
# Question 4

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_squared_error


data = pd.read_csv('diabetes.csv')
X = data.drop('BMI', axis=1)  
y = data['BMI']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


def create_model(activation_function):
    
    if activation_function == 'identity':
        activation_function = None  
    elif activation_function == 'logistic':
        activation_function = 'sigmoid'  
    
    model = Sequential([
        Dense(64, input_shape=(X_train_scaled.shape[1],), activation=activation_function),
        Dense(1)  
    ])
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

activations = ['identity', 'logistic', 'relu', 'tanh'] 


results = []
for activation in activations:
    print(f"Training with {activation} activation...")
    model = create_model(activation)
    model.fit(X_train_scaled, y_train, epochs=50, batch_size=10, verbose=0)
    y_pred = model.predict(X_test_scaled)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    results.append((activation, rmse))
    print(f"Activation: {activation}, RMSE: {rmse}")

for result in results:
    print(f"Activation: {result[0]}, RMSE: {result[1]:.4f}")


Training with identity activation...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1586/1586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175us/step
Activation: identity, RMSE: 6.1389578283174675
Training with logistic activation...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1586/1586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175us/step
Activation: logistic, RMSE: 5.990831278213405
Training with relu activation...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1586/1586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170us/step
Activation: relu, RMSE: 6.010831566111916
Training with tanh activation...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1586/1586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173us/step
Activation: tanh, RMSE: 6.00617925214004
Activation: identity, RMSE: 6.1390
Activation: logistic, RMSE: 5.9908
Activation: relu, RMSE: 6.0108
Activation: tanh, RMSE: 6.0062


In [8]:
# Question 5

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2
from sklearn.metrics import mean_squared_error


data = pd.read_csv('diabetes.csv')
X = data.drop('BMI', axis=1)  
y = data['BMI']


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


model = Sequential([
    Dense(128, input_shape=(X_train.shape[1],), activation='relu'),
    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(1)
])
model.compile(optimizer='adam', loss='mean_squared_error')


history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_val, y_val), verbose=1)


y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Test RMSE: {rmse}")


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5550/5550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 375us/step - loss: 83.7034 - val_loss: 73.0205
Epoch 2/100
[1m5550/5550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 367us/step - loss: 40.3507 - val_loss: 71.1038
Epoch 3/100
[1m5550/5550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 373us/step - loss: 37.5056 - val_loss: 55.5946
Epoch 4/100
[1m5550/5550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 374us/step - loss: 36.6519 - val_loss: 52.0254
Epoch 5/100
[1m5550/5550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 370us/step - loss: 37.0007 - val_loss: 48.0620
Epoch 6/100
[1m5550/5550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 364us/step - loss: 36.5506 - val_loss: 43.1296
Epoch 7/100
[1m5550/5550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 362us/step - loss: 36.3429 - val_loss: 44.7275
Epoch 8/100
[1m5550/5550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 370us/step - loss: 36.5806 - val_loss: 45.7

In [11]:
pip install scikeras

Collecting scikeras
  Obtaining dependency information for scikeras from https://files.pythonhosted.org/packages/ea/09/1c02aa24daf7a003c06f629fbb69dc9ae1bda1b247d7b8981e550d752ac9/scikeras-0.13.0-py3-none-any.whl.metadata
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Collecting scikit-learn>=1.4.2 (from scikeras)
  Obtaining dependency information for scikit-learn>=1.4.2 from https://files.pythonhosted.org/packages/f2/30/1299e84d2ba3bc735baf17cebbf5b9d55144243c41b3ec6559ce3cf61e23/scikit_learn-1.4.2-cp311-cp311-macosx_12_0_arm64.whl.metadata
  Downloading scikit_learn-1.4.2-cp311-cp311-macosx_12_0_arm64.whl.metadata (11 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Downloading scikit_learn-1.4.2-cp311-cp311-macosx_12_0_arm64.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: scikit-learn, scikeras
  Attempting uninstall: 

In [14]:
# Extra Credit 1

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_squared_error

data = pd.read_csv('diabetes.csv')
X = data.drop('BMI', axis=1) 
y = data['BMI']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(1)
])
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train_scaled, y_train, epochs=100, batch_size=10, verbose=1)


def calculate_rmse(predictions, targets):
    return np.sqrt(mean_squared_error(predictions, targets))


original_pred = model.predict(X_test_scaled)
original_rmse = calculate_rmse(original_pred, y_test)


importance_scores = {}
for i, col in enumerate(X.columns):
    X_test_permuted = X_test_scaled.copy()
    np.random.shuffle(X_test_permuted[:, i])  
    permuted_pred = model.predict(X_test_permuted)
    permuted_rmse = calculate_rmse(permuted_pred, y_test)
    importance_scores[col] = original_rmse - permuted_rmse  

importance_df = pd.DataFrame(list(importance_scores.items()), columns=['Feature', 'Importance'])
print(importance_df.sort_values(by='Importance', ascending=False))


Epoch 1/100
[1m    1/20295[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m57:49[0m 171ms/step - loss: 816.5577

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m20295/20295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 194us/step - loss: 88.5131
Epoch 2/100
[1m20295/20295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 196us/step - loss: 36.7672
Epoch 3/100
[1m20295/20295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 199us/step - loss: 35.9431
Epoch 4/100
[1m20295/20295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 193us/step - loss: 36.1352
Epoch 5/100
[1m20295/20295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 191us/step - loss: 36.3920
Epoch 6/100
[1m20295/20295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 193us/step - loss: 36.1288
Epoch 7/100
[1m20295/20295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 193us/step - loss: 36.3370
Epoch 8/100
[1m20295/20295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 194us/step - loss: 36.0112
Epoch 9/100
[1m20295/20295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 194us/step - loss: 35.2420
Epoch 10/100
[1m20295/20295[0

In [21]:
# Extra Credit 2

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Generate a synthetic dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=2, n_redundant=0, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Neural Network
nn_model = Sequential([
    Dense(10, activation='relu', input_dim=20),
    Dense(1, activation='sigmoid')
])
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
nn_model.fit(X_train, y_train, epochs=50, batch_size=10, verbose=0)

# Other models
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)

forest_model = RandomForestClassifier()
forest_model.fit(X_train, y_train)

gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)

# Evaluate the models
models = {'Neural Network': nn_model, 'Logistic Regression': lr_model, 'Decision Tree': tree_model, 
          'Random Forest': forest_model, 'Gradient Boosting': gb_model}
for name, model in models.items():
    if name == 'Neural Network':
        loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
        print(f'{name} Accuracy: {accuracy:.4f}')
    else:
        predictions = model.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        print(f'{name} Accuracy: {accuracy:.4f}')


ImportError: cannot import name 'pd_fillna' from 'sklearn.utils.fixes' (/Users/khushiagarwal/anaconda3/lib/python3.11/site-packages/sklearn/utils/fixes.py)