# Project 3 Code

## Importing Datasets

In [22]:
import pandas as pd

diabetes_data = pd.read_csv('diabetes.csv')
diabetes_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Checking missing data

In [23]:
missing_values = (diabetes_data == 0).sum()
missing_values = missing_values.drop(['Pregnancies', 'Outcome'])
missing_values

Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
dtype: int64

## Imputation

In [24]:
# nil/zero values w/ the median for relevant columns---
diabetes_data['Glucose'] = diabetes_data['Glucose'].replace(0, diabetes_data['Glucose'].median())
diabetes_data['BloodPressure'] = diabetes_data['BloodPressure'].replace(0, diabetes_data['BloodPressure'].median())
diabetes_data['SkinThickness'] = diabetes_data['SkinThickness'].replace(0, diabetes_data['SkinThickness'].median())
diabetes_data['Insulin'] = diabetes_data['Insulin'].replace(0, diabetes_data['Insulin'].median())
diabetes_data['BMI'] = diabetes_data['BMI'].replace(0, diabetes_data['BMI'].median())

In [25]:
(diabetes_data == 0).sum()

Pregnancies                 111
Glucose                       0
BloodPressure                 0
SkinThickness                 0
Insulin                       0
BMI                           0
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64

## Setting up Linear Regression Model

### Importing libs

In [26]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

### Splitting the dataset into features & target variable

In [27]:
X = diabetes_data.drop(columns='Outcome') # features
y = diabetes_data['Outcome'] # target variable

### Split the dataset into training and testing sets (80/20 split)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Create model, train, and predict

In [29]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

### Evaluate

In [30]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

0.41191058593910673

## Experiment 2

### Importing new libraries

In [31]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

### Creating Polynomial Features

In [32]:
diabetes_data['Glucose_sq'] = diabetes_data['Glucose'] ** 2
diabetes_data['BMI_sq'] = diabetes_data['BMI'] ** 2

### Features & target variables

In [33]:
X = diabetes_data.drop(columns='Outcome')
y = diabetes_data['Outcome']

### 80/20 split again

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Scale features

In [35]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Define Ridge Regression model with GridSearch for best alpha

In [36]:
ridge = Ridge()
parameters = {'alpha': [0.1, 1.0, 10.0, 100.0]}
grid_search = GridSearchCV(ridge, parameters, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train_scaled, y_train)

In [37]:
best_ridge = grid_search.best_estimator_

### Make predictions and evaluate

In [38]:
y_pred = best_ridge.predict(X_test_scaled)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

### Best alpha parameter

In [39]:
grid_search.best_params_

{'alpha': 100.0}

### RMSE for Ridge Regression w/ Polynomial Features

In [40]:
rmse

0.3939423689595456