In [10]:
# Import necessary libraries
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [3]:
df = pd.read_csv("/content/diabetes_no_zeros.csv")

In [5]:
#section the df
df_ST_not0 = df[df['SkinThickness'] != 0]
df_ST_is0 = df[df['SkinThickness'] == 0]

In [8]:
df_ST_is0.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
2,8,183,64,0,0,23.3,0.672,32,1
5,5,116,74,0,0,25.6,0.201,30,0
8,4,110,92,0,0,37.6,0.191,30,0
9,10,168,74,0,0,38.0,0.537,34,1
10,10,139,80,0,0,27.1,1.441,57,0


In [7]:
df_ST_not0.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
6,3,78,50,32,88,31.0,0.248,26,1


In [11]:
#segregate training dataset
X = df_ST_not0.drop(columns=['SkinThickness'])
y = df_ST_not0['SkinThickness']

X_train, X_temp, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
X_test, X_val, y_test, y_val = train_test_split (X, y, test_size=0.5, random_state=42)

In [12]:
model = RandomForestRegressor(random_state = 42)

In [13]:
model.fit(X_train, y_train)

In [15]:
y_pred_val = model.predict(X_val)
print('Validation MSE:', mean_squared_error(y_val, y_pred_val))

Validation MSE: 56.49761654135339


In [16]:
#tune the model first
from sklearn.model_selection import GridSearchCV

In [17]:
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [None, 5, 10, 15],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10]
}

# Initialize the Random Forest Regressor
rf_regressor = RandomForestRegressor(random_state=42)

# Perform Grid Search for hyperparameter tuning
grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit the Grid Search to the non-zero data
X = df_ST_not0.drop(columns=['SkinThickness'])
y = df_ST_not0['SkinThickness']
grid_search.fit(X, y)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = -grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score (MSE):", best_score)

# Train a new model with the best parameters
best_model = RandomForestRegressor(**best_params, random_state=42)
best_model.fit(X, y)


720 fits failed out of a total of 2160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
720 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
s

Best Parameters: {'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 400}
Best Score (MSE): 63.43300855451614


In [18]:
X_zero = df_ST_is0.drop(columns=['SkinThickness'])
predicted_values = best_model.predict(X_zero)

# Add the predicted values back to the zero data points
df_ST_is0['SkinThickness'] = predicted_values

# Combine the non-zero data with the predicted zero data
final_data = pd.concat([df_ST_not0, df_ST_is0 ], ignore_index=True)

# Output the final dataset
print(final_data.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72           35.0        0  33.6   
1            1       85             66           29.0        0  26.6   
2            1       89             66           23.0       94  28.1   
3            0      137             40           35.0      168  43.1   
4            3       78             50           32.0       88  31.0   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.167   21        0  
3                     2.288   33        1  
4                     0.248   26        1  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ST_is0['SkinThickness'] = predicted_values


In [19]:
final_data.head(100)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35.0,0,33.6,0.627,50,1
1,1,85,66,29.0,0,26.6,0.351,31,0
2,1,89,66,23.0,94,28.1,0.167,21,0
3,0,137,40,35.0,168,43.1,2.288,33,1
4,3,78,50,32.0,88,31.0,0.248,26,1
...,...,...,...,...,...,...,...,...,...
95,4,154,62,31.0,284,32.8,0.237,23,0
96,9,57,80,37.0,0,32.8,0.096,41,0
97,2,106,64,35.0,119,30.5,1.400,34,0
98,2,90,70,17.0,0,27.3,0.085,22,0


In [20]:
output_file = 'final_data.csv'

In [21]:
final_data.to_csv(output_file, index=False)