In [None]:
import pandas as pd

# Read the diabetes dataset into a pandas DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="/content/diabetes_prediction_dataset.csv")
df.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [None]:
# Print the frame summary (column dtypes, non-null counts, memory usage) ...
df.info()
# ... then display the number of missing values in each column.
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


Unnamed: 0,0
gender,0
age,0
hypertension,0
heart_disease,0
smoking_history,0
bmi,0
HbA1c_level,0
blood_glucose_level,0
diabetes,0


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode each categorical column with its OWN LabelEncoder and keep the fitted
# encoders, so the integer codes can later be mapped back to the original
# category names (e.g. label_encoders['gender'].inverse_transform(...)).
#
# The previous version reused a single encoder for both columns and then
# encoded 'smoking_history' a second time. The resulting codes were the same,
# but the encoder ended up fitted on the integer codes themselves, so the
# category <-> code mapping of both columns was lost.
label_encoders = {}
for column in ('gender', 'smoking_history'):
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

In [None]:
# Predictor columns, in the order they appear in the feature matrix X.
features = [
    'gender',
    'age',
    'hypertension',
    'heart_disease',
    'smoking_history',
    'bmi',
    'HbA1c_level',
    'blood_glucose_level',
]
# Column holding the class to predict.
target = 'diabetes'

# Plain NumPy arrays for scikit-learn: X is (rows, len(features)), y is (rows,).
X = df[features].to_numpy()
y = df[target].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same
# transformation to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Train a logistic-regression classifier on the scaled training features
# and report its accuracy on the scaled test features.
log_reg = LogisticRegression().fit(X_train_scaled, y_train)
y_pred_log_reg = log_reg.predict(X_test_scaled)
log_reg_accuracy = metrics.accuracy_score(y_test, y_pred_log_reg)
print("Logistic Regression Accuracy:", log_reg_accuracy)

Logistic Regression Accuracy: 0.96055


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a seeded random forest on the scaled training features and report
# its accuracy on the scaled test features. Relies on `metrics` imported in
# the logistic-regression cell.
rf_clf = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)
y_pred_rf = rf_clf.predict(X_test_scaled)
rf_accuracy = metrics.accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)

Random Forest Accuracy: 0.97055


In [None]:
import numpy as np
from sklearn.cluster import KMeans

# Cluster the scaled training features into two groups.
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X_train_scaled)

# K-Means cluster ids are arbitrary: cluster 0 is not "class 0". Comparing raw
# cluster ids with y_test therefore yields a number that depends only on how
# the ids happened to be assigned. To score the clustering as a classifier,
# each cluster is mapped to the majority diabetes label of the training rows
# assigned to it, and that mapping is applied to the test-set assignments.
# An empty cluster falls back to label 0 (bincount of nothing is all zeros).
cluster_to_label = np.array([
    np.bincount(y_train[kmeans.labels_ == cluster_id], minlength=2).argmax()
    for cluster_id in range(kmeans.n_clusters)
])

# y_pred_kmeans now holds predicted class labels (0/1), not raw cluster ids.
# NOTE: if both clusters are dominated by the same class, every prediction is
# that class and the accuracy equals the majority-class rate.
y_pred_kmeans = cluster_to_label[kmeans.predict(X_test_scaled)]
print("KMeans Clustering Accuracy:", metrics.accuracy_score(y_test, y_pred_kmeans))

KMeans Clustering Accuracy: 0.41935


In [None]:
import numpy as np
from sklearn.mixture import GaussianMixture

# Fit a two-component Gaussian mixture on the scaled training features.
gmm = GaussianMixture(n_components=2, random_state=42)
gmm.fit(X_train_scaled)

# Mixture component ids are arbitrary: component 0 is not "class 0". Comparing
# raw component ids with y_test yields a number that depends only on how the
# ids happened to be assigned. To score the mixture as a classifier, each
# component is mapped to the majority diabetes label of the training rows
# assigned to it, and that mapping is applied to the test-set assignments.
# An empty component falls back to label 0 (bincount of nothing is all zeros).
train_components = gmm.predict(X_train_scaled)
component_to_label = np.array([
    np.bincount(y_train[train_components == component_id], minlength=2).argmax()
    for component_id in range(gmm.n_components)
])

# y_pred_gmm now holds predicted class labels (0/1), not raw component ids.
# NOTE: if both components are dominated by the same class, every prediction
# is that class and the accuracy equals the majority-class rate.
y_pred_gmm = component_to_label[gmm.predict(X_test_scaled)]
print("GMM Clustering Accuracy:", metrics.accuracy_score(y_test, y_pred_gmm))

GMM Clustering Accuracy: 0.1321


In [None]:
from sklearn.metrics import classification_report

# Print a per-class precision/recall/F1 report for every model's test-set
# predictions, in the order the models were trained above.
predictions_by_model = (
    ("Logistic Regression", y_pred_log_reg),
    ("Random Forest", y_pred_rf),
    ("KMeans Clustering", y_pred_kmeans),
    ("GMM Clustering", y_pred_gmm),
)
for model_name, model_predictions in predictions_by_model:
    print(f"{model_name} Classification Report:\n", classification_report(y_test, model_predictions))

Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98     18297
           1       0.86      0.64      0.74      1703

    accuracy                           0.96     20000
   macro avg       0.91      0.82      0.86     20000
weighted avg       0.96      0.96      0.96     20000

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98     18297
           1       0.95      0.69      0.80      1703

    accuracy                           0.97     20000
   macro avg       0.96      0.84      0.89     20000
weighted avg       0.97      0.97      0.97     20000

KMeans Clustering Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.46      0.59     18297
           1       0.01      0.03      0.01      1703

    accuracy                           0.42     20000
   m

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col, log, mean
from pyspark.sql.types import NumericType

# NOTE(review): this cell rebinds `df`, `features`, `scaler`, `StandardScaler`
# and `RandomForestClassifier`, which earlier cells bound to pandas /
# scikit-learn objects. Re-run the notebook from the top before re-executing
# an earlier cell that relies on those names.

# --- Load -------------------------------------------------------------------
spark = SparkSession.builder.appName("DiabetesPrediction").getOrCreate()
df = spark.read.csv("/content/diabetes_prediction_dataset.csv", header=True, inferSchema=True)

label = 'diabetes'

# --- Feature engineering ----------------------------------------------------
# log(x + 1) transforms of the two skew-prone measurements.
df = df.withColumn("bmi_log", log(col("bmi") + 1))
df = df.withColumn("blood_glucose_level_log", log(col("blood_glucose_level") + 1))

# --- Missing-value handling -------------------------------------------------
# A row without a label cannot be used for training or evaluation, and the
# label must never be mean-imputed, so such rows are dropped up front.
df = df.dropna(subset=[label])

# Select numeric columns by inspecting the schema's DataType objects.
# (Comparing `dataType` to the str "string" is always unequal, so the old
# filter never excluded anything.) The label is excluded from imputation.
numeric_cols = [
    field.name
    for field in df.schema.fields
    if isinstance(field.dataType, NumericType) and field.name != label
]
# All column means are computed in a single aggregation instead of one Spark
# job per column.
mean_row = df.select([mean(name).alias(name) for name in numeric_cols]).first()
means = {name: value for name, value in mean_row.asDict().items() if value is not None}
if means:
    df = df.fillna(means)

# Most frequent non-null category per string column. Nulls are filtered out
# first so that the "mode" used to fill nulls can never itself be null.
modes = {}
for name in ('gender', 'smoking_history'):
    top_row = (
        df.filter(col(name).isNotNull())
        .groupBy(name)
        .count()
        .orderBy('count', ascending=False)
        .first()
    )
    if top_row is not None:
        modes[name] = top_row[name]
if modes:
    df = df.fillna(modes)

# --- Pipeline definition ----------------------------------------------------
# Logical feature list (kept for reference; the assembler below lists the
# indexed/derived column names it actually consumes).
features = ['gender', 'age', 'hypertension', 'heart_disease', 'smoking_history', 'bmi_log', 'HbA1c_level', 'blood_glucose_level_log']
label_indexer = StringIndexer(inputCol=label, outputCol=label + "_index", handleInvalid="keep")
indexers = [
    StringIndexer(inputCol='gender', outputCol='gender_index', handleInvalid="keep"),
    StringIndexer(inputCol='smoking_history', outputCol='smoking_history_index', handleInvalid="keep")
]
assembler = VectorAssembler(inputCols=['gender_index', 'smoking_history_index', 'age', 'hypertension', 'heart_disease', 'bmi_log', 'HbA1c_level', 'blood_glucose_level_log'], outputCol="features")
train_data, test_data = df.randomSplit([0.8, 0.2], seed=1234)
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
rf = RandomForestClassifier(labelCol=label + "_index", featuresCol="scaled_features")
pipeline = Pipeline(stages=indexers + [label_indexer, assembler, scaler, rf])

# --- Fit and evaluate -------------------------------------------------------
model = pipeline.fit(train_data)
predictions = model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(labelCol=label + "_index", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy of the diabetes prediction model:", accuracy)
predictions.select("features", "scaled_features", "prediction", label + "_index").show(5)


Accuracy of the diabetes prediction model: 0.9727510829052081
+--------------------+--------------------+----------+--------------+
|            features|     scaled_features|prediction|diabetes_index|
+--------------------+--------------------+----------+--------------+
|(8,[2,5,6,7],[0.0...|(8,[2,5,6,7],[0.0...|       0.0|           0.0|
|(8,[2,5,6,7],[0.0...|(8,[2,5,6,7],[0.0...|       0.0|           0.0|
|(8,[2,5,6,7],[0.1...|(8,[2,5,6,7],[0.0...|       0.0|           0.0|
|(8,[2,5,6,7],[0.1...|(8,[2,5,6,7],[0.0...|       0.0|           0.0|
|(8,[2,5,6,7],[0.1...|(8,[2,5,6,7],[0.0...|       0.0|           0.0|
+--------------------+--------------------+----------+--------------+
only showing top 5 rows



In [None]:
# Fit the Spark pipeline on the training split again, score the held-out
# split, and print the resulting accuracy. This repeats the fit/evaluate
# steps of the pipeline cell using the same pipeline and data splits.
model = pipeline.fit(train_data)
predictions = model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(
    labelCol=label + "_index",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9727510829052081


In [None]:
import pickle
# Re-import scikit-learn's class explicitly: the PySpark cell rebinds the name
# `RandomForestClassifier` to pyspark.ml's estimator.
from sklearn.ensemble import RandomForestClassifier

# Train the forest that gets persisted. It is fitted on the UNSCALED X_train,
# so callers of the pickled model must pass raw (label-encoded, unscaled)
# feature rows in the same column order used to build X.
# random_state is fixed, matching the other estimators in this notebook, so
# that re-running the cell writes the same model.
# NOTE: `model` previously referred to the fitted Spark pipeline; it is
# rebound to the scikit-learn estimator here.
model = RandomForestClassifier(n_jobs=1, random_state=42)
model.fit(X_train, y_train)

# Serialize the fitted estimator to model.pkl in the current working directory.
# Only unpickle this file from a trusted source: pickle can execute code on load.
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [None]:
# List the current working directory (e.g. to check that model.pkl was written).
!ls

diabetes_prediction_dataset.csv  model.pkl  sample_data
