In [None]:
import pandas as pd
import numpy as np

# Path to the diabetes CSV. The raw-string prefix keeps the Windows
# backslashes literal.
# NOTE(review): this is an absolute, machine-specific path -- point it at your
# own copy of the file when running the notebook elsewhere.
DATA_PATH = r"C:\Users\Tamim\Desktop\Semester 6\Data Mining\project\diabetes.csv"

# Columns where a recorded 0 is not a valid measurement and is treated as
# "missing". Named once so the selection and the assignment cannot drift apart.
ZERO_AS_MISSING_COLS = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Load the CSV dataset
df = pd.read_csv(DATA_PATH)

# Turn the placeholder zeros into NaN so they can be imputed below
df[ZERO_AS_MISSING_COLS] = df[ZERO_AS_MISSING_COLS].replace(0, np.nan)

# Impute missing values with each column's median.
# - The result is assigned rather than using inplace=True, so the step reads
#   as "df is the imputed frame".
# - numeric_only=True keeps the median computation from raising if the file
#   ever contains a non-numeric column (such a column is simply left as is).
df = df.fillna(df.median(numeric_only=True))

# Checking for any remaining missing values (expected: 0 for every numeric column)
print(df.isnull().sum())


In [None]:
from sklearn.preprocessing import StandardScaler

# Feature matrix: every column except the target.
features = df.drop(columns=['Outcome'])

# Standardize each feature column (fit and transform in one step).
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Convert the scaled array back to a DataFrame.
# Labels are taken from the exact frame that was scaled, so they stay correct
# even if 'Outcome' is not the last column of df, and the original row index is
# kept so scaled_df still lines up with df['Outcome'] label-for-label.
scaled_df = pd.DataFrame(scaled_features, columns=features.columns, index=features.index)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# --- Histograms ---------------------------------------------------------
# DataFrame.hist creates its own figure; suptitle then labels that figure.
df.hist(figsize=(10, 8))
plt.suptitle("Distribution of Features")
plt.show()

# --- Box plots (one box per column) to spot outliers ----------------------
fig_box, ax_box = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, ax=ax_box)
ax_box.set_title("Boxplot of Features to Identify Outliers")
plt.show()

# --- Annotated correlation heatmap ----------------------------------------
corr_matrix = df.corr()
fig_corr, ax_corr = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax_corr)
ax_corr.set_title("Correlation Matrix")
plt.show()


In [None]:
# training and testing the dataset
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Define features (X) and target variable (y)
X = scaled_df  # Using the scaled features from the previous step
y = df['Outcome']  # The target variable (assumed to be 'Outcome' in your dataset)

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression preceded by its own scaler.
# Why: scaled_df was standardized with means/standard deviations computed over
# ALL rows, so the rows that ended up in the test set influenced the
# preprocessing (data leakage). A StandardScaler inside the pipeline is fit on
# X_train only, and X_test is transformed with those training statistics.
# Re-standardizing already-standardized columns is harmless: for a column with
# non-zero spread, the result equals standardizing the raw column on the
# training rows.
model = make_pipeline(StandardScaler(), LogisticRegression())

# Train the pipeline (scaler + classifier) on the training data
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)


In [None]:
# model performance
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Score the test-set predictions three ways; the three calls are independent
# of one another.
class_report = classification_report(y_test, y_pred)  # precision, recall, F1-score
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report: accuracy on one line, then each heading followed by its table
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:", conf_matrix, sep="\n")
print("Classification Report:", class_report, sep="\n")


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

# Scaler + Logistic Regression as a single estimator.
# Why: cross_val_score re-fits the whole estimator on each fold's training
# rows, so with the scaler inside the pipeline every fold is standardized using
# its own training statistics only. Scoring a bare LogisticRegression on
# features that were scaled once over all rows lets each validation fold leak
# into the preprocessing. This is correct whether or not X has already been
# standardized.
# Note: this rebinds `model`, replacing any previously fitted model of that name.
model = make_pipeline(StandardScaler(), LogisticRegression())

# Perform 5-Fold Cross-Validation
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

# Display the individual fold scores and their average
print("Cross-Validation Scores (Accuracy for each fold):", cv_scores)
print("Mean Accuracy:", np.mean(cv_scores))
print("Standard Deviation of Accuracy:", np.std(cv_scores))
