In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded download URL>" entries.
# Each entry is downloaded and unpacked into KAGGLE_INPUT_PATH/<directory>.
# NOTE(review): the URL is a signed link carrying X-Goog-Date / X-Goog-Expires
# parameters, so it presumably stops working after a while; the download loop
# reports HTTP errors as "likely expired".
DATA_SOURCE_MAPPING = 'diabetes-dataset-for-beginners:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1956887%2F3226898%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240724%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240724T232335Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D817aeb3bce1756667892ee74d1122f89985ffa73b07a4c45b7ba42e69b175d2a91470dba940a4b79b9b38411b2a96fe206a73d3f31c6e9fdf636aaee3a97cc1594b1a4e6f3e867ac31d7b953cca94f8374db0547bf4053e6740bfbb2dff22c0669777f8208d967dad60d6f06af109904708a248daa97abf4204ddf3a176d1902d1103b615fb323cc8cf95b82b4b2f37ee22ababb9ceff5e123631076023fbcbde73fe6dbf7ab3aa08d52e6cd09cff59a875dfad8e7899b8ce2a11e518b6a373846dd6b3fdff0c27dfa38fe31f93ae91e340b4fa091bf5af54f5c0966a29983fb0b165c2592e2525898809fd6ddc69b885a20dc77976fa3294d800889f3b78aeb'

# Directory the downloaded data sources are unpacked into.
KAGGLE_INPUT_PATH='/kaggle/input'
# Directory created alongside the input directory for notebook output.
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced anywhere else in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every data source listed in DATA_SOURCE_MAPPING into a temporary
# file and unpack it into KAGGLE_INPUT_PATH/<directory>. A source that fails
# to download or unpack is reported and skipped; the remaining ones still run.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so the URL part can never be cut in two.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The Content-Length header may be absent; in that case only the
            # running byte counter is shown instead of a progress bar.
            total_length = fileres.headers['content-length']
            total_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            chunk = fileres.read(CHUNK_SIZE)
            while len(chunk) > 0:
                dl += len(chunk)
                tfile.write(chunk)
                if total_bytes > 0:
                    # Clamp to the bar width in case more bytes arrive than announced.
                    done = min(50, int(50 * dl / total_bytes))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                chunk = fileres.read(CHUNK_SIZE)
            # The tar branch reopens the file by name, so buffered bytes must
            # be written out before the archive is read.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the open archive to its own name. Reusing `tarfile`
                # here would replace the imported module with the archive
                # object and break tarfile.open() for every later source.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for folder, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading the Dataset

In [None]:
# Read the diabetes CSV (unpacked under /kaggle/input by the first cell)
# into a DataFrame and preview its first rows.
diabetes_csv_path = '/kaggle/input/diabetes-dataset-for-beginners/diabetes.csv'
data = pd.read_csv(diabetes_csv_path)
data.head()

# Dataset Information

In [None]:
# Display basic information about the dataset, including data types and non-null counts.
# This runs on the frame exactly as loaded from the CSV. The notebook later treats
# zeros in several measurement columns as missing values, so the non-null counts
# printed here do not reveal those.
data.info()

# Descriptive Statistics

In [None]:
# Generate descriptive statistics for each column in the dataset.
# Computed before any cleaning, so placeholder zeros (replaced further down in
# the notebook) are still included in these figures.
data.describe()

# Check for Duplicates

In [None]:
# Check for duplicate rows in the dataset: duplicated() flags rows that repeat
# an earlier row, and sum() counts the flagged rows.
data.duplicated().sum()

# Correlation Heatmap

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap
# on an explicitly created Axes.
corr_matrix = data.corr()
fig, ax = plt.subplots()
sns.heatmap(corr_matrix, annot=True, ax=ax)
plt.show()

# Histograms of Features that have zero values

In [None]:
# Measurement columns inspected here; later cells reuse this list when
# replacing zeros and when drawing box plots.
columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# One histogram (with KDE overlay) per column, placed on a 3x2 grid.
# Only the grid slots that are actually used get an Axes.
fig = plt.figure(figsize=(15, 12))
for position, feature in enumerate(columns, start=1):
    ax = fig.add_subplot(3, 2, position)
    sns.histplot(data[feature], kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
fig.tight_layout()
plt.show()

# Handling Zero Values

In [None]:
# Treat zeros in the measurement columns as missing values.
data[columns] = data[columns].replace(0, np.nan)

# Statistic used to fill the gaps of each column (mean or median, chosen
# per column from its distribution).
fill_statistic = {
    'Glucose': pd.Series.mean,
    'BloodPressure': pd.Series.mean,
    'SkinThickness': pd.Series.median,
    'Insulin': pd.Series.median,
    'BMI': pd.Series.mean,
}
for feature, statistic in fill_statistic.items():
    # The statistic is computed on the column with its NaNs still in place,
    # then used to fill exactly those NaNs.
    fill_value = statistic(data[feature])
    data[feature] = data[feature].fillna(fill_value)

# Drop Unnecessary Columns

In [None]:
# Remove the columns that are not used as model features, then preview the result.
# NOTE: `data` is rebound to the reduced frame, so re-running this cell without
# reloading the data will fail because the columns are already gone.
unused_columns = ['BloodPressure', 'Insulin', 'DiabetesPedigreeFunction']
data = data.drop(columns=unused_columns)
data.head()

# Box Plots of Selected Features

In [None]:
# Box plots (distribution and outliers) for the measurement columns that are
# still present in `data` after the column drop.
fig = plt.figure(figsize=(15, 10))
columns_to_plot = [col for col in columns if col in data.columns]

for position, feature in enumerate(columns_to_plot, start=1):
    ax = fig.add_subplot(3, 2, position)
    sns.boxplot(x=data[feature], ax=ax)
    ax.set_title(f'Box Plot of {feature}')
fig.tight_layout()
plt.show()

# Prepare Data for Modeling

In [None]:
# Target: the Outcome column. Features: every remaining column.
y = data["Outcome"]
x = data.drop(columns=["Outcome"])

# Check Target Variable

In [None]:
# Display the target variable (Outcome) selected in the previous cell.
# It is left as the bare last expression so the notebook renders it.
y


# Feature Scaling

In [None]:
# Standardize the features to mean=0 and variance=1.
# The fitted scaler is kept in `scale`; `x` is rebound to the scaled values.
scale = preprocessing.StandardScaler()
scale.fit(x)
x = scale.transform(x)
x

# Hyperparameter Tuning with Random State

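*This cell searches for the random state (the seed of the train/test split) that yields the highest test accuracy. It iterates over a range of random states, trains the model on each resulting split, and evaluates it on the corresponding test set. The random state that results in the highest accuracy is recorded and printed. Note that the random state is not a hyperparameter of the model: because the split is selected using the test set itself, the best accuracy found this way is an optimistic estimate of how the model performs on unseen data.*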

In [None]:
# Try 100 different train/test splits and report the split seed that gives the
# highest test accuracy.
# NOTE(review): the seed is selected using the test set itself, so the reported
# accuracy is an optimistic estimate rather than an unbiased one.
random_states = range(0, 100)  # Adjust the range as needed

best_accuracy = 0
best_random_state = None

for state in random_states:
    train_x, test_x, train_y, test_y = train_test_split(x, y, random_state=state, test_size=0.2)

    # `degree` only affects the 'poly' kernel, so it is omitted for the linear one.
    classifier = svm.SVC(kernel='linear')
    classifier.fit(train_x, train_y)
    predictions = classifier.predict(test_x)

    accuracy = accuracy_score(test_y, predictions)

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_random_state = state

# The loop leaves `classifier` fitted on the LAST split tried, not the best one.
# Rebuild the best split and refit, so that the model (and the split variables)
# used by later cells are the ones whose accuracy is reported below.
if best_random_state is not None:
    train_x, test_x, train_y, test_y = train_test_split(x, y, random_state=best_random_state, test_size=0.2)
    classifier = svm.SVC(kernel='linear')
    classifier.fit(train_x, train_y)
    predictions = classifier.predict(test_x)

print(f"Best random state: {best_random_state}, Best accuracy: {best_accuracy*100:.1f}%")

# Preparing Input Data for Prediction

In [None]:
# One example record to classify, given as raw (unscaled) feature values.
# NOTE(review): the five values presumably follow the column order of the
# feature frame built earlier in the notebook — confirm against its columns.
input_data = (2, 120, 35, 28.7, 45)
input_data_arr = np.array(input_data)
input_data_arr.shape

# Reshape Input Data

In [None]:
# Turn the flat feature vector into a single-row 2-D array, i.e. one sample
# with all features in one row, which is the layout the model expects.
input_data_reshaped = np.reshape(input_data_arr, (1, -1))
input_data_reshaped.shape

# Make Prediction

In [None]:
# Use the trained model to make a prediction on the input data.
# The classifier was fitted on standardized features, so the raw example must
# go through the same fitted scaler first; feeding unscaled values would put
# the sample on a completely different scale than the training data.
# (The scaler was fitted on a DataFrame, so scikit-learn may emit a harmless
# warning about missing feature names for this plain array.)
input_data_scaled = scale.transform(input_data_reshaped)
model_pred = classifier.predict(input_data_scaled)

# predict() returns one label per input row; there is exactly one row here.
if model_pred[0] == 0:
    print("The person doesn't have diabetes")
else:
    print("The person has diabetes")