# Logistic regression model for Diabetes dataset

In [None]:
# Problem Statement :
# This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. 
# The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, 
# based on certain diagnostic measurements included in the dataset. 
# Several constraints were placed on the selection of these instances from a larger database. 
# In particular, all patients here are females at least 21 years old of Pima Indian heritage.

## Importing libraries and dataset

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
# Load the diabetes CSV into a DataFrame.
# NOTE: this is an absolute, machine-specific path; adjust it when running
# the notebook on another machine.
csv_path = r'C:\Users\visha\OneDrive\Desktop\New folder\verzeo\diabetes.csv'
df = pd.read_csv(csv_path)
# Preview the first rows to sanity-check the load.
df.head()

## Exploring the data

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
df.shape

In [None]:
# Column names, non-null counts and dtypes; useful for spotting missing values.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

## Correlation

In [None]:
df.corr()

In [None]:
corr_vis = sns.heatmap(df.corr(),cmap="Reds",annot=False)

## Assigning the independent variables as x and the dependent variable as y

In [None]:
# Feature matrix: the eight diagnostic measurements, extracted as a NumPy array.
# The column order here defines the feature order the model is trained on.
feature_columns = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]
x = df[feature_columns].values
x

In [None]:
# Target vector: the 'Outcome' column, extracted as a NumPy array.
# NOTE(review): the standard Pima Indians Diabetes dataset codes the negative
# class as 0 (not 2); confirm with df['Outcome'].unique().
y = df['Outcome'].values # 1 = Positive (the patient has diabetes), 0 = Negative (the patient does not have diabetes)
y

## Splitting into training and test data

In [None]:
# Hold out 25% of the samples for testing; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

## Standardizing x variables

In [None]:
ss = StandardScaler()

In [None]:
x_train = ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)

In [None]:
x_test

## Logistic regression

In [None]:
# Logistic regression classifier with scikit-learn's default hyperparameters.
classifier = LogisticRegression()

In [None]:
# Fit the classifier on the standardized training features and their labels.
classifier.fit(x_train, y_train)

In [None]:
# Predicted class labels for the standardized test features.
y_pred = classifier.predict(x_test)
y_pred

## Model accuracy

In [None]:
# Fraction of test samples whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
# Display it as a percentage.
accuracy * 100

## Confusion matrix

In [None]:
# Number of test samples; the confusion-matrix entries below sum to this.
len(y_test)

In [None]:
# Rows are the true classes and columns the predicted classes, so the diagonal
# holds the correctly classified counts.
confusion_matrix(y_test, y_pred)

## Prediction Model

In [None]:
# Hand-made sample of four patients used to demonstrate prediction on new data.
# Keys are listed in the same order as the feature columns used for training.
data = {
    'Pregnancies': [0, 1, 2, 3],
    'Glucose': [170, 165, 160, 180],
    # The last value was 1130, which is not a plausible blood-pressure reading;
    # corrected to 130, in line with the other samples.
    'BloodPressure': [120, 125, 115, 130],
    'SkinThickness': [55, 60, 50, 58],
    'Insulin': [36, 40, 30, 45],
    'BMI': [32.1, 30.1, 35.5, 29.2],
    'DiabetesPedigreeFunction': [0.65, 0.60, 0.80, 0.50],
    'Age': [21, 23, 21, 25],
}
women_data = pd.DataFrame(data)
women_data.head()

In [None]:
Outcome = classifier.predict(women_data)
print(Outcome)

In [None]:
women_data['Outcome'] = Outcome
women_data