# **Reading a CSV File using Pandas**

In [232]:
import pandas as pd

In [233]:
# Load the diabetes dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../input/diabetes-dataset-trial/diabetes.csv')

In [234]:
# Preview the first five rows.
df.head(n=5)

# **Finding missing values in dataset**
https://towardsdatascience.com/data-cleaning-with-python-and-pandas-detecting-missing-values-3e9c6ebcf78b

**Standard Missing Values**

In [235]:
# Display the Pregnancies column.
df.loc[:, 'Pregnancies']

In [236]:
# Boolean mask: True where Pregnancies holds a standard missing value (NaN).
df['Pregnancies'].isna()

**Non-Standard Missing Values**

In [237]:
# Extra strings to be parsed as NaN when the file is read.
missing_values = ["n/a", "na", "--"]
df = pd.read_csv(
    "../input/diabetes-dataset-trial/diabetes.csv",
    na_values=missing_values,
)

In [238]:
# Print the Glucose column followed by its missing-value mask.
print(df['Glucose'], df['Glucose'].isna(), sep='\n')

**Unexpected Missing Values**

In [239]:
def _not_int_like(value):
    """Return True when ``int(value)`` fails, i.e. the entry is not integer-like."""
    try:
        int(value)
    except (ValueError, TypeError):
        # ValueError: stray text or NaN; TypeError: None or other non-numeric objects.
        return True
    return False


# Flag "unexpected" missing values: every Insulin entry that cannot be read as
# an integer is replaced by NaN. A boolean mask is used instead of a manual
# counter so the assignment follows the real index labels, and float('nan') is
# used because numpy has not been imported at this point in the notebook.
invalid_insulin = df['Insulin'].map(_not_int_like)
if invalid_insulin.any():
    df.loc[invalid_insulin, 'Insulin'] = float('nan')

In [240]:
# Show the Insulin column after cleaning.
print(df.loc[:, 'Insulin'])

**Total missing values for each feature**

In [241]:
# Count missing values column by column.
print(df.isna().sum(axis=0))

**Any missing values?**

In [242]:
# True if at least one cell in the frame is missing.
print(df.isna().to_numpy().any())

**Total number of missing values**

In [243]:
# Total number of missing cells: per-column counts, then summed.
print(df.isna().sum(axis=0).sum())

# **Dealing with Missing Data**

**Filling in missing values with a single value**

In [244]:
# Replace missing values with a number.
# Assign the result back to the column: calling fillna(..., inplace=True) on
# the selection df['Age'] is chained assignment, which is deprecated and does
# not update `df` when pandas Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(0)

**Location based replacement**

In [245]:
# Look up the Age value stored at index label 2.
df.at[2, 'Age']

In [246]:
# Overwrite the Age value at index label 2.
df.at[2, 'Age'] = 21

In [247]:
# Confirm the replacement at index label 2.
df.at[2, 'Age']

**Replace using median**

In [248]:
# Median of the Age column (missing values are skipped), then display it.
median = df['Age'].median(skipna=True)
median

In [249]:
# Fill any remaining missing ages with the median computed above.
# Assign back rather than using inplace=True on the selection df['Age']:
# that chained in-place form is deprecated and leaves `df` unchanged when
# pandas Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(median)

# **Calculating Correlation between Attributes**
https://machinelearningmastery.com/how-to-use-correlation-to-understand-the-relationship-between-variables/

**Syntax of dataframe.corr()**

**Syntax:** `DataFrame.corr(self, method='pearson', min_periods=1)`

**Parameters:** 

**method :** 
* pearson: standard correlation coefficient 
* kendall: Kendall Tau correlation coefficient 
* spearman: Spearman rank correlation

**min_periods :** Minimum number of observations required per pair of columns to have a valid result. Currently only available for pearson and spearman correlation 

**Returns:** DataFrame — the correlation matrix.

In [250]:
# Pairwise Pearson correlation between all columns.
df.corr(method='pearson', min_periods=1)

# **Heatmap for the data**
https://www.geeksforgeeks.org/ml-matrix-plots-in-seaborn/

A heatmap is a matrix plot in which every cell is coloured according to its value. To use a heatmap the data should be in matrix form: the index (row) labels and the column labels must both be meaningful, so that each cell holds a relevant value for its row–column pair, as in a correlation matrix.

In [251]:
import seaborn as sns

# Plain heatmap of the Pearson correlation matrix.
sns.heatmap(data=df.corr(method='pearson'))

**Attributes in Heatmap()**

* annot is used to annotate the actual value that belongs to these cells
* cmap is used for the colour mapping you want like coolwarm, plasma, magma etc.
* linecolor is used to set the colour of the lines separating the cells.
* linewidth is used to set the width of the lines separating the cells.


**Pearson Method**

In [252]:
# Annotated heatmap of the Pearson correlation matrix.
sns.heatmap(
    data=df.corr(method='pearson'),
    annot=True,
    cmap='magma',
    linecolor='black',
    linewidth=1,
)

**Kendall Method**

In [253]:
# Annotated heatmap of the Kendall Tau correlation matrix.
sns.heatmap(
    data=df.corr(method='kendall'),
    annot=True,
    cmap='magma',
    linecolor='black',
    linewidth=1,
)

**Spearman Method**

In [254]:
# Annotated heatmap of the Spearman rank correlation matrix.
sns.heatmap(
    data=df.corr(method='spearman'),
    annot=True,
    cmap='magma',
    linecolor='black',
    linewidth=1,
)

# **Plotting Visual Map for the given Dataset**

In [255]:
# Scatter-plot matrix of every pair of columns.
sns.pairplot(data=df)

In [256]:
# Collects one test-set accuracy score per model trained below.
accuracy = list()

# **Random Forest**

**Import Libraries**

In [257]:
import pandas as pd
import numpy as np

**Importing Dataset**

In [258]:
# Reload the raw dataset for the modelling sections.
# The path previously contained a long run of stray spaces after '../input',
# which made it point at a non-existent location.
df = pd.read_csv('../input/diabetes-dataset-trial/diabetes.csv')

In [259]:
# Preview the first five rows of the reloaded data.
df.head(n=5)

**Preparing Data for Training**

In [260]:
# First eight columns are used as features, the ninth column as the target.
x = df.iloc[:, :8].to_numpy()
y = df.iloc[:, 8].to_numpy()

**Divide data into Training and Testing sets**

In [261]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

**Feature Scaling**

In [262]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics on the training split only, then apply the
# same transformation to both splits.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

**Training the Algorithm**

In [263]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 500-tree random forest and predict the held-out rows.
model = RandomForestClassifier(n_estimators=500, random_state=40).fit(x_train, y_train)
y_pred = model.predict(x_test)

**Evaluating the Algorithm**

In [264]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Report the confusion matrix, per-class metrics and overall accuracy.
print('Confusion Matrix: ')
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))
print(accuracy_score(y_true=y_test, y_pred=y_pred))

In [265]:
# Record the random forest score for the final comparison chart.
accuracy.extend([accuracy_score(y_test, y_pred)])

# **Logistic Regression**

**Import Packages, Functions, and Classes**

In [266]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

**Importing Dataset**

In [267]:
# Reload the raw dataset for the logistic regression section.
df = pd.read_csv(filepath_or_buffer='../input/diabetes-dataset-trial/diabetes.csv')

**Preparing Data for Training**

In [268]:
# First eight columns are used as features, the ninth column as the target.
x = df.iloc[:, :8].to_numpy()
y = df.iloc[:, 8].to_numpy()

**Divide data into Training and Testing sets**

In [269]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

**Training the Algorithm**

In [270]:
# Fit logistic regression with the liblinear solver and predict the held-out rows.
model = LogisticRegression(solver='liblinear', random_state=42).fit(x_train, y_train)
y_pred = model.predict(x_test)

**Evaluating Algorithm**

In [271]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Report the confusion matrix, per-class metrics and overall accuracy.
print('Confusion Matrix: ')
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))
print(accuracy_score(y_true=y_test, y_pred=y_pred))

In [272]:
# Record the logistic regression score for the final comparison chart.
accuracy.extend([accuracy_score(y_test, y_pred)])

# **SVM**

In [273]:
# Reload the raw dataset for the SVM section.
df = pd.read_csv(filepath_or_buffer='../input/diabetes-dataset-trial/diabetes.csv')

In [274]:
# First eight columns are used as features, the ninth column as the target.
x = df.iloc[:, :8].to_numpy()
y = df.iloc[:, 8].to_numpy()

In [275]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [276]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# SVMs are not scale invariant, and the raw features here have not been
# standardised in this section. Putting the scaler in a pipeline standardises
# the inputs and guarantees the scaling statistics are learned from the
# training split only, then reused unchanged at prediction time.
model = make_pipeline(StandardScaler(), SVC(kernel='rbf', random_state=1))
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [277]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Report the confusion matrix, per-class metrics and overall accuracy.
print('Confusion Matrix: ')
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))
print(accuracy_score(y_true=y_test, y_pred=y_pred))

In [278]:
# Record the SVM score for the final comparison chart.
accuracy.extend([accuracy_score(y_test, y_pred)])

# **Decision Trees**

In [279]:
# Reload the raw dataset for the decision tree section.
df = pd.read_csv(filepath_or_buffer='../input/diabetes-dataset-trial/diabetes.csv')

In [280]:
# First eight columns are used as features, the ninth column as the target.
x = df.iloc[:, :8].to_numpy()
y = df.iloc[:, 8].to_numpy()

In [281]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [282]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so the reported accuracy is reproducible: scikit-learn
# permutes features at each split, so an unseeded tree can differ from run to
# run. The other models in this notebook are already seeded.
model = DecisionTreeClassifier(random_state=0)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [283]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Report the confusion matrix, per-class metrics and overall accuracy.
print('Confusion Matrix: ')
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))
print(accuracy_score(y_true=y_test, y_pred=y_pred))

In [284]:
# Record the decision tree score for the final comparison chart.
accuracy.extend([accuracy_score(y_test, y_pred)])

# **Naive Bayes Classifier**

In [285]:
# Reload the raw dataset for the naive Bayes section.
df = pd.read_csv(filepath_or_buffer='../input/diabetes-dataset-trial/diabetes.csv')

In [286]:
# First eight columns are used as features, the ninth column as the target.
x = df.iloc[:, :8].to_numpy()
y = df.iloc[:, 8].to_numpy()

In [287]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [288]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes classifier and predict the held-out rows.
model = GaussianNB().fit(x_train, y_train)
y_pred = model.predict(x_test)

In [289]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Report the confusion matrix, per-class metrics and overall accuracy.
print('Confusion Matrix: ')
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))
print(accuracy_score(y_true=y_test, y_pred=y_pred))

In [290]:
# Record the naive Bayes score for the final comparison chart.
accuracy.extend([accuracy_score(y_test, y_pred)])

# **Visualizing the Scores**

In [291]:
import matplotlib.pyplot as plt

In [293]:
# Bar labels, listed in the order the models were trained above.
Algorithm = list(('RF', 'LR', 'SVM', 'DT', 'NBC'))

In [295]:
# Bar chart comparing the test-set accuracy of each classifier.
# `accuracy` grows by one entry every time a model section is run, so
# re-running those cells without resetting the list leaves more scores than
# labels; fail with a clear message instead of an opaque shape error.
if len(accuracy) != len(Algorithm):
    raise ValueError(
        f"Expected {len(Algorithm)} accuracy scores (one per algorithm), "
        f"got {len(accuracy)}; reset `accuracy` and re-run the model sections."
    )

fig, ax = plt.subplots()
ax.bar(Algorithm, accuracy)
ax.set_xlabel('Algorithm')
ax.set_ylabel('Accuracy')
ax.set_title('Test accuracy by algorithm')
ax.set_ylim(0, 1)  # accuracy is a fraction, so show the full 0-1 range
plt.show()