## import library

In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'matplotlib'

## read main csv file

In [None]:
# Load the raw dataset from the CSV file in the working directory.
df = pd.read_csv('diabetic_data.csv')

# NOTE(review): drop() returns a new frame; the result below is only shown as
# the cell output and is never assigned, so `df` itself still contains the
# 'patient_nbr' column after this cell runs.
df.drop(columns=['patient_nbr'])

## Deal with missing data
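 Missing values are handled first because they can affect the accuracy of subsequent preprocessing steps. You can choose to either remove the rows with missing data or fill them in using methods such as mean, median, or mode imputation; this notebook fills numerical columns with the mean and string columns with the mode.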

In [None]:
# Placeholder strings that stand for "no value"; convert them all to NaN in a
# single pass so the imputation below sees them as missing.
df = df.replace(['?', 'None', '', ' '], np.nan)

# Replace missing values in numerical columns with the column mean.
# The column is assigned back explicitly: calling fillna(..., inplace=True) on
# `df[col]` is chained assignment, which is deprecated in pandas 2.x and does
# not update `df` at all when Copy-on-Write is enabled.
numerical_cols = df.select_dtypes(include=['float', 'int']).columns
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].mean())

# Replace missing values in string columns with the most frequent value (mode).
string_cols = df.select_dtypes(include=['object']).columns
for col in string_cols:
    modes = df[col].mode()
    if modes.empty:
        # Every value in this column is missing, so there is no mode to fill
        # with; leave it as-is and let the count printed below expose it.
        continue
    df[col] = df[col].fillna(modes.iloc[0])

# Check for missing values after imputation (all counts should be 0 unless a
# column was entirely missing).
missing_values_count = df.isna().sum()
print(missing_values_count)

## Encode categorical variables
 If your dataset contains categorical variables, you should encode them as numerical values before performing any further preprocessing. This is because most machine learning algorithms require numerical inputs.

In [None]:
# Columns that still hold strings are the ones that need numeric codes.
str_cols = df.select_dtypes(include=['object']).columns

# One encoder object is refit for every column; the label -> code table is
# captured right after each fit, before the next column overwrites it.
le = LabelEncoder()
mapping_dict = {}

for col in str_cols:
    le.fit(df[col])
    df[col] = le.transform(df[col])
    codes = le.transform(le.classes_)
    mapping_dict[col] = {label: code for label, code in zip(le.classes_, codes)}

# Show the encoded dataframe and the per-column mapping tables.
display(df)
print(mapping_dict)

## Correlation Coefficient
 The correlation coefficient measures the strength and direction of the linear relationship between two variables. The code computes each column's correlation with the "readmitted" column and keeps the columns whose absolute correlation exceeds a threshold (0.05 here); these columns are then used for further analysis and modeling.

In [None]:
# Compute the pairwise correlation matrix once and reuse it for both the
# column selection and the heatmap (it was previously computed twice).
corr_matrix = df.corr()
corr = corr_matrix["readmitted"]

# Plot the full correlation matrix as a heatmap.
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')

# Keep columns whose absolute correlation with "readmitted" exceeds the
# threshold. "readmitted" correlates perfectly with itself, so it is always
# part of the selection.
threshold = 0.05
important_columns = corr[corr.abs() > threshold].index.tolist()

# Use the important columns for further analysis; keep an independent deep copy
# so later cells can work on it without touching `df_important`.
df_important = df[important_columns]
df_important_copy = df_important.copy(deep=True)
display(df_important.head())

columns = df_important.columns.tolist()
print(columns)

## Using the Elbow Method

In [None]:
# Within-cluster sum of squares (KMeans inertia) for each candidate k.
wcss_list = []
cluster_counts = range(1, 11)

# Fit one model per cluster count from 1 to 10 and record its inertia.
for n_clusters in cluster_counts:
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
    wcss_list.append(kmeans.fit(df_important).inertia_)

# Plot inertia against k; the "elbow" of the curve suggests a cluster count.
plt.plot(cluster_counts, wcss_list)
plt.title('The Elobw Method Graph')
plt.xlabel('Number of clusters(k)')
plt.ylabel('wcss_list')
plt.show()

# Implement k-means, compute the silhouette score, and visualize the clusters

In [None]:
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Project the data onto its first two principal components. The projection is
# used only for the 2D scatter plot below, not for the clustering itself.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(df_important)

# Cluster the original (untransformed) columns into 3 groups. fit_predict fits
# the model and returns the training labels in one step, so the silhouette
# score and the plot are guaranteed to use the same assignment.
# NOTE(review): df_important still contains the 'readmitted' column, so the
# target takes part in the clustering — confirm this is intended.
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=42)
y_pred = kmeans.fit_predict(df_important)

# Silhouette is computed over all rows using pairwise distances, which can be
# slow on a large frame.
silhouette = silhouette_score(df_important, y_pred)
print("Silhouette:", silhouette)

# Plot the cluster assignment in the reduced 2D space.
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_pred)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()

# ______________________________________________________________________

# Implement KNN, compute accuracy, and visualize the results

### Feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Separate the predictors from the target column.
feature_frame = df_important_copy.drop(columns=['readmitted'])

# Standardize every predictor. Column names and the row index are taken from
# the input frame instead of a hard-coded list, so this cell keeps working (and
# keeps the labels correct) whenever the set of selected columns changes.
sc_X = StandardScaler()
X = pd.DataFrame(
    sc_X.fit_transform(feature_frame),
    columns=feature_frame.columns,
    index=feature_frame.index,
)
y = df_important_copy['readmitted']
print(y.shape, X.shape)

### Test Train Split and Cross Validation methods

In [None]:
from sklearn.model_selection import train_test_split

# Hold out one third of the rows for testing; stratify on y so both parts keep
# the same class proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=42,
    stratify=y,
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Accuracy on the training and held-out data for each neighbour count;
# position j in each list corresponds to k = j + 1.
train_scores = []
test_scores = []

for n_neighbors in range(1, 15):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
    train_scores.append(knn.score(X_train, y_train))
    test_scores.append(knn.score(X_test, y_test))

In [None]:
# Best accuracy obtained when scoring on the same data points used for training.
max_train_score = max(train_scores)

# Every list position that ties for the maximum; positions are 0-based, so the
# reported k values are position + 1.
train_scores_ind = [pos for pos, score in enumerate(train_scores) if score == max_train_score]
best_train_k = [pos + 1 for pos in train_scores_ind]
print('Max train score {} % and k = {}'.format(max_train_score * 100, best_train_k))

In [None]:
# Best accuracy obtained on the data points that were set aside solely for testing.
max_test_score = max(test_scores)

# Every list position that ties for the maximum; positions are 0-based, so the
# reported k values are position + 1.
test_scores_ind = [pos for pos, score in enumerate(test_scores) if score == max_test_score]
best_test_k = [pos + 1 for pos in test_scores_ind]
print('Max test score {} % and k = {}'.format(max_test_score * 100, best_test_k))

### Result Visualisation

In [None]:
# Draw train and test accuracy against the neighbour count on one figure.
plt.figure(figsize=(12, 5))
neighbor_counts = range(1, 15)
score_series = (
    (train_scores, '*', 'Train Score'),
    (test_scores, 'o', 'Test Score'),
)
for scores, marker, label in score_series:
    p = sns.lineplot(x=neighbor_counts, y=scores, marker=marker, label=label)