In [3]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
sns.set(style='white', palette='muted',color_codes=True)
import matplotlib.pyplot as plt 
plt.style.use('ggplot')
import plotly.express as px

In [4]:
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler,LabelEncoder,OrdinalEncoder,OneHotEncoder,MinMaxScaler 
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression,LinearRegression 
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor 
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor 
from sklearn.impute import SimpleImputer 
from sklearn.metrics import r2_score, accuracy_score, mean_squared_error, confusion_matrix,recall_score,silhouette_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [5]:
# Path of the CSV file to load.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
hcv_csv_path = r'c:\Github\Fullstack-Data-Analyst\Learning\the_data\data-lab-5-hcv.csv'
data = pd.read_csv(hcv_csv_path)

# Preview only the first row; data.info() would list dtypes and non-null counts.
data.head(1)

Unnamed: 0.1,Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,1,0=Blood Donor,32,m,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0


In [6]:
# Data-quality check: number of fully duplicated rows and the percentage of
# missing values per column.
# The duplicate count used to be computed and then silently discarded, because a
# notebook cell only displays its last expression; it is printed explicitly now.
# NOTE(review): 'Unnamed: 0' is still part of the frame at this point; if it is a
# unique row id, no row can ever be flagged as a duplicate — confirm.
dup_count = data.duplicated().sum()
print(f"Duplicate rows: {dup_count}")

# Share of missing values per column (in percent), highest first.
mis_col = data.isna().sum() * 100 / len(data)
mis_col.sort_values(ascending=False)

ALP           2.926829
CHOL          1.626016
ALB           0.162602
ALT           0.162602
PROT          0.162602
Unnamed: 0    0.000000
Category      0.000000
Age           0.000000
Sex           0.000000
AST           0.000000
BIL           0.000000
CHE           0.000000
CREA          0.000000
GGT           0.000000
dtype: float64

In [7]:
# Fill missing values in the listed numeric columns with each column's mean.
# `np.nan` is used instead of the `np.NaN` alias, which was removed in NumPy 2.0
# and raises AttributeError there.
impu = SimpleImputer(strategy='mean', missing_values=np.nan)

# Columns that the missing-value report above shows as containing NaNs.
miss_val = ['ALB', 'ALP', 'ALT', 'CHOL', 'PROT']

# fit_transform returns an array with the same column order as `miss_val`,
# so it can be assigned straight back onto those columns.
data[miss_val] = impu.fit_transform(data[miss_val])

In [8]:
# Re-run the data-quality check after imputation: duplicated-row count plus the
# percentage of missing values per column.
# The duplicate count used to be computed and then silently discarded, because a
# notebook cell only displays its last expression; it is printed explicitly now.
# NOTE(review): 'Unnamed: 0' is still part of the frame at this point; if it is a
# unique row id, no row can ever be flagged as a duplicate — confirm.
dup_count = data.duplicated().sum()
print(f"Duplicate rows: {dup_count}")

# Share of missing values per column (in percent), highest first.
mis_col = data.isna().sum() * 100 / len(data)
mis_col.sort_values(ascending=False)

Unnamed: 0    0.0
Category      0.0
Age           0.0
Sex           0.0
ALB           0.0
ALP           0.0
ALT           0.0
AST           0.0
BIL           0.0
CHE           0.0
CHOL          0.0
CREA          0.0
GGT           0.0
PROT          0.0
dtype: float64

In [9]:
# Remove the 'Unnamed: 0' column from the frame.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it once the column is gone is a no-op instead of a
# KeyError. `axis=1` is dropped because `columns=` already selects the axis.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [10]:
# NOTE: this entire cell is disabled — every statement below is commented out.
# It sketches label-encoding the 'Category' and 'Sex' columns with LabelEncoder
# and building a dictionary from the original Category values to their encoded
# values. Inside the sketch the actual `apply(le.fit_transform)` line is itself
# commented out, so as written `le_val_cate` would equal `orig_val_cate` and the
# dictionary would map each category to itself.
# orig_val_cate = data.Category

# cat_val = ['Category', 'Sex']


# le = LabelEncoder()
# # data[cat_val] = data[cat_val].apply(le.fit_transform)

# le_val_cate = data.Category

# orig_dic_cate = np.unique(orig_val_cate)
# le_dic_cate = np.unique(le_val_cate)

# cate_dictionary = dict(zip(orig_dic_cate,le_dic_cate))
# cate_dictionary

In [11]:
# Scatter-matrix of the CHE and CHOL columns, with points coloured by Category.
scatter_dims = ['CHE', 'CHOL']
fig = px.scatter_matrix(data, color='Category', dimensions=scatter_dims)
fig.show()

# Build a K-means Model

In [12]:
# Set the diagnosis labels aside as integer codes (presumably for evaluating the
# clusters later — they are not used as model input).
y = data["Category"]
le = LabelEncoder()
true_labels = le.fit_transform(y)

# Build the numeric feature matrix used for clustering.
# 'Category' is dropped first: leaving it in would one-hot encode the target into
# the features, so the clustering would be fit on the very labels stored in
# `true_labels` (label leakage). The remaining non-numeric columns (e.g. 'Sex')
# are one-hot encoded as floats.
# NOTE: `data` is reassigned here, so this cell must run exactly once after the
# load/cleaning cells; a second run fails because 'Category' is already gone.
data = pd.get_dummies(data.drop(columns=["Category"]), dtype=float)


In [13]:
# Use one cluster per distinct label known to the encoder.
class_names = le.classes_
n_clusters = len(class_names)

# Preprocessing: min-max scale the features, then reduce them to two principal
# components (seeded for reproducibility).
scale_step = ('scaler', MinMaxScaler())
pca_step = ('pca', PCA(n_components=2, random_state=42))
preprocessor = Pipeline(steps=[scale_step, pca_step])

In [14]:
# K-means estimator configured with k-means++ initialisation, n_init=50,
# max_iter=500 and a fixed seed.
kmeans_step = KMeans(
    n_clusters=n_clusters,
    init='k-means++',
    n_init=50,
    max_iter=500,
    random_state=42,
)
clusterer = Pipeline(steps=[('kmeans', kmeans_step)])

# End-to-end workflow: preprocessing followed by clustering.
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clusterer', clusterer),
])

# Fit the whole pipeline on the feature frame.
pipe.fit(data)