# Here we use clustering to group soums (districts) that have similar demographics, financial behaviours, economic structures, etc.


---
For example, very high-population or high-loan soums can skew models. Clustering lets you analyze them separately in their own segment.


In [3]:
import pandas as pd
# Location of the pre-merged Excel workbook that feeds the analysis.
MERGED_DATA_PATH = "/content/merged_df.xlsx"

# Read the workbook into the DataFrame that the later cells operate on.
merged_df = pd.read_excel(MERGED_DATA_PATH)

In [11]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Clustering hyperparameters, named so they can be tuned in one place.
N_CLUSTERS = 4      # number of segments KMeans is asked to form
RANDOM_STATE = 42   # fixed seed so centroid initialisation is reproducible

# Categorical columns, one-hot encoded before clustering.
# NOTE(review): if 'District' is (nearly) unique per row, its one-hot columns
# mostly add noise dimensions to the distance computation - confirm it belongs
# in the feature set.
categorical_cols = ['City', 'District']

# Numeric columns, standardised before clustering.
numerical_cols = [
    'Population', 'Total_kids', 'Total_working', 'Total_olders',
    'Percentage_kids', 'Percentage_working', 'Percentage_olders',
    'Total_loan_balance', 'Total_loans_accounts', 'Total_balances',
    'Total_current_accounts', 'Total_deposits', 'Total_deposits_accounts',
    'Golomt_branches', 'Companies', 'Normal', 'Delinquent', 'Overdue',
    'Time_deposits', 'Timeless_deposits', 'Total_banks_branches'
]

# Standardise the numeric features (so no column dominates the Euclidean
# distance purely because of its scale) and one-hot encode the categoricals.
# Columns of the input frame not listed in either group are dropped, which is
# the ColumnTransformer default.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
    ]
)

# Preprocessing and KMeans chained so that the same transformation is applied
# whenever the pipeline is fitted or used for prediction.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('kmeans', KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_STATE, n_init='auto'))
])

# Fit and label in a single pass. The original called fit() and then predict()
# on the same frame, which preprocesses and assigns every row twice;
# fit_predict() returns the training labels directly and leaves `pipeline` fitted.
cluster_labels = pipeline.fit_predict(merged_df)

# Quantify how well separated the clusters are in the transformed feature
# space (silhouette ranges from -1 to 1, higher is better). Computed before the
# label column is attached so the frame matches what the pipeline was fitted on.
cluster_features = pipeline.named_steps['preprocessor'].transform(merged_df)
cluster_silhouette = silhouette_score(cluster_features, cluster_labels)
print(f"Silhouette score for k={N_CLUSTERS}: {cluster_silhouette:.3f}")

# Attach the segment id to every row for the inspection cells that follow.
merged_df['cluster'] = cluster_labels

In [12]:
# Show all rows that belong to one cluster (to check whether the clustering makes sense).
INSPECTED_CLUSTER = 1  # id of the cluster whose members are listed below

inspected_cluster_data = merged_df.loc[merged_df['cluster'] == INSPECTED_CLUSTER]

# The original variable was called `cluster_0_data` even though it holds the
# rows of cluster 1. The old name is kept as an alias so that any other cell
# referencing it keeps working.
cluster_0_data = inspected_cluster_data

print(inspected_cluster_data)


           City      District  Companies  Population  Total_kids  \
16   говь-алтай     есөнбулаг        440       18880        5811   
32       завхан      улиастай        613       16115        4844   
58          увс      улаангом        958       33966       11463   
75         ховд     жаргалант       1284       34532       12050   
93     архангай  эрдэнэбулган        481       21649        6721   
114  баянхонгор    баянхонгор        792       31307       11376   
148  өвөрхангай     арвайхээр        698       37361       12681   
168     хөвсгөл         мөрөн        888       43338       13600   
185   дорноговь      сайншанд        764       28492        9408   
186   дорноговь     замын-үүд        265       18359        6023   
211    өмнөговь       ханбогд        365        9533        2709   
215    өмнөговь     цогтцэций        291       10752        4139   
216    өмнөговь   даланзадгад       1324       32674       11481   
220     сэлэнгэ        мандал        464       2

In [13]:
# Number of rows assigned to each cluster, largest first.
cluster_sizes = merged_df['cluster'].value_counts()
print(cluster_sizes)


cluster
0    173
2    115
1     18
3      2
Name: count, dtype: int64
