In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt



import warnings
# Install a global "ignore" filter: every warning raised after this cell runs
# is silenced for the rest of the session.
# NOTE(review): this blanket filter also hides library deprecation/future
# warnings (e.g. changed defaults) — consider narrowing it to the specific
# categories that are known to be noise.
warnings.filterwarnings('ignore')

In [2]:
# Read the user-level dataset (path is relative to the notebook's working
# directory) and render it as the cell output.
df = pd.read_csv(filepath_or_buffer="users_data.csv")
df

Unnamed: 0,user_id,loan_score,device_rating,data_quality,ltv_rate,bureau_score,total_tenure,months_active,savings_score,tx_score,usage_score,airtime_score,cluster
0,146245,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,
1,248802,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,
2,13873,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,
3,35230,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,
4,653322,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,484287,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,
1595,395421,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,
1596,341145,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,
1597,587504,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,


In [3]:
# Names of the numeric columns that describe a user; one entry per line so
# additions/removals show up as single-line diffs.
features = [
    'loan_score',
    'device_rating',
    'data_quality',
    'ltv_rate',
    'bureau_score',
    'total_tenure',
    'months_active',
    'savings_score',
    'tx_score',
    'usage_score',
    'airtime_score',
]

In [4]:
# Summarise column dtypes and non-null counts. The `cluster` column is still
# empty at this point; it is filled in later, once K-Means has been fitted.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   user_id        1599 non-null   int64  
 1   loan_score     1599 non-null   float64
 2   device_rating  1599 non-null   float64
 3   data_quality   1599 non-null   float64
 4   ltv_rate       1599 non-null   float64
 5   bureau_score   1599 non-null   float64
 6   total_tenure   1599 non-null   float64
 7   months_active  1599 non-null   float64
 8   savings_score  1599 non-null   float64
 9   tx_score       1599 non-null   float64
 10  usage_score    1599 non-null   float64
 11  airtime_score  1599 non-null   float64
 12  cluster        0 non-null      float64
dtypes: float64(12), int64(1)
memory usage: 162.5 KB


In [5]:
# Count the number of distinct values in each column.
df.nunique()

user_id          1599
loan_score         96
device_rating     143
data_quality       80
ltv_rate           91
bureau_score      153
total_tenure       60
months_active     144
savings_score     436
tx_score           89
usage_score        96
airtime_score      65
cluster             0
dtype: int64

In [6]:
# Descriptive statistics, transposed so that each column of `df` becomes a row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_id,1599.0,397669.085679,228348.459331,1449.0,196569.5,398366.0,592184.0,798882.0
loan_score,1599.0,8.319637,1.741096,4.6,7.1,7.9,9.2,15.9
device_rating,1599.0,0.527821,0.17906,0.12,0.39,0.52,0.64,1.58
data_quality,1599.0,0.270976,0.194801,0.0,0.09,0.26,0.42,1.0
ltv_rate,1599.0,2.538806,1.409928,0.9,1.9,2.2,2.6,15.5
bureau_score,1599.0,0.087467,0.047065,0.012,0.07,0.079,0.09,0.611
total_tenure,1599.0,15.874922,10.460157,1.0,7.0,14.0,21.0,72.0
months_active,1599.0,46.467792,32.895324,6.0,22.0,38.0,62.0,289.0
savings_score,1599.0,0.996747,0.001887,0.99007,0.9956,0.99675,0.997835,1.00369
tx_score,1599.0,3.311113,0.154386,2.74,3.21,3.31,3.4,4.01


In [7]:
# Build the model input by selecting the columns named in `features` rather
# than dropping the known non-feature columns (`user_id`, `cluster`):
#   * any extra column that appears in the CSV can no longer leak into the
#     clustering,
#   * a missing feature fails immediately with a KeyError,
#   * the column order is guaranteed to match `features`, which is later used
#     to label the cluster centroids.
X = df[features]

In [8]:
# Standardize the features (zero mean, unit variance) so that no single
# feature dominates the distance computation because of its scale.
# The scaler is fitted first and then applied to the same data.
scaler = StandardScaler().fit(X)
scaled_X = scaler.transform(X)

In [9]:
# Partition the users into 6 clusters with K-Means on the standardized features.
# `random_state` fixes the centroid initialisation so re-runs are repeatable.
# `n_init` is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so leaving it implicit makes the resulting clusters
# depend on the installed library version (and the accompanying FutureWarning
# is hidden by the global warnings filter).
# NOTE(review): the choice of 6 clusters is not justified in this notebook —
# consider comparing several values of k (e.g. via silhouette score).
kmeans = KMeans(n_clusters=6, n_init=10, random_state=42)
# Store each user's cluster label back on the original DataFrame.
df['cluster'] = kmeans.fit_predict(scaled_X)

In [11]:
# Evaluate cluster quality with the silhouette score, which measures how well
# separated the clusters are (it ranges from -1 to 1; higher means better
# separated). It is computed on the same standardized matrix K-Means was
# fitted on.
silh = silhouette_score(
    X=scaled_X,
    labels=df['cluster'],
)
print(f"The Silhousette score of the cluster is : {silh}")

The Silhousette score of the cluster is : 0.18081259098537378


In [13]:
# Cluster centroids, one row per cluster and one column per feature.
# Column labels are taken from `X` (the frame that was scaled and clustered)
# rather than from a separately maintained list, so the labels can never drift
# out of sync with the columns the model was actually fitted on.
# Because K-Means was fitted on the standardized matrix, the centroid values
# are in standardized units (0 = overall feature mean), not raw feature units.
cluster_centers = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
print("Cluster Centers:")
print(cluster_centers)

Cluster Centers:
   loan_score  device_rating  data_quality  ltv_rate  bureau_score  \
0   -0.415127       0.620885     -0.756322 -0.225428     -0.044458   
1    1.578385      -0.614113      1.208793  0.249832      0.032147   
2   -0.107715       0.057658      0.071789  0.424316      0.006072   
3    0.095416       0.002200      1.181553 -0.389872      5.784760   
4   -1.113904       0.429280     -0.944750 -0.243316     -0.417811   
5    0.040595      -1.037823      0.706747 -0.175414     -0.294195   

   total_tenure  months_active  savings_score  tx_score  usage_score  \
0     -0.457744      -0.374704       0.005516  0.298760    -0.394490   
1     -0.500180      -0.454572       1.082951 -0.947047     0.334160   
2      1.126384       1.352654       0.328477 -0.115392    -0.177430   
3     -0.049516       0.510330       0.180072 -1.735792     3.663412   
4      0.300836      -0.152247      -1.336730  1.226872    -0.136761   
5     -0.249914      -0.506414      -0.725969 -0.131331     

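Each row of the `cluster_centers` DataFrame above is the centroid of one of the 6 clusters, i.e. the "average" user profile of that cluster across the features. Because K-Means was fitted on standardized features, every value is expressed in standard deviations from the overall feature mean: 0 means average, positive values are above average, and negative values are below average.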

In [14]:
# Display the original DataFrame again; the `cluster` column now holds the
# K-Means label assigned to each user.
df

Unnamed: 0,user_id,loan_score,device_rating,data_quality,ltv_rate,bureau_score,total_tenure,months_active,savings_score,tx_score,usage_score,airtime_score,cluster
0,146245,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0
1,248802,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,0
2,13873,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,0
3,35230,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,1
4,653322,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,484287,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,4
1595,395421,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,4
1596,341145,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,4
1597,587504,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,4


Here is an explanation of the 6 clusters we have in our dataframe:

1. Cluster 0: Poor loan score and data quality, but high device rating and good transaction scores.
2. Cluster 1: High loan score and data quality, but poor device rating and low transactional engagement.
3. Cluster 2: Roughly average loan scores and long tenure, but below-average usage and airtime.
4. Cluster 3: High bureau and usage scores, but risky because of poor transaction and LTV performance.
5. Cluster 4: Financially weak (low loan and savings scores), but with high airtime engagement.
6. Cluster 5: Poor device and savings scores, but high airtime usage and high engagement.

Each cluster captures a distinct user profile, defined by its financial-activity and engagement metrics.

In [17]:
# For every cluster label, gather each column's values into a Python list
# (one row per cluster, one list per column).
common_cluster = (
    df
    .groupby(by="cluster")
    .agg(func=list)
)


In [30]:
# Display the per-cluster lists built by the groupby/agg cell above.
common_cluster

Unnamed: 0_level_0,user_id,loan_score,device_rating,data_quality,ltv_rate,bureau_score,total_tenure,months_active,savings_score,tx_score,usage_score,airtime_score
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,"[146245, 248802, 13873, 653322, 61550, 399240,...","[7.4, 7.8, 7.8, 7.4, 7.4, 7.9, 7.3, 7.8, 6.7, ...","[0.7, 0.88, 0.76, 0.7, 0.66, 0.6, 0.65, 0.58, ...","[0.0, 0.0, 0.04, 0.0, 0.0, 0.06, 0.0, 0.02, 0....","[1.9, 2.6, 2.3, 1.9, 1.8, 1.6, 1.2, 2.0, 1.8, ...","[0.076, 0.098, 0.092, 0.076, 0.075, 0.069, 0.0...","[11.0, 25.0, 15.0, 11.0, 13.0, 15.0, 15.0, 9.0...","[34.0, 67.0, 54.0, 34.0, 40.0, 59.0, 21.0, 18....","[0.9978, 0.9968, 0.997, 0.9978, 0.9978, 0.9964...","[3.51, 3.2, 3.26, 3.51, 3.51, 3.3, 3.39, 3.36,...","[0.56, 0.68, 0.65, 0.56, 0.56, 0.46, 0.47, 0.5...","[9.4, 9.8, 9.8, 9.4, 9.4, 9.4, 10.0, 9.5, 9.2,..."
1,"[35230, 639023, 252019, 720793, 658362, 781226...","[11.2, 7.8, 8.7, 10.2, 8.8, 8.8, 10.1, 9.4, 11...","[0.28, 0.61, 0.29, 0.42, 0.41, 0.41, 0.31, 0.4...","[0.56, 0.29, 0.52, 0.57, 0.64, 0.64, 0.44, 0.3...","[1.9, 1.6, 1.6, 3.4, 2.2, 2.2, 2.3, 2.2, 2.0, ...","[0.075, 0.114, 0.113, 0.07, 0.093, 0.093, 0.08...","[17.0, 9.0, 12.0, 4.0, 9.0, 9.0, 22.0, 13.0, 1...","[60.0, 29.0, 37.0, 10.0, 42.0, 42.0, 46.0, 62....","[0.998, 0.9974, 0.9969, 0.9971, 0.9986, 0.9986...","[3.16, 3.26, 3.25, 3.04, 3.54, 3.54, 3.32, 3.0...","[0.58, 1.56, 0.58, 0.63, 0.66, 0.66, 0.67, 0.6...","[9.8, 9.1, 9.5, 9.6, 10.5, 10.5, 9.7, 10.5, 10..."
2,"[59826, 473056, 25411, 314003, 428118, 432850,...","[7.5, 7.5, 8.9, 8.9, 8.5, 8.9, 7.6, 8.3, 6.9, ...","[0.5, 0.5, 0.62, 0.62, 0.28, 0.22, 0.39, 0.655...","[0.36, 0.36, 0.18, 0.19, 0.56, 0.48, 0.31, 0.1...","[6.1, 6.1, 3.8, 3.9, 1.8, 1.8, 2.3, 2.3, 10.7,...","[0.071, 0.071, 0.176, 0.17, 0.092, 0.077, 0.08...","[17.0, 17.0, 52.0, 51.0, 35.0, 29.0, 23.0, 15....","[102.0, 102.0, 145.0, 148.0, 103.0, 60.0, 71.0...","[0.9978, 0.9978, 0.9986, 0.9986, 0.9969, 0.996...","[3.35, 3.35, 3.16, 3.17, 3.3, 3.39, 3.52, 3.17...","[0.8, 0.8, 0.88, 0.93, 0.75, 0.53, 0.65, 0.66,...","[10.5, 10.5, 9.2, 9.2, 10.5, 9.4, 9.7, 9.8, 9...."
3,"[43332, 195504, 774368, 80480, 45869, 606217, ...","[8.1, 7.9, 7.5, 7.8, 7.3, 8.6, 8.6, 8.6, 7.8, ...","[0.56, 0.32, 0.49, 0.43, 0.67, 0.49, 0.49, 0.4...","[0.28, 0.51, 0.2, 0.7, 0.26, 0.28, 0.28, 0.29,...","[1.7, 1.8, 2.6, 1.9, 1.8, 1.9, 1.9, 2.0, 1.7, ...","[0.368, 0.341, 0.332, 0.464, 0.401, 0.11, 0.11...","[16.0, 17.0, 8.0, 22.0, 16.0, 20.0, 20.0, 19.0...","[56.0, 56.0, 14.0, 67.0, 51.0, 136.0, 136.0, 1...","[0.9968, 0.9969, 0.9968, 0.9974, 0.9969, 0.997...","[3.11, 3.04, 3.21, 3.13, 3.16, 2.93, 2.93, 2.9...","[1.28, 1.08, 0.9, 1.28, 1.14, 1.95, 1.95, 1.98...","[9.3, 9.2, 10.5, 9.4, 9.4, 9.9, 9.9, 9.8, 9.3,..."
4,"[670733, 742835, 112605, 382320, 131484, 68553...","[5.6, 4.6, 5.0, 4.7, 5.6, 5.6, 5.2, 5.2, 5.8, ...","[0.615, 0.52, 1.02, 0.6, 0.5, 0.5, 0.34, 0.34,...","[0.0, 0.15, 0.04, 0.17, 0.09, 0.09, 0.0, 0.0, ...","[1.6, 2.1, 1.4, 2.3, 2.3, 2.3, 1.8, 1.8, 1.8, ...","[0.089, 0.054, 0.045, 0.058, 0.049, 0.049, 0.0...","[16.0, 8.0, 41.0, 17.0, 17.0, 17.0, 27.0, 27.0...","[59.0, 65.0, 85.0, 106.0, 99.0, 99.0, 63.0, 63...","[0.9943, 0.9934, 0.9938, 0.9932, 0.9937, 0.993...","[3.58, 3.9, 3.75, 3.85, 3.63, 3.63, 3.68, 3.68...","[0.52, 0.56, 0.48, 0.6, 0.63, 0.63, 0.79, 0.79...","[9.9, 13.1, 10.5, 12.9, 13.0, 13.0, 14.0, 14.0..."
5,"[747176, 700890, 655335, 507808, 15636, 431598...","[8.1, 9.3, 6.3, 8.2, 7.3, 9.6, 9.7, 7.9, 7.1, ...","[0.38, 0.32, 0.3, 0.4, 0.33, 0.32, 0.53, 0.35,...","[0.28, 0.57, 0.48, 0.44, 0.47, 0.47, 0.6, 0.46...","[2.1, 2.0, 1.8, 2.8, 2.1, 1.4, 2.0, 3.6, 2.5, ...","[0.066, 0.074, 0.069, 0.089, 0.077, 0.056, 0.0...","[13.0, 27.0, 18.0, 11.0, 5.0, 9.0, 5.0, 15.0, ...","[30.0, 65.0, 61.0, 43.0, 11.0, 24.0, 19.0, 37....","[0.9968, 0.9969, 0.9959, 0.9975, 0.9958, 0.996...","[3.23, 3.28, 3.44, 3.53, 3.33, 3.22, 3.3, 3.35...","[0.73, 0.79, 0.78, 0.61, 0.53, 0.82, 0.86, 0.8...","[9.7, 10.7, 10.3, 10.5, 10.3, 10.3, 12.4, 12.8..."
