In [1]:
#Importing libraries 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

In [2]:
#Loading Data 
df=pd.read_csv("../Internship/Project_datasets/userbehaviour.csv")

In [3]:
df.head(10)

Unnamed: 0,userid,Average Screen Time,Average Spent on App (INR),Left Review,Ratings,New Password Request,Last Visited Minutes,Status
0,1001,17.0,634.0,1,9,7,2990,Installed
1,1002,0.0,54.0,0,4,8,24008,Uninstalled
2,1003,37.0,207.0,0,8,5,971,Installed
3,1004,32.0,445.0,1,6,2,799,Installed
4,1005,45.0,427.0,1,5,6,3668,Installed
5,1006,28.0,599.0,0,9,4,2878,Installed
6,1007,49.0,887.0,1,9,6,4481,Installed
7,1008,8.0,31.0,0,2,1,1715,Installed
8,1009,28.0,741.0,1,8,2,801,Installed
9,1010,28.0,524.0,1,8,4,4621,Installed


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   userid                      999 non-null    int64  
 1   Average Screen Time         999 non-null    float64
 2   Average Spent on App (INR)  999 non-null    float64
 3   Left Review                 999 non-null    int64  
 4   Ratings                     999 non-null    int64  
 5   New Password Request        999 non-null    int64  
 6   Last Visited Minutes        999 non-null    int64  
 7   Status                      999 non-null    object 
dtypes: float64(2), int64(5), object(1)
memory usage: 62.6+ KB


# Cleaning of data 

In [5]:
df.duplicated().value_counts()

False    999
dtype: int64

In [6]:
df.isnull().sum()

userid                        0
Average Screen Time           0
Average Spent on App (INR)    0
Left Review                   0
Ratings                       0
New Password Request          0
Last Visited Minutes          0
Status                        0
dtype: int64

# Data Exploring 

In [7]:
df['Status'] = df['Status'].map({'Installed':0,'Uninstalled':1})

In [8]:
df[['Average Screen Time','Status']].value_counts().head(10)

Average Screen Time  Status
19.0                 0         32
33.0                 0         27
24.0                 0         27
30.0                 0         26
18.0                 0         26
34.0                 0         26
8.0                  0         25
31.0                 0         25
14.0                 0         25
35.0                 0         24
dtype: int64

In [9]:
df[['Average Screen Time','Status']].value_counts().tail(10)

Average Screen Time  Status
5.0                  1         11
48.0                 0         11
27.0                 0         10
4.0                  0          8
5.0                  0          7
1.0                  0          7
38.0                 0          4
2.0                  0          4
3.0                  0          3
0.0                  0          3
dtype: int64

In [10]:
df.corr()

Unnamed: 0,userid,Average Screen Time,Average Spent on App (INR),Left Review,Ratings,New Password Request,Last Visited Minutes,Status
userid,1.0,-0.011205,0.007211,0.011871,-0.010577,0.011876,-0.031093,-0.020247
Average Screen Time,-0.011205,1.0,0.44592,-0.019984,0.557624,-0.261706,-0.458216,-0.467834
Average Spent on App (INR),0.007211,0.44592,1.0,-0.022998,0.482451,-0.207935,-0.350524,-0.359042
Left Review,0.011871,-0.019984,-0.022998,1.0,0.003547,0.056722,-0.004401,-0.016627
Ratings,-0.010577,0.557624,0.482451,0.003547,1.0,-0.213145,-0.416936,-0.442731
New Password Request,0.011876,-0.261706,-0.207935,0.056722,-0.213145,1.0,0.464466,0.510563
Last Visited Minutes,-0.031093,-0.458216,-0.350524,-0.004401,-0.416936,0.464466,1.0,0.916728
Status,-0.020247,-0.467834,-0.359042,-0.016627,-0.442731,0.510563,0.916728,1.0


# Building ML Model

In [11]:
X =df[['userid','Average Screen Time','Average Spent on App (INR)','Left Review','Ratings','New Password Request','Last Visited Minutes']]
y=df['Status']

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=22)

In [13]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
from sklearn.neighbors import KNeighborsClassifier

In [15]:
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train_scaled,y_train)
model.score(X_test_scaled,y_test)

0.984

In [16]:
yp = model.predict(X_test_scaled)

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [18]:
acc=accuracy_score(y_test,yp)
pr=precision_score(y_test,yp)
re=recall_score(y_test,yp)
f1=f1_score(y_test,yp)
print(acc,pr,re,f1)

0.984 1.0 0.84 0.9130434782608696


In [19]:
clustering_data = df[["Average Screen Time", "Left Review", 
                        "Ratings", "Last Visited Minutes", 
                       "Average Spent on App (INR)", 
                       "New Password Request"]]

from sklearn.preprocessing import MinMaxScaler
for i in clustering_data.columns:
    MinMaxScaler(i)
   
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3)
clusters = kmeans.fit_predict(clustering_data)
df["Segments"] = clusters

In [20]:
print(df.head(10))

   userid  Average Screen Time  Average Spent on App (INR)  Left Review  \
0    1001                 17.0                       634.0            1   
1    1002                  0.0                        54.0            0   
2    1003                 37.0                       207.0            0   
3    1004                 32.0                       445.0            1   
4    1005                 45.0                       427.0            1   
5    1006                 28.0                       599.0            0   
6    1007                 49.0                       887.0            1   
7    1008                  8.0                        31.0            0   
8    1009                 28.0                       741.0            1   
9    1010                 28.0                       524.0            1   

   Ratings  New Password Request  Last Visited Minutes  Status  Segments  
0        9                     7                  2990       0         0  
1        4              

In [21]:
print(df["Segments"].value_counts())

0    910
1     45
2     44
Name: Segments, dtype: int64


In [22]:
sdf[['userid','Status','Segments']].to_csv('seg_user_table',index=False)