# Collaborative Filtering


In [1]:
# Fetch the dataset archive with the Kaggle CLI; the recorded output shows a .zip being saved.
# NOTE(review): presumably needs the kaggle CLI installed and API credentials configured — confirm for your environment.
!kaggle datasets download valakhorasani/mobile-device-usage-and-user-behavior-dataset

Dataset URL: https://www.kaggle.com/datasets/valakhorasani/mobile-device-usage-and-user-behavior-dataset
License(s): apache-2.0
Downloading mobile-device-usage-and-user-behavior-dataset.zip to /content
  0% 0.00/11.3k [00:00<?, ?B/s]
100% 11.3k/11.3k [00:00<00:00, 15.3MB/s]


In [2]:
!unzip mobile-device-usage-and-user-behavior-dataset.zip


Archive:  mobile-device-usage-and-user-behavior-dataset.zip
  inflating: user_behavior_dataset.csv  


In [10]:
!pip install surprise -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m122.9/154.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone


In [17]:
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import KNNBasic
from surprise import accuracy
from sklearn.preprocessing import LabelEncoder, MinMaxScaler


In [32]:
# Load the extracted CSV into a DataFrame and preview the first rows.
# The path is relative so it resolves against the directory the archive was
# unzipped into, rather than being tied to Colab's /content folder.
data = pd.read_csv('user_behavior_dataset.csv')
data.head(5)

Unnamed: 0,User ID,Device Model,Operating System,App Usage Time (min/day),Screen On Time (hours/day),Battery Drain (mAh/day),Number of Apps Installed,Data Usage (MB/day),Age,Gender,User Behavior Class
0,1,Google Pixel 5,Android,393,6.4,1872,67,1122,40,Male,4
1,2,OnePlus 9,Android,268,4.7,1331,42,944,47,Female,3
2,3,Xiaomi Mi 11,Android,154,4.0,761,32,322,42,Male,2
3,4,Google Pixel 5,Android,239,4.8,1676,56,871,20,Male,3
4,5,iPhone 12,iOS,187,4.3,1367,58,988,31,Female,3


In [20]:
# Partition the columns by dtype up front: both selections must be taken
# before encoding, because encoded text columns become integers afterwards.
numerical_cols = data.select_dtypes(include=['number']).columns
categorical_cols = data.select_dtypes(include=['object']).columns

# Integer-encode every text column in place, keeping one fitted encoder per
# column so the codes can be mapped back to the original labels later.
label_encoders = {name: LabelEncoder() for name in categorical_cols}
for name, encoder in label_encoders.items():
    data[name] = encoder.fit_transform(data[name])

# Rescale every originally-numeric column to the [0, 1] range in place.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data[numerical_cols])
data[numerical_cols] = scaled_values

print(data.head())


    User ID  Device Model  Operating System  App Usage Time (min/day)  \
0  0.000000             0                 0                  0.639085   
1  0.001431             1                 0                  0.419014   
2  0.002861             3                 0                  0.218310   
3  0.004292             0                 0                  0.367958   
4  0.005722             4                 1                  0.276408   

   Screen On Time (hours/day)  Battery Drain (mAh/day)  \
0                    0.490909                 0.583426   
1                    0.336364                 0.382386   
2                    0.272727                 0.170569   
3                    0.345455                 0.510591   
4                    0.300000                 0.395764   

   Number of Apps Installed  Data Usage (MB/day)       Age  Gender  \
0                  0.640449             0.425887  0.536585       1   
1                  0.359551             0.351566  0.707317       0   
2 

In [7]:
# Display the column labels of `data` to confirm which fields are available.
data.columns

Index(['User ID', 'Device Model', 'Operating System',
       'App Usage Time (min/day)', 'Screen On Time (hours/day)',
       'Battery Drain (mAh/day)', 'Number of Apps Installed',
       'Data Usage (MB/day)', 'Age', 'Gender', 'User Behavior Class'],
      dtype='object')

In [33]:
# `df` was never defined in this notebook, so this cell raised NameError on a
# fresh kernel. Load the raw CSV here: the ratings need the original user ids,
# device names and usage minutes, not the label-encoded / min-max-scaled copy
# that `data` holds after the preprocessing cell.
df = pd.read_csv('user_behavior_dataset.csv')

# Treat daily app-usage minutes as the "rating" a user gives a device model;
# the rating scale is the observed min/max of that column.
rating_col = "App Usage Time (min/day)"
reader = Reader(rating_scale=(df[rating_col].min(), df[rating_col].max()))

# NOTE: from here on `data` is the Surprise dataset (user, item, rating),
# no longer the pandas DataFrame; the cells below rely on that name.
data = Dataset.load_from_df(df[['User ID', 'Device Model', rating_col]], reader)


In [34]:
# Hold out 20% of the ratings for evaluation. The fixed seed makes the split,
# and therefore the reported RMSE, repeatable from run to run.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# 🔹 Item-Based Collaborative Filtering Model
# user_based=False makes Surprise compute similarities between items (here:
# device models) rather than between users, which is what the device-model
# neighbor lookup further down relies on.
sim_options = {
    "name": "pearson",
    "user_based": False,  # item-item similarity, not user-user
}
model = KNNBasic(sim_options=sim_options)
model.fit(trainset)


Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7d3726e0ef50>

In [35]:
# Predict a rating for every (user, item) pair in the held-out set.
predictions = model.test(testset)

# 🔹 Evaluate Performance
# accuracy.rmse prints the score itself by default; verbose=False leaves the
# print below as the only report, so the RMSE is no longer shown twice.
rmse = accuracy.rmse(predictions, verbose=False)
print(f"RMSE: {rmse}")

RMSE: 178.8986
RMSE: 178.8986095530091


In [36]:
def get_similar_devices(device_model, model, trainset, k=5):
    """Return the raw ids of the device models most similar to ``device_model``.

    Args:
        device_model: Raw item id of the device, as it appears in the ratings data.
        model: Fitted algorithm exposing ``get_neighbors(inner_id, k=...)``.
        trainset: Trainset the model was fitted on; used to translate between
            raw and inner item ids.
        k: Maximum number of neighbors to request. Defaults to 5, the value
            that was previously hard-coded.

    Returns:
        A list of raw item ids of the nearest neighbors, or the string
        ``"Device Model not found in training data."`` when ``device_model``
        is not known to ``trainset``.

    Raises:
        Any exception raised by ``model.get_neighbors`` or
        ``trainset.to_raw_iid`` is propagated instead of being reported as a
        missing device.
    """
    try:
        model_inner_id = trainset.to_inner_iid(device_model)
    except ValueError:
        # Surprise signals an unknown raw id with ValueError. Only that case is
        # a genuine "not found"; a bare except would also hide real failures
        # (unfitted model, typos, KeyboardInterrupt) behind the same message.
        return "Device Model not found in training data."

    neighbors = model.get_neighbors(model_inner_id, k=k)
    return [trainset.to_raw_iid(inner_id) for inner_id in neighbors]

# Example lookup: list the device models closest to one known device.
device_model = "iPhone 12"
similar_devices = get_similar_devices(device_model, model, trainset)
print(f"Recommended Device Models for {device_model}:", similar_devices)

Recommended Device Models for iPhone 12: ['Xiaomi Mi 11', 'Samsung Galaxy S21', 'Google Pixel 5', 'OnePlus 9']
