# Data Exploration

## 1. Import libraries

In [1]:
import pandas as pd
import scipy.sparse as sparse
import implicit.als as als

## 2. Read the data

In [3]:
raw_data = pd.read_csv("skills_ordered.csv")
raw_data

Unnamed: 0,uuid,skill_name,level
0,06b36793-381f-4a5f-8544-dd6a02bb4513,OpenSSL,INTERMEDIATE
1,314911bb-28ae-44e0-a70e-a97bc01f0b18,OpenSSL,ADVANCED
2,c5a1f718-8a18-4b90-8d2a-cf22fb192dae,OpenStack,NOVICE
3,06b36793-381f-4a5f-8544-dd6a02bb4513,OpenStack,NOVICE
4,c5a1f718-8a18-4b90-8d2a-cf22fb192dae,ORACLE DB,ADVANCED
...,...,...,...
207,e0d0e4bd-af6c-4812-b463-d1f798cd3e74,Jest,INTERMEDIATE
208,e0d0e4bd-af6c-4812-b463-d1f798cd3e74,React,EXPERT
209,e0d0e4bd-af6c-4812-b463-d1f798cd3e74,Apollo,INTERMEDIATE
210,e0d0e4bd-af6c-4812-b463-d1f798cd3e74,JavaScript,EXPERT


In [4]:
# drop empty rows just in case
data = raw_data.dropna()
del raw_data
data = data.copy()
data

Unnamed: 0,uuid,skill_name,level
0,06b36793-381f-4a5f-8544-dd6a02bb4513,OpenSSL,INTERMEDIATE
1,314911bb-28ae-44e0-a70e-a97bc01f0b18,OpenSSL,ADVANCED
2,c5a1f718-8a18-4b90-8d2a-cf22fb192dae,OpenStack,NOVICE
3,06b36793-381f-4a5f-8544-dd6a02bb4513,OpenStack,NOVICE
4,c5a1f718-8a18-4b90-8d2a-cf22fb192dae,ORACLE DB,ADVANCED
...,...,...,...
207,e0d0e4bd-af6c-4812-b463-d1f798cd3e74,Jest,INTERMEDIATE
208,e0d0e4bd-af6c-4812-b463-d1f798cd3e74,React,EXPERT
209,e0d0e4bd-af6c-4812-b463-d1f798cd3e74,Apollo,INTERMEDIATE
210,e0d0e4bd-af6c-4812-b463-d1f798cd3e74,JavaScript,EXPERT


## 3. Transform the data

### 3.1.Transform the levels to numbers

In [5]:
str2num = { "NOVICE": 1, "INTERMEDIATE": 2, "ADVANCED": 3, "EXPERT": 4 }
data["rating"] = data["level"].apply(lambda x: str2num.get(x, None))
data["rating"]

0      2
1      3
2      1
3      1
4      3
      ..
207    2
208    4
209    2
210    4
211    3
Name: rating, Length: 209, dtype: int64

### 3.2. Transform user uuids to catergorical values

In [6]:
data["uuid"] = data["uuid"].astype("category")
data["uuid"]

0      06b36793-381f-4a5f-8544-dd6a02bb4513
1      314911bb-28ae-44e0-a70e-a97bc01f0b18
2      c5a1f718-8a18-4b90-8d2a-cf22fb192dae
3      06b36793-381f-4a5f-8544-dd6a02bb4513
4      c5a1f718-8a18-4b90-8d2a-cf22fb192dae
                       ...                 
207    e0d0e4bd-af6c-4812-b463-d1f798cd3e74
208    e0d0e4bd-af6c-4812-b463-d1f798cd3e74
209    e0d0e4bd-af6c-4812-b463-d1f798cd3e74
210    e0d0e4bd-af6c-4812-b463-d1f798cd3e74
211    e0d0e4bd-af6c-4812-b463-d1f798cd3e74
Name: uuid, Length: 209, dtype: category
Categories (11, object): ['06b36793-381f-4a5f-8544-dd6a02bb4513', '11c6f2e8-0110-4ac0-b2a7-9a889ce612f0', '314911bb-28ae-44e0-a70e-a97bc01f0b18', '51259f56-93cf-4c9d-89f3-cd3e767cb80f', ..., 'c5a1f718-8a18-4b90-8d2a-cf22fb192dae', 'c8d91564-77b4-46df-b7a4-8d25a82ad950', 'cd9974b9-da79-4b19-af78-fefac8849238', 'e0d0e4bd-af6c-4812-b463-d1f798cd3e74']

### 3.3. Transform skill names to categorical values

In [7]:
data["skill_name"] = data["skill_name"].astype("category")
data["skill_name"]

0         OpenSSL
1         OpenSSL
2       OpenStack
3       OpenStack
4       ORACLE DB
          ...    
207          Jest
208         React
209        Apollo
210    JavaScript
211       Node.js
Name: skill_name, Length: 209, dtype: category
Categories (124, object): ['Apollo', 'Express', 'JavaScript', 'Jest', ..., 'puppet', 'sofware development', 'spreadsheet', 'yarn']

### 3.4. Normalize user_ids as categorical codes

In [8]:
data["user_id"] = data["uuid"].cat.codes
data["user_id"]

0       0
1       2
2       7
3       0
4       7
       ..
207    10
208    10
209    10
210    10
211    10
Name: user_id, Length: 209, dtype: int8

### 3.5. Normalize skill_ids as categorical codes

In [9]:
data["skill_id"] = data["skill_name"].cat.codes
data["skill_id"]

0       7
1       7
2       8
3       8
4       5
       ..
207     3
208    53
209     0
210     2
211     4
Name: skill_id, Length: 209, dtype: int8

## 4. Create sparse matrixes

### 4.1 Sparse item user

In [10]:
sparse_item_user = sparse.csr_matrix(
    (
        data["rating"].astype(float),
        (data["skill_id"], data["user_id"]),
    )
)
sparse_item_user

<124x11 sparse matrix of type '<class 'numpy.float64'>'
	with 209 stored elements in Compressed Sparse Row format>

### 4.2. Sparse user item

In [11]:
sparse_user_item = sparse.csr_matrix(
    (
        data["rating"].astype(float),
        (data["user_id"], data["skill_id"]),
    )
)
sparse_user_item

<11x124 sparse matrix of type '<class 'numpy.float64'>'
	with 209 stored elements in Compressed Sparse Row format>

## 5. Training the model

In [12]:
recommender = als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=20)
alpha = 40
data_confidence = (sparse_item_user * alpha).astype('double')
recommender.fit(data_confidence)



  0%|          | 0/20 [00:00<?, ?it/s]

In [13]:
from projects.recommender.gradient_descent import GradientDescentMF


recommender = GradientDescentMF(
    user_item=sparse_user_item, verbose=True, features=3, iterations=200
)
recommender.train()

self._user_item.shape = (11, 124)
self._users_count = 11
self._items_count = 124
Mean Squared Error in iteration #0
	764.1285
Mean Squared Error in iteration #50
	390.7465
Mean Squared Error in iteration #100
	382.0865
Mean Squared Error in iteration #150
	380.9833
trained_model = array([[0.05580918, 0.08342113, 0.11132165, ..., 0.03433215, 0.01950479,
        0.01728021],
       [0.0900105 , 0.13453207, 0.17953058, ..., 0.06825197, 0.03035246,
        0.03431464],
       [0.08617674, 0.1288482 , 0.17193076, ..., 0.05447641, 0.02825636,
        0.02740081],
       ...,
       [0.07597496, 0.11333129, 0.15130998, ..., 0.05914259, 0.03608556,
        0.02981746],
       [0.27957803, 0.41759229, 0.55735657, ..., 0.21451469, 0.10698467,
        0.10794956],
       [0.13013556, 0.19460046, 0.25965984, ..., 0.09186438, 0.04015087,
        0.04616733]])
self._user_item = array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 2., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       ..

## 6. Making recommendations

In [15]:
users_asc = data.uuid.cat.categories
skills_asc = data.skill_name.cat.categories
user2code = {user: code for code, user in enumerate(users_asc)}
skill2code = {skill: code for code, skill in enumerate(skills_asc)}
code2user = {code: user for code, user in enumerate(users_asc)}
code2skill = {code: skill for code, skill in enumerate(skills_asc)}

USER_UUID = "e0d0e4bd-af6c-4812-b463-d1f798cd3e74"
user_id = user2code[USER_UUID]
recommendations = recommender_als.recommend(user_id, sparse_user_item, N=4)
[code2skill[idx] for idx, _ in recommendations]

TypeError: recommend() got an unexpected keyword argument 'N'

In [14]:
recommendations = recommender.recommend(user_id, sparse_item_user, n=4)
print(
    "Recommendations: ",
    ", ".join([code2skill[idx] for idx, _ in recommendations])
)


NameError: name 'user_id' is not defined