# Data Exploration

## 1. Import libraries

In [1]:
import pandas as pd
import scipy.sparse as sparse
import implicit.als as als

## 2. Read the data

In [2]:
# Load the skills export; the relative path is resolved against the
# notebook's working directory.
# NOTE(review): the rendered output shows an `Unnamed: 0` column, which
# suggests the CSV carries a saved index -- consider `index_col=0`; confirm.
raw_data = pd.read_csv(filepath_or_buffer="skills.csv")
raw_data

Unnamed: 0,uuid,skill_name,level
0,c0ccd057-5aed-4f2f-83a7-fedd88798a99,Groovy,ADVANCED
1,c0ccd057-5aed-4f2f-83a7-fedd88798a99,Bash,ADVANCED
2,c0ccd057-5aed-4f2f-83a7-fedd88798a99,JSON,ADVANCED
3,c0ccd057-5aed-4f2f-83a7-fedd88798a99,Leadership,INTERMEDIATE
4,164e77f7-03cc-4e00-856a-102286c21f80,JSON,ADVANCED
...,...,...,...
1001,e0d0e4bd-af6c-4812-b463-d1f798cd3e74,Jest,INTERMEDIATE
1002,e0d0e4bd-af6c-4812-b463-d1f798cd3e74,Redux,INTERMEDIATE
1003,e0d0e4bd-af6c-4812-b463-d1f798cd3e74,Apollo,INTERMEDIATE
1004,e0d0e4bd-af6c-4812-b463-d1f798cd3e74,JavaScript,ADVANCED


In [3]:
# Drop rows that contain any missing value, just in case.
# `raw_data` is deliberately NOT deleted: removing it made this cell fail with
# a NameError on re-run and threw away the untouched input frame.
# `.copy()` gives the following cells an independent frame to add columns to.
data = raw_data.dropna().copy()
data

Unnamed: 0,uuid,skill_name,level
0,c0ccd057-5aed-4f2f-83a7-fedd88798a99,Groovy,ADVANCED
1,c0ccd057-5aed-4f2f-83a7-fedd88798a99,Bash,ADVANCED
2,c0ccd057-5aed-4f2f-83a7-fedd88798a99,JSON,ADVANCED
3,c0ccd057-5aed-4f2f-83a7-fedd88798a99,Leadership,INTERMEDIATE
4,164e77f7-03cc-4e00-856a-102286c21f80,JSON,ADVANCED
...,...,...,...
1001,e0d0e4bd-af6c-4812-b463-d1f798cd3e74,Jest,INTERMEDIATE
1002,e0d0e4bd-af6c-4812-b463-d1f798cd3e74,Redux,INTERMEDIATE
1003,e0d0e4bd-af6c-4812-b463-d1f798cd3e74,Apollo,INTERMEDIATE
1004,e0d0e4bd-af6c-4812-b463-d1f798cd3e74,JavaScript,ADVANCED


## 3. Transform the data

### 3.1. Transform the levels to numbers

In [4]:
# Ordinal encoding of the textual skill levels.
str2num = {"NOVICE": 1, "INTERMEDIATE": 2, "ADVANCED": 3, "EXPERT": 4}

# `Series.map` leaves NaN for any level that is not a key of `str2num`.
# Fail loudly in that case instead of letting missing ratings flow into the
# sparse matrices built from this column.
ratings = data["level"].map(str2num)
unmapped_levels = data.loc[ratings.isna(), "level"].unique()
if len(unmapped_levels) > 0:
    raise ValueError(
        f"Unknown skill level(s) with no rating in str2num: {sorted(map(str, unmapped_levels))}"
    )

# Only assign once the mapping is known to be complete, so a failed run does
# not leave a half-valid `rating` column behind.
data["rating"] = ratings.astype("int64")
data["rating"]

0       3
1       3
2       3
3       2
4       3
       ..
1001    2
1002    2
1003    2
1004    3
1005    3
Name: rating, Length: 1006, dtype: int64

### 3.2. Transform user uuids to categorical values

In [5]:
# Re-type the user UUID column as a pandas categorical; its integer codes are
# what the `user_id` column is derived from further down.
data["uuid"] = data.uuid.astype("category")
data.uuid

0       c0ccd057-5aed-4f2f-83a7-fedd88798a99
1       c0ccd057-5aed-4f2f-83a7-fedd88798a99
2       c0ccd057-5aed-4f2f-83a7-fedd88798a99
3       c0ccd057-5aed-4f2f-83a7-fedd88798a99
4       164e77f7-03cc-4e00-856a-102286c21f80
                        ...                 
1001    e0d0e4bd-af6c-4812-b463-d1f798cd3e74
1002    e0d0e4bd-af6c-4812-b463-d1f798cd3e74
1003    e0d0e4bd-af6c-4812-b463-d1f798cd3e74
1004    e0d0e4bd-af6c-4812-b463-d1f798cd3e74
1005    e0d0e4bd-af6c-4812-b463-d1f798cd3e74
Name: uuid, Length: 1006, dtype: category
Categories (199, object): ['00398a29-9972-4fbf-b319-d602c2906867', '0120e441-91c9-4e38-8065-f37db773ec9d', '0197e4e7-5feb-4bf8-a375-9dd63acfe72f', '01df1992-e62a-4c58-936a-50099ca9ac52', ..., 'fb11b5f8-2892-42f7-9e70-07b3e832516e', 'fb3004a1-cc5e-4ec8-ac3e-4d30d38018ae', 'fe233c78-e3d4-494c-bc1a-c1d94bd3439c', 'ff1ecb31-87d7-4456-a448-7bc92a0f2ec4']

### 3.3. Transform skill names to categorical values

In [6]:
# Re-type the skill name column as a pandas categorical; its integer codes are
# what the `skill_id` column is derived from further down.
data["skill_name"] = data.skill_name.astype("category")
data.skill_name

0           Groovy
1             Bash
2             JSON
3       Leadership
4             JSON
           ...    
1001          Jest
1002         Redux
1003        Apollo
1004    JavaScript
1005       Node.js
Name: skill_name, Length: 1006, dtype: category
Categories (304, object): ['.NET', '.NET Core', 'A/B Testing', 'ASP', ..., 'jQuery', 'nginx', 'oauth2', 'puppet']

### 3.4. Normalize user_ids as categorical codes

In [7]:
# Integer code of each row's uuid category, i.e. the position of that uuid in
# `data["uuid"].cat.categories`.
data["user_id"] = data.uuid.cat.codes
data.user_id

0       157
1       157
2       157
3       157
4        20
       ... 
1001    180
1002    180
1003    180
1004    180
1005    180
Name: user_id, Length: 1006, dtype: int16

### 3.5. Normalize skill_ids as categorical codes

In [8]:
# Integer code of each row's skill category, i.e. the position of that skill
# in `data["skill_name"].cat.categories`.
data["skill_id"] = data.skill_name.cat.codes
data.skill_id

0       112
1        50
2       129
3       146
4       129
       ... 
1001    136
1002    208
1003     38
1004    133
1005    169
Name: skill_id, Length: 1006, dtype: int16

## 4. Create sparse matrices

### 4.1. Sparse item user

In [10]:
# Item-by-user matrix: row index = skill code, column index = user code,
# stored value = rating as float.
# NOTE(review): with the (data, (row, col)) constructor, repeated
# (skill, user) pairs are presumably summed -- confirm the input has none.
sparse_item_user = sparse.csr_matrix(
    (
        data["rating"].to_numpy(dtype=float),
        (data["skill_id"].to_numpy(), data["user_id"].to_numpy()),
    )
)
sparse_item_user

<304x199 sparse matrix of type '<class 'numpy.float64'>'
	with 1006 stored elements in Compressed Sparse Row format>

### 4.2. Sparse user item

In [11]:
# User-by-item matrix: row index = user code, column index = skill code,
# stored value = rating as float.
# NOTE(review): with the (data, (row, col)) constructor, repeated
# (user, skill) pairs are presumably summed -- confirm the input has none.
sparse_user_item = sparse.csr_matrix(
    (
        data["rating"].to_numpy(dtype=float),
        (data["user_id"].to_numpy(), data["skill_id"].to_numpy()),
    )
)
sparse_user_item

<199x304 sparse matrix of type '<class 'numpy.float64'>'
	with 1006 stored elements in Compressed Sparse Row format>

## 5. Training the model

In [12]:
# Scaling factor applied to the ratings before they are handed to the model.
alpha = 40

# ALS model with 20 latent factors, trained for 20 iterations.
# NOTE(review): no seed is passed here, so results may differ between runs --
# confirm whether the installed implicit version accepts `random_state`.
recommender = als.AlternatingLeastSquares(
    factors=20,
    regularization=0.1,
    iterations=20,
)

# The model is fitted on the item-by-user matrix scaled by `alpha`.
data_confidence = (sparse_item_user * alpha).astype("double")
recommender.fit(data_confidence)



  0%|          | 0/20 [00:00<?, ?it/s]

## 6. Making recommendations

In [19]:
# A category's position in `.cat.categories` is its integer code, so
# enumerating the categories yields the code <-> label lookup tables.
users_asc = data["uuid"].cat.categories
skills_asc = data["skill_name"].cat.categories

code2user = dict(enumerate(users_asc))
code2skill = dict(enumerate(skills_asc))
user2code = {user: code for code, user in code2user.items()}
skill2code = {skill: code for code, skill in code2skill.items()}

# User to produce recommendations for.
USER_UUID = "e0d0e4bd-af6c-4812-b463-d1f798cd3e74"
user_id = user2code[USER_UUID]

# NOTE(review): unpacking (skill code, score) pairs assumes `recommend`
# returns an iterable of pairs -- confirm against the installed implicit version.
recommendations = recommender.recommend(user_id, sparse_user_item, N=4)
[code2skill[skill_code] for skill_code, _score in recommendations]

['Serverless', 'Firebase', 'GraphQL', 'Angular']