# Data Exploration

## 1. Import libraries

In [2]:
import pandas as pd
import scipy.sparse as sparse

## 2. Read the data

In [3]:
raw_data = pd.read_csv("skills.csv")
raw_data

Unnamed: 0,uuid,skill_name,level
0,c0ccd057-5aed-4f2f-83a7-fedd88798a99,Groovy,ADVANCED
1,c0ccd057-5aed-4f2f-83a7-fedd88798a99,Bash,ADVANCED
2,c0ccd057-5aed-4f2f-83a7-fedd88798a99,JSON,ADVANCED
3,c0ccd057-5aed-4f2f-83a7-fedd88798a99,Leadership,INTERMEDIATE
4,164e77f7-03cc-4e00-856a-102286c21f80,JSON,ADVANCED
...,...,...,...
995,285dc44d-cbfa-4faa-9087-f6963659fdf4,Serverless,NOVICE
996,aa8f222e-b28d-4713-97c2-9cd21cacc02e,Communication,NOVICE
997,10ac50c3-b7f4-4d23-8b10-edd530cf2479,MongoDB,NOVICE
998,b52dcda7-518e-4f71-891d-4ae089489164,User Research,NOVICE


In [4]:
# drop empty rows just in case
data = raw_data.dropna()
del raw_data
data = data.copy()
data

Unnamed: 0,uuid,skill_name,level
0,c0ccd057-5aed-4f2f-83a7-fedd88798a99,Groovy,ADVANCED
1,c0ccd057-5aed-4f2f-83a7-fedd88798a99,Bash,ADVANCED
2,c0ccd057-5aed-4f2f-83a7-fedd88798a99,JSON,ADVANCED
3,c0ccd057-5aed-4f2f-83a7-fedd88798a99,Leadership,INTERMEDIATE
4,164e77f7-03cc-4e00-856a-102286c21f80,JSON,ADVANCED
...,...,...,...
995,285dc44d-cbfa-4faa-9087-f6963659fdf4,Serverless,NOVICE
996,aa8f222e-b28d-4713-97c2-9cd21cacc02e,Communication,NOVICE
997,10ac50c3-b7f4-4d23-8b10-edd530cf2479,MongoDB,NOVICE
998,b52dcda7-518e-4f71-891d-4ae089489164,User Research,NOVICE


## 3. Transform the data

### 3.1.Transform the levels to numbers

In [5]:
str2num = { "NOVICE": 1, "INTERMEDIATE": 2, "ADVANCED": 3, "EXPERT": 4 }
data["rating"] = data["level"].apply(lambda x: str2num.get(x, None))
data["rating"]

0      3
1      3
2      3
3      2
4      3
      ..
995    1
996    1
997    1
998    1
999    1
Name: rating, Length: 1000, dtype: int64

### 3.2. Transform user uuids to catergorical values

In [6]:
data["uuid"] = data["uuid"].astype("category")
data["uuid"]

0      c0ccd057-5aed-4f2f-83a7-fedd88798a99
1      c0ccd057-5aed-4f2f-83a7-fedd88798a99
2      c0ccd057-5aed-4f2f-83a7-fedd88798a99
3      c0ccd057-5aed-4f2f-83a7-fedd88798a99
4      164e77f7-03cc-4e00-856a-102286c21f80
                       ...                 
995    285dc44d-cbfa-4faa-9087-f6963659fdf4
996    aa8f222e-b28d-4713-97c2-9cd21cacc02e
997    10ac50c3-b7f4-4d23-8b10-edd530cf2479
998    b52dcda7-518e-4f71-891d-4ae089489164
999    83f5a976-ba1b-4b3a-8862-0c03e5d8eb6c
Name: uuid, Length: 1000, dtype: category
Categories (198, object): ['00398a29-9972-4fbf-b319-d602c2906867', '0120e441-91c9-4e38-8065-f37db773ec9d', '0197e4e7-5feb-4bf8-a375-9dd63acfe72f', '01df1992-e62a-4c58-936a-50099ca9ac52', ..., 'fb11b5f8-2892-42f7-9e70-07b3e832516e', 'fb3004a1-cc5e-4ec8-ac3e-4d30d38018ae', 'fe233c78-e3d4-494c-bc1a-c1d94bd3439c', 'ff1ecb31-87d7-4456-a448-7bc92a0f2ec4']

### 3.3. Transform skill names to categorical values

In [7]:
data["skill_name"] = data["skill_name"].astype("category")
data["skill_name"]

0             Groovy
1               Bash
2               JSON
3         Leadership
4               JSON
           ...      
995       Serverless
996    Communication
997          MongoDB
998    User Research
999              AWS
Name: skill_name, Length: 1000, dtype: category
Categories (304, object): ['.NET', '.NET Core', 'A/B Testing', 'ASP', ..., 'jQuery', 'nginx', 'oauth2', 'puppet']

### 3.4. Normalize user_ids as categorical codes

In [8]:
data["user_id"] = data["uuid"].cat.codes
data["user_id"]

0      157
1      157
2      157
3      157
4       20
      ... 
995     34
996    134
997     15
998    147
999    104
Name: user_id, Length: 1000, dtype: int16

### 3.5. Normalize skill_ids as categorical codes

In [9]:
data["skill_id"] = data["skill_name"].cat.codes
data["skill_id"]

0      112
1       50
2      129
3      146
4      129
      ... 
995    233
996     65
997    156
998    272
999      4
Name: skill_id, Length: 1000, dtype: int16

## 4. Create sparse matrixes

### 4.1 Sparse item user

In [15]:
sparse_item_user = sparse.csr_matrix(
    (
        data["rating"].astype(float),
        (data["skill_id"], data["user_id"]),
    )
)
sparse_item_user

<304x198 sparse matrix of type '<class 'numpy.float64'>'
	with 1000 stored elements in Compressed Sparse Row format>

### 4.2. Sparse user item

In [17]:
sparse_user_item = sparse.csr_matrix(
    (
        data["rating"].astype(float),
        (data["user_id"], data["skill_id"]),
    )
)
sparse_user_item

<198x304 sparse matrix of type '<class 'numpy.float64'>'
	with 1000 stored elements in Compressed Sparse Row format>