# Dependências

In [2]:
import os
import sys
import pandas as pd
import numpy as np

from langchain_openai import OpenAIEmbeddings

from utils import load_data

from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
load_dotenv()

# Read the API key from the environment; this is None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Tomando os usuários

In [3]:
# Relative path of the directory that holds the dataset files.
DATA_DIR = "data"

# Load the users table through the project helper from utils.
users = load_data(
    DATA_DIR,
    "users.dat",
)

In [4]:
# Display the users frame as the cell output.
users

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [5]:
# Print the column dtypes, non-null counts and memory usage of the users frame.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserID      6040 non-null   int64 
 1   Gender      6040 non-null   object
 2   Age         6040 non-null   int64 
 3   Occupation  6040 non-null   int64 
 4   Zip-code    6040 non-null   object
dtypes: int64(3), object(2)
memory usage: 236.1+ KB


In [6]:
# Number of users, derived from the loaded frame instead of being hard-coded,
# so it stays correct if the input file changes (6040 for the full users.dat).
user_num = len(users)

### Dicionários de características
As informações foram extraídas da descrição fornecida pelo próprio grupo GroupLens, responsável pelo conjunto de dados MovieLens. Ela está disponível em <a href="https://files.grouplens.org/datasets/movielens/ml-1m-README.txt">README - MovieLens 1M Dataset</a>.

In [7]:
# Gender code -> human-readable label, following the MovieLens 1M README.
gender_user_dict = dict(M="male", F="female")

In [8]:
# Age code -> age-bracket label, following the MovieLens 1M README.
age_user_dict = dict(
    [
        (1, "under 18"),
        (18, "18-24"),
        (25, "25-34"),
        (35, "35-44"),
        (45, "45-49"),
        (50, "50-55"),
        (56, "56+"),
    ]
)

In [9]:
# Occupation code -> label, following the MovieLens 1M README.
# The codes are consecutive integers starting at 0, so the mapping is built by
# enumerating the labels in code order.
occupation_user_dict = dict(
    enumerate(
        [
            "other or not specified",  # 0
            "academic/educator",       # 1
            "artist",                  # 2
            "clerical/admin",          # 3
            "college/grad student",    # 4
            "customer service",        # 5
            "doctor/health care",      # 6
            "executive/managerial",    # 7
            "farmer",                  # 8
            "homemaker",               # 9
            "K-12 student",            # 10
            "lawyer",                  # 11
            "programmer",              # 12
            "retired",                 # 13
            "sales/marketing",         # 14
            "scientist",               # 15
            "self-employed",           # 16
            "technician/engineer",     # 17
            "tradesman/craftsman",     # 18
            "unemployed",              # 19
            "writer",                  # 20
        ]
    )
)

## Gerando descrições de cada usuário

In [22]:
# Build one natural-language description per user, in the row order of `users`.
#
# The rows are walked in a single pass instead of filtering the whole frame by
# UserID once per user. That filter was quadratic in the number of users and
# raised IndexError for any ID missing from 1..user_num. Keeping the frame's own
# row order also guarantees that description k belongs to row k, which the later
# positional joins with `users` rely on.
user_descriptions = [
    f"Este é o usuário de ID {user_id}. Seu gênero é {gender_user_dict[gender]} e sua idade está numa faixa etária de {age_user_dict[age]} anos. Sua ocupação é como {occupation_user_dict[occupation]} e seu ZIP-CODE é {zip_code}."
    for user_id, gender, age, occupation, zip_code in zip(
        # .tolist() yields plain Python scalars, so the dictionary lookups and
        # the formatted text do not depend on NumPy scalar types.
        users["UserID"].tolist(),
        users["Gender"].tolist(),
        users["Age"].tolist(),
        users["Occupation"].tolist(),
        users["Zip-code"].tolist(),
    )
]

In [23]:
# Show the first five descriptions as a quick sanity check.
user_descriptions[:5]

['Este é o usuário de ID 1. Seu gênero é female e sua idade está numa faixa etária de under 18 anos. Sua ocupação é como K-12 student e seu ZIP-CODE é 48067.',
 'Este é o usuário de ID 2. Seu gênero é male e sua idade está numa faixa etária de 56+ anos. Sua ocupação é como self-employed e seu ZIP-CODE é 70072.',
 'Este é o usuário de ID 3. Seu gênero é male e sua idade está numa faixa etária de 25-34 anos. Sua ocupação é como scientist e seu ZIP-CODE é 55117.',
 'Este é o usuário de ID 4. Seu gênero é male e sua idade está numa faixa etária de 45-49 anos. Sua ocupação é como executive/managerial e seu ZIP-CODE é 02460.',
 'Este é o usuário de ID 5. Seu gênero é male e sua idade está numa faixa etária de 25-34 anos. Sua ocupação é como writer e seu ZIP-CODE é 55455.']

In [24]:
# Pair each UserID with its description.
#
# The descriptions are attached as a plain list (positional assignment) rather
# than wrapped in a pd.Series. A Series is aligned on index labels, so any frame
# whose index is not exactly 0..n-1 would silently receive NaN or shifted text.
# With a list, a length mismatch raises ValueError instead of passing unnoticed.
df_users_with_descriptions = users[["UserID"]].copy()
df_users_with_descriptions["Description_User"] = user_descriptions

In [25]:
df_users_with_descriptions

Unnamed: 0,UserID,Description_User
0,1,Este é o usuário de ID 1. Seu gênero é female ...
1,2,Este é o usuário de ID 2. Seu gênero é male e ...
2,3,Este é o usuário de ID 3. Seu gênero é male e ...
3,4,Este é o usuário de ID 4. Seu gênero é male e ...
4,5,Este é o usuário de ID 5. Seu gênero é male e ...
...,...,...
6035,6036,Este é o usuário de ID 6036. Seu gênero é fema...
6036,6037,Este é o usuário de ID 6037. Seu gênero é fema...
6037,6038,Este é o usuário de ID 6038. Seu gênero é fema...
6038,6039,Este é o usuário de ID 6039. Seu gênero é fema...


### Embeddings

Aqui, usaremos o modelo de embedding `text-embedding-3-large` da OpenAI para transformar a descrição de cada usuário em um vetor numérico.

In [26]:
# Embedding client; the model name and the API key are passed explicitly.
model_embedding = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=OPENAI_API_KEY,
)

In [27]:
# Embed every user description with the model configured above.
# NOTE(review): presumably this sends all descriptions to the remote OpenAI API,
# so re-running the cell is slow and billable — confirm, and consider caching
# the result on disk.
embeddings_descriptions = model_embedding.embed_documents(user_descriptions)

In [28]:
# Convert the result of embed_documents into a NumPy array (rebinding the same
# name) and display it as the cell output.
embeddings_descriptions = np.array(embeddings_descriptions)
embeddings_descriptions

array([[-0.02302216,  0.03032926,  0.00171841, ...,  0.00293109,
        -0.00344713,  0.00403541],
       [-0.00532234,  0.02537519,  0.00139191, ...,  0.00616235,
        -0.01314456,  0.0010643 ],
       [-0.00721926,  0.01973486, -0.0007369 , ..., -0.00571885,
         0.00190961,  0.00043166],
       ...,
       [-0.03463866,  0.02869381,  0.00471625, ..., -0.00058045,
        -0.00818408, -0.01041009],
       [-0.02690348,  0.0288362 ,  0.00546638, ..., -0.00805945,
        -0.00403617, -0.00503152],
       [-0.01815131,  0.03291802,  0.00050923, ..., -0.00407381,
        -0.00574223, -0.01347018]])

In [29]:
# Check the dimensions of the embedding matrix: one row per description.
embeddings_descriptions.shape

(6040, 3072)

In [30]:
# Wrap the embedding matrix in a DataFrame (default integer row and column
# labels) and preview the first five rows.
embeddings_descriptions_df = pd.DataFrame(data=embeddings_descriptions)
embeddings_descriptions_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3062,3063,3064,3065,3066,3067,3068,3069,3070,3071
0,-0.023022,0.030329,0.001718,-0.003777,-0.006334,-0.015729,0.022197,0.019926,0.035366,0.033192,...,0.011429,0.008153,0.000868,-0.013266,0.014002,-0.006423,0.002007,0.002931,-0.003447,0.004035
1,-0.005322,0.025375,0.001392,-0.003464,0.010504,0.008703,0.036557,0.021168,0.016881,0.049944,...,0.013716,0.010268,-0.013138,-0.002495,0.019139,-0.02551,-0.008353,0.006162,-0.013145,0.001064
2,-0.007219,0.019735,-0.000737,-0.004035,-0.008756,0.012476,0.010573,0.026389,0.033641,0.045431,...,-0.012163,0.010054,-0.007213,-0.015024,0.018936,-0.026628,-0.0028,-0.005719,0.00191,0.000432
3,-0.028621,0.039094,0.001921,0.011913,0.002255,0.008403,0.021449,0.029614,0.010025,0.03764,...,0.002309,0.012416,-0.012374,-0.001,0.008998,-0.004058,-0.008173,0.001725,-0.002826,-0.010543
4,-0.02212,0.033352,-0.001732,-0.010144,-0.014896,0.018324,0.018429,0.025996,0.006647,0.029661,...,-0.004502,0.010269,-0.013789,-0.010065,0.008404,-0.024295,-0.009393,-0.011845,-0.014277,-0.007231


In [31]:
# Put the embedding columns to the right of UserID / Description_User.
# concat aligns rows on index labels, so both frames must share the same index.
df_users_with_descriptions_and_embeddings = pd.concat(
    [df_users_with_descriptions, embeddings_descriptions_df],
    axis="columns",
)
df_users_with_descriptions_and_embeddings.head()

Unnamed: 0,UserID,Description_User,0,1,2,3,4,5,6,7,...,3062,3063,3064,3065,3066,3067,3068,3069,3070,3071
0,1,Este é o usuário de ID 1. Seu gênero é female ...,-0.023022,0.030329,0.001718,-0.003777,-0.006334,-0.015729,0.022197,0.019926,...,0.011429,0.008153,0.000868,-0.013266,0.014002,-0.006423,0.002007,0.002931,-0.003447,0.004035
1,2,Este é o usuário de ID 2. Seu gênero é male e ...,-0.005322,0.025375,0.001392,-0.003464,0.010504,0.008703,0.036557,0.021168,...,0.013716,0.010268,-0.013138,-0.002495,0.019139,-0.02551,-0.008353,0.006162,-0.013145,0.001064
2,3,Este é o usuário de ID 3. Seu gênero é male e ...,-0.007219,0.019735,-0.000737,-0.004035,-0.008756,0.012476,0.010573,0.026389,...,-0.012163,0.010054,-0.007213,-0.015024,0.018936,-0.026628,-0.0028,-0.005719,0.00191,0.000432
3,4,Este é o usuário de ID 4. Seu gênero é male e ...,-0.028621,0.039094,0.001921,0.011913,0.002255,0.008403,0.021449,0.029614,...,0.002309,0.012416,-0.012374,-0.001,0.008998,-0.004058,-0.008173,0.001725,-0.002826,-0.010543
4,5,Este é o usuário de ID 5. Seu gênero é male e ...,-0.02212,0.033352,-0.001732,-0.010144,-0.014896,0.018324,0.018429,0.025996,...,-0.004502,0.010269,-0.013789,-0.010065,0.008404,-0.024295,-0.009393,-0.011845,-0.014277,-0.007231


In [32]:
# Persist the combined table inside the data directory.
# NOTE(review): to_csv also writes the row index by default, so the file gets an
# unnamed leading column — pass index=False if downstream readers do not expect it.
output_csv_path = os.path.join(DATA_DIR, "users_with_descriptions_and_embeddings.csv")
df_users_with_descriptions_and_embeddings.to_csv(output_csv_path)