# Dependências

In [1]:
import os
import sys
import pandas as pd
import numpy as np

from langchain_openai import OpenAIEmbeddings

from utils import load_data

from dotenv import load_dotenv
# Load environment variables via python-dotenv (presumably from a local .env file — confirm).
load_dotenv()

# API key read from the environment; os.getenv returns None when the variable is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Tomando os usuários

In [2]:
# Directory that holds the dataset files (relative path).
DATA_DIR = "data"
# Load the user table through the project helper.
# NOTE(review): presumably parses the raw MovieLens `users.dat` file into a DataFrame — confirm in utils.load_data.
users = load_data(DATA_DIR, "users.dat")

In [3]:
# Display the loaded user table (columns: UserID, Gender, Age, Occupation, Zip-code).
users

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [4]:
# Summarize column dtypes and non-null counts to check the table loaded completely.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserID      6040 non-null   int64 
 1   Gender      6040 non-null   object
 2   Age         6040 non-null   int64 
 3   Occupation  6040 non-null   int64 
 4   Zip-code    6040 non-null   object
dtypes: int64(3), object(2)
memory usage: 236.1+ KB


In [5]:
# Number of users, derived from the loaded table instead of a hard-coded 6040
# so it stays correct if the input file changes.
user_num = len(users)

### Dicionários de características
As informações foram extraídas da descrição fornecida pelo próprio grupo MovieLens, disponível em <a href="https://files.grouplens.org/datasets/movielens/ml-1m-README.txt">README - MovieLens 1M Dataset</a>.

In [6]:
# Maps the single-letter gender code stored in the user table to a readable label.
gender_user_dict = dict(M="male", F="female")

In [7]:
# Maps the age code stored in the user table to the age range it stands for.
age_user_dict = dict(
    [
        (1, "under 18"),
        (18, "18-24"),
        (25, "25-34"),
        (35, "35-44"),
        (45, "45-49"),
        (50, "50-55"),
        (56, "56+"),
    ]
)

In [8]:
# Maps the occupation code (0-20) stored in the user table to its label.
# The labels are listed in code order, so enumerate() yields the matching keys.
occupation_user_dict = dict(
    enumerate(
        [
            "other or not specified",
            "academic/educator",
            "artist",
            "clerical/admin",
            "college/grad student",
            "customer service",
            "doctor/health care",
            "executive/managerial",
            "farmer",
            "homemaker",
            "K-12 student",
            "lawyer",
            "programmer",
            "retired",
            "sales/marketing",
            "scientist",
            "self-employed",
            "technician/engineer",
            "tradesman/craftsman",
            "unemployed",
            "writer",
        ]
    )
)

## Gerando descrições de cada usuário

In [9]:
# Build one natural-language description per user, in the same row order as
# `users`, so the list lines up positionally with the DataFrame.
# The coded Gender / Age / Occupation values are decoded through the lookup
# dictionaries; the zip code is inserted exactly as stored.
# A single pass over the columns replaces the per-ID boolean-mask lookup
# (`users[users["UserID"] == i]`), which rescanned the whole table for every
# user and raised IndexError whenever the IDs were not exactly 1..user_num.
user_descriptions = [
    f"Este é o usuário de ID {user_id}. Seu gênero é {gender_user_dict[gender]} e sua idade está numa faixa etária de {age_user_dict[age]} anos. Sua ocupação é como {occupation_user_dict[occupation]} e seu ZIP-CODE é {zip_code}."
    for user_id, gender, age, occupation, zip_code in zip(
        users["UserID"].tolist(),
        users["Gender"].tolist(),
        users["Age"].tolist(),
        users["Occupation"].tolist(),
        users["Zip-code"].tolist(),
    )
]

In [10]:
# Preview the first five generated descriptions as a sanity check.
user_descriptions[:5]

['Este é o usuário de ID 1. Seu gênero é female e sua idade está numa faixa etária de under 18 anos. Sua ocupação é como K-12 student e seu ZIP-CODE é 48067.',
 'Este é o usuário de ID 2. Seu gênero é male e sua idade está numa faixa etária de 56+ anos. Sua ocupação é como self-employed e seu ZIP-CODE é 70072.',
 'Este é o usuário de ID 3. Seu gênero é male e sua idade está numa faixa etária de 25-34 anos. Sua ocupação é como scientist e seu ZIP-CODE é 55117.',
 'Este é o usuário de ID 4. Seu gênero é male e sua idade está numa faixa etária de 45-49 anos. Sua ocupação é como executive/managerial e seu ZIP-CODE é 02460.',
 'Este é o usuário de ID 5. Seu gênero é male e sua idade está numa faixa etária de 25-34 anos. Sua ocupação é como writer e seu ZIP-CODE é 55455.']

In [11]:
# Pair each UserID with its description in a new two-column DataFrame.
# The list is assigned positionally rather than wrapped in a pd.Series, so it
# cannot be mis-aligned by the index labels of `users`; a length mismatch
# raises ValueError instead of silently producing NaN rows.
df_users_with_descriptions = users[["UserID"]].assign(
    Description_User=user_descriptions
)

In [12]:
# Display the UserID / Description_User table.
df_users_with_descriptions

Unnamed: 0,UserID,Description_User
0,1,Este é o usuário de ID 1. Seu gênero é female ...
1,2,Este é o usuário de ID 2. Seu gênero é male e ...
2,3,Este é o usuário de ID 3. Seu gênero é male e ...
3,4,Este é o usuário de ID 4. Seu gênero é male e ...
4,5,Este é o usuário de ID 5. Seu gênero é male e ...
...,...,...
6035,6036,Este é o usuário de ID 6036. Seu gênero é fema...
6036,6037,Este é o usuário de ID 6037. Seu gênero é fema...
6037,6038,Este é o usuário de ID 6038. Seu gênero é fema...
6038,6039,Este é o usuário de ID 6039. Seu gênero é fema...


### Embeddings

Aqui, nós usaremos um modelo de embedding da `OpenAI`.

In [13]:
# Embedding client: OpenAI "text-embedding-3-large", requested with 256 output dimensions.
model_embedding = OpenAIEmbeddings(
    api_key=OPENAI_API_KEY,
    model="text-embedding-3-large",
    dimensions=256,
)

In [14]:
# Embed every user description with the model configured above.
# NOTE(review): presumably issues remote OpenAI API requests for all descriptions — slow and billed; consider caching before re-running.
embeddings_descriptions = model_embedding.embed_documents(user_descriptions)

In [15]:
# Convert the embedding result to a NumPy array (rebinds the same name) and display it.
embeddings_descriptions = np.array(embeddings_descriptions)
embeddings_descriptions

array([[-0.05712177,  0.07529843,  0.00435234, ...,  0.02221781,
         0.04794818,  0.02709447],
       [-0.01077184,  0.05655822,  0.00297092, ..., -0.02439577,
         0.05681604,  0.03272642],
       [-0.01750425,  0.04787075, -0.00171154, ..., -0.01208857,
         0.03086615,  0.01022693],
       ...,
       [-0.08816902,  0.07303704,  0.01200471, ...,  0.02252984,
         0.02232808,  0.00092841],
       [-0.06908121,  0.07418606,  0.01401348, ...,  0.00601643,
         0.03444116,  0.00824151],
       [-0.04568911,  0.08305235,  0.0013579 , ...,  0.02776439,
         0.03808573, -0.02824606]])

In [16]:
# Check the array shape: one row per description, one column per embedding dimension.
embeddings_descriptions.shape

(6040, 256)

In [17]:
# Wrap the embedding matrix in a DataFrame; columns get the default integer labels.
embeddings_descriptions_df = pd.DataFrame(data=embeddings_descriptions)
# Preview the first five rows (head() defaults to 5).
embeddings_descriptions_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
0,-0.057122,0.075298,0.004352,-0.009412,-0.015738,-0.038979,0.054973,0.049346,0.087575,0.082324,...,-0.040991,-0.075571,-0.008645,0.004374,0.070797,0.007613,0.083756,0.022218,0.047948,0.027094
1,-0.010772,0.056558,0.002971,-0.006776,0.023203,0.022011,0.086239,0.050919,0.040896,0.121624,...,-0.080761,-0.063326,-0.042024,0.050113,0.046536,0.008862,0.09797,-0.024396,0.056816,0.032726
2,-0.017504,0.047871,-0.001712,-0.009784,-0.021276,0.030254,0.025563,0.063957,0.081622,0.110119,...,-0.051223,-0.068534,-0.041843,0.035331,0.030979,0.019342,0.09961,-0.012089,0.030866,0.010227
3,-0.07145,0.097593,0.004778,0.029669,0.005703,0.021013,0.053544,0.073858,0.025114,0.093894,...,-0.078885,-0.043247,-0.078745,0.051415,0.003763,0.004712,0.075394,-0.007788,0.009895,-0.00829
4,-0.053805,0.081029,-0.004258,-0.024752,-0.036245,0.044592,0.044816,0.063308,0.016116,0.072104,...,-0.068509,-0.089697,-0.072361,0.01297,0.022103,0.031734,0.172332,0.011926,-0.00238,0.033484


In [18]:
# Place the embedding columns next to UserID / Description_User.
# pd.concat along the column axis matches rows by index label.
df_users_with_descriptions_and_embeddings = pd.concat(
    [df_users_with_descriptions, embeddings_descriptions_df],
    axis="columns",
)
# Preview the first five rows (head() defaults to 5).
df_users_with_descriptions_and_embeddings.head()

Unnamed: 0,UserID,Description_User,0,1,2,3,4,5,6,7,...,246,247,248,249,250,251,252,253,254,255
0,1,Este é o usuário de ID 1. Seu gênero é female ...,-0.057122,0.075298,0.004352,-0.009412,-0.015738,-0.038979,0.054973,0.049346,...,-0.040991,-0.075571,-0.008645,0.004374,0.070797,0.007613,0.083756,0.022218,0.047948,0.027094
1,2,Este é o usuário de ID 2. Seu gênero é male e ...,-0.010772,0.056558,0.002971,-0.006776,0.023203,0.022011,0.086239,0.050919,...,-0.080761,-0.063326,-0.042024,0.050113,0.046536,0.008862,0.09797,-0.024396,0.056816,0.032726
2,3,Este é o usuário de ID 3. Seu gênero é male e ...,-0.017504,0.047871,-0.001712,-0.009784,-0.021276,0.030254,0.025563,0.063957,...,-0.051223,-0.068534,-0.041843,0.035331,0.030979,0.019342,0.09961,-0.012089,0.030866,0.010227
3,4,Este é o usuário de ID 4. Seu gênero é male e ...,-0.07145,0.097593,0.004778,0.029669,0.005703,0.021013,0.053544,0.073858,...,-0.078885,-0.043247,-0.078745,0.051415,0.003763,0.004712,0.075394,-0.007788,0.009895,-0.00829
4,5,Este é o usuário de ID 5. Seu gênero é male e ...,-0.053805,0.081029,-0.004258,-0.024752,-0.036245,0.044592,0.044816,0.063308,...,-0.068509,-0.089697,-0.072361,0.01297,0.022103,0.031734,0.172332,0.011926,-0.00238,0.033484


In [32]:
# Write the combined table (IDs, descriptions, embedding columns) to a CSV file inside DATA_DIR.
# NOTE(review): to_csv is called with its defaults, so the row index is written as an
# extra unnamed first column; confirm downstream readers expect it (e.g. index_col=0).
df_users_with_descriptions_and_embeddings.to_csv(os.path.join(DATA_DIR, "users_with_descriptions_and_embeddings_256.csv"))