# Dependências

In [2]:
import os
import sys
import pandas as pd
import numpy as np

from langchain_openai import OpenAIEmbeddings

from utils import load_data

from dotenv import load_dotenv
# Let python-dotenv populate the process environment, then read the OpenAI key
# from it so the secret never has to be written in the notebook itself.
load_dotenv()

# Evaluates to None when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Carregando os usuários

In [3]:
# Directory that holds the dataset files; also reused when the final CSV is written.
DATA_DIR = "data"
# load_data is the project helper imported from utils. Per the output shown below, the
# result is a DataFrame with UserID, Gender, Age, Occupation and Zip-code columns.
users = load_data(DATA_DIR, "users.dat")

In [4]:
# Display the loaded users table.
users

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [5]:
# Summarise column dtypes and non-null counts to confirm there are no missing values.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserID      6040 non-null   int64 
 1   Gender      6040 non-null   object
 2   Age         6040 non-null   int64 
 3   Occupation  6040 non-null   int64 
 4   Zip-code    6040 non-null   object
dtypes: int64(3), object(2)
memory usage: 236.1+ KB


In [6]:
# Number of users, derived from the loaded table rather than hard-coded, so it stays
# correct if a different or filtered users file is loaded.
user_num = len(users)

### Dicionários de características
As informações abaixo foram extraídas da descrição fornecida pelo próprio GroupLens, grupo responsável pelo MovieLens. Essa descrição está disponível em <a href="https://files.grouplens.org/datasets/movielens/ml-1m-README.txt">README - MovieLens 1M Dataset</a>.

In [7]:
# Gender code stored in the Gender column -> word used in the generated description.
gender_user_dict = dict(M="male", F="female")

In [8]:
# Age code stored in the Age column -> age-range label used in the generated description.
age_user_dict = dict(
    [
        (1, "under 18"),
        (18, "18-24"),
        (25, "25-34"),
        (35, "35-44"),
        (45, "45-49"),
        (50, "50-55"),
        (56, "56+"),
    ]
)

In [9]:
# Occupation code stored in the Occupation column -> label used in the generated
# description. The codes are the consecutive integers 0..20, so the position of each
# label in the list below is its code.
occupation_user_dict = dict(
    enumerate(
        [
            "other or not specified",
            "academic/educator",
            "artist",
            "clerical/admin",
            "college/grad student",
            "customer service",
            "doctor/health care",
            "executive/managerial",
            "farmer",
            "homemaker",
            "K-12 student",
            "lawyer",
            "programmer",
            "retired",
            "sales/marketing",
            "scientist",
            "self-employed",
            "technician/engineer",
            "tradesman/craftsman",
            "unemployed",
            "writer",
        ]
    )
)

## Gerando descrições de cada usuário

In [10]:
# Build one natural-language description per user, in the same row order as `users`.
# A single pass over the columns replaces the per-ID boolean filter of the whole table
# (quadratic in the number of users) and no longer assumes that the IDs are exactly
# 1..user_num with no gaps; the descriptions line up row-for-row with `users`.
user_descriptions = [
    (
        f"This is user ID {user_id}. "
        f"Their gender is {gender_user_dict[gender]} and their age falls within "
        f"the age group of {age_user_dict[age]} years old. "
        f"Their occupation is as {occupation_user_dict[occupation]} "
        f"and their Zip code is {zip_code}."
    )
    for user_id, gender, age, occupation, zip_code in zip(
        users["UserID"],
        users["Gender"],
        users["Age"],
        users["Occupation"],
        users["Zip-code"],
    )
]

In [11]:
# Spot-check the first five generated descriptions.
user_descriptions[:5]

['This is user ID 1. Their gender is female and their age falls within the age group of under 18 years old. Their occupation is as K-12 student and their Zip code is 48067.',
 'This is user ID 2. Their gender is male and their age falls within the age group of 56+ years old. Their occupation is as self-employed and their Zip code is 70072.',
 'This is user ID 3. Their gender is male and their age falls within the age group of 25-34 years old. Their occupation is as scientist and their Zip code is 55117.',
 'This is user ID 4. Their gender is male and their age falls within the age group of 45-49 years old. Their occupation is as executive/managerial and their Zip code is 02460.',
 'This is user ID 5. Their gender is male and their age falls within the age group of 25-34 years old. Their occupation is as writer and their Zip code is 55455.']

In [12]:
# Pair each UserID with its description. The list is assigned positionally, so the
# result does not depend on the index labels of `users`: wrapping it in pd.Series would
# align on labels 0..n-1 and silently produce NaN for any other index. A length
# mismatch now raises instead of being padded with NaN.
df_users_with_descriptions = users[["UserID"]].copy()
df_users_with_descriptions["Description_User"] = user_descriptions

In [13]:
# Display the UserID / Description_User table.
df_users_with_descriptions

Unnamed: 0,UserID,Description_User
0,1,This is user ID 1. Their gender is female and ...
1,2,This is user ID 2. Their gender is male and th...
2,3,This is user ID 3. Their gender is male and th...
3,4,This is user ID 4. Their gender is male and th...
4,5,This is user ID 5. Their gender is male and th...
...,...,...
6035,6036,This is user ID 6036. Their gender is female a...
6036,6037,This is user ID 6037. Their gender is female a...
6037,6038,This is user ID 6038. Their gender is female a...
6038,6039,This is user ID 6039. Their gender is female a...


### Embeddings

Aqui, usaremos o modelo de embedding `text-embedding-3-large` da OpenAI para transformar a descrição de cada usuário em um vetor de 3072 dimensões.

In [14]:
# Embedding client used for every user description; authenticated with the key read
# from the environment earlier in the notebook.
model_embedding = OpenAIEmbeddings(
    api_key=OPENAI_API_KEY,
    model="text-embedding-3-large",
)

In [15]:
# Embed every user description with the model configured above.
# NOTE(review): this presumably issues remote OpenAI API calls for all descriptions on
# every run — consider caching the result to disk; confirm cost/latency expectations.
embeddings_descriptions = model_embedding.embed_documents(user_descriptions)

In [16]:
# Convert the embeddings into a NumPy array (rebinding the same name) and display it.
embeddings_descriptions = np.array(embeddings_descriptions)
embeddings_descriptions

array([[-0.01466561,  0.01815529,  0.00219873, ...,  0.00218384,
        -0.00698309, -0.00733652],
       [-0.01282   ,  0.02362706,  0.00398305, ...,  0.01102834,
        -0.00813028, -0.02171405],
       [ 0.00053483,  0.00966578, -0.00170056, ..., -0.00443754,
         0.00669141, -0.01302072],
       ...,
       [-0.0236368 ,  0.01695714, -0.00064365, ...,  0.00500277,
        -0.00973362, -0.02772269],
       [-0.02795208,  0.0302692 ,  0.00151786, ..., -0.00430794,
         0.00266909, -0.01808233],
       [-0.02683551,  0.03179844,  0.00302754, ..., -0.00322628,
        -0.00218986, -0.02624114]])

In [17]:
# Check the array dimensions: one row per description, one column per embedding component.
embeddings_descriptions.shape

(6040, 3072)

In [66]:
# Wrap the embedding matrix in a DataFrame; the columns get default integer labels.
embeddings_descriptions_df = pd.DataFrame(data=embeddings_descriptions)
# Show the first five rows.
embeddings_descriptions_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3062,3063,3064,3065,3066,3067,3068,3069,3070,3071
0,-0.014662,0.018106,0.002178,-0.012393,-0.012535,-0.010481,0.027524,0.010013,0.04636,0.028268,...,0.010348,0.003137,0.002457,-0.019222,0.019401,0.014409,-0.002503,0.002183,-0.006985,-0.007331
1,-0.012858,0.023589,0.003955,-0.035383,0.00569,0.023289,0.037582,0.005869,0.0305,0.051661,...,0.022818,0.00367,-0.007918,-0.005562,0.012001,-0.016935,-0.005936,0.011045,-0.008153,-0.021732
2,0.000537,0.00965,-0.001674,-0.021769,-0.011498,0.008601,0.008734,0.011616,0.044158,0.044188,...,-0.004027,0.013545,-0.002939,-0.013626,0.018636,-0.001604,-0.001838,-0.004459,0.00668,-0.01302
3,-0.025381,0.030772,-0.003485,-0.016098,0.001549,0.014773,0.026653,0.007973,0.019929,0.039192,...,0.01057,0.017127,-0.00368,0.002359,0.010305,0.009745,-0.005834,0.003002,-0.004127,-0.020217
4,-0.015836,0.022,-0.003704,-0.032273,-0.016916,0.024609,0.011112,0.008571,0.017081,0.027459,...,-0.000694,0.009988,-0.006194,-0.009515,0.016181,0.004349,-0.010415,-0.010438,-0.012582,-0.018476


In [67]:
# Put the UserID/description columns and the embedding columns side by side.
# Column-wise concat matches rows by index label.
df_users_with_descriptions_and_embeddings = pd.concat(
    objs=[df_users_with_descriptions, embeddings_descriptions_df],
    axis="columns",
)
# Show the first five rows.
df_users_with_descriptions_and_embeddings.head()

Unnamed: 0,UserID,Description_User,0,1,2,3,4,5,6,7,...,3062,3063,3064,3065,3066,3067,3068,3069,3070,3071
0,1,This is user ID 1. Their gender is female and ...,-0.014662,0.018106,0.002178,-0.012393,-0.012535,-0.010481,0.027524,0.010013,...,0.010348,0.003137,0.002457,-0.019222,0.019401,0.014409,-0.002503,0.002183,-0.006985,-0.007331
1,2,This is user ID 2. Their gender is male and th...,-0.012858,0.023589,0.003955,-0.035383,0.00569,0.023289,0.037582,0.005869,...,0.022818,0.00367,-0.007918,-0.005562,0.012001,-0.016935,-0.005936,0.011045,-0.008153,-0.021732
2,3,This is user ID 3. Their gender is male and th...,0.000537,0.00965,-0.001674,-0.021769,-0.011498,0.008601,0.008734,0.011616,...,-0.004027,0.013545,-0.002939,-0.013626,0.018636,-0.001604,-0.001838,-0.004459,0.00668,-0.01302
3,4,This is user ID 4. Their gender is male and th...,-0.025381,0.030772,-0.003485,-0.016098,0.001549,0.014773,0.026653,0.007973,...,0.01057,0.017127,-0.00368,0.002359,0.010305,0.009745,-0.005834,0.003002,-0.004127,-0.020217
4,5,This is user ID 5. Their gender is male and th...,-0.015836,0.022,-0.003704,-0.032273,-0.016916,0.024609,0.011112,0.008571,...,-0.000694,0.009988,-0.006194,-0.009515,0.016181,0.004349,-0.010415,-0.010438,-0.012582,-0.018476


In [73]:
# Persist the combined table as a CSV file inside DATA_DIR.
# NOTE(review): no `index` argument is passed, so to_csv also writes the DataFrame index
# as a leading unnamed column; readers need index_col=0 (or this call needs index=False).
# Confirm what downstream code expects before changing it.
df_users_with_descriptions_and_embeddings.to_csv(os.path.join(DATA_DIR, "users_with_descriptions_and_embeddings.csv"))