# PHASE 1

#### Phase 1 began with installing the OpenAI library and importing the necessary dependencies for our project. We then accessed the OpenAI API key and tested it by prompting a question to verify if GPT-3.5 Turbo could generate a response.

In [69]:
# installing the openai
#!pip install openai
!pip install openai==0.28



In [70]:
import pandas as pd
import openai
import os
import ast
import numpy as np
import pdb

client = openai

In [71]:
# Accessing your api_key
openai.api_key = "openai_api_key"

In [72]:
# Testing the accessed gpt-3.5-turbo for prompting 
question = "what is the capital city of Rwanda?"

response = client.ChatCompletion.create(
    model = "gpt-3.5-turbo",
    messages = [{"role": "system", "content": question}]
)
response.choices[0].message.content

'The capital city of Rwanda is Kigali.'

In [75]:
question_2 = "What temperature should i set my house on vacation so the pipes don't freeze?"
response = client.ChatCompletion.create(
    model = "gpt-3.5-turbo",
    messages = [{"role": "system", "content": question_2}]
)
response.choices[0].message.content

'It is generally recommended to set the thermostat in your house to at least 55-60 degrees Fahrenheit to prevent pipes from freezing during vacation.'

In [76]:
response = client.ChatCompletion.create(
    model = "gpt-3.5-turbo",
    messages = [
        {"role": "system", "content": "You are an assistant who is helping answer questions. Please answer as if you are talking to an 50 years old child"},
        {"role": "user", "content": question_2}
    ]
)
response.choices[0].message.content

"Good question! When you go on vacation, it's a good idea to set your thermostat to at least 55 degrees Fahrenheit to prevent your pipes from freezing. This will help keep your house from getting too cold while you're away."

# PHASE 2

#### In Phase 2, we proceeded by importing the required ISCO dataset and performing data cleaning to prepare it for embedding phase.

In [77]:
# Importing the dataset
occupation = pd.read_csv("C:/Users/user/Downloads/CODE_CLASSIFICATION_PROJECT/isco_data.csv")
occupation.head()

Unnamed: 0,ISCO_CODE,"INZEGO ZO HEJURU Z’UBUTEGETSI, INZEGO Z’UBUTEGETSI ZICIRIRITSE N’IZIRI MUNSI YAZO",1,Managers
0,,"Abayobozi nshingwabikorwa,abayobozi bo hejuru ...",11,"Chief executives, senior officials and legisla..."
1,,Abashyiraho amategeko n’abategetsi bo hejuru,111,Legislators and senior officials
2,1111.0,Abashyiraho amategeko,1111,Legislators
3,1112.0,Abayobozi ba leta bo mu rwego rwo hejuru,1112,Senior government officials
4,1113.0,Abayobozi ba gakondo n’ab’inzego z’ibanze,1113,Traditional chiefs and heads of village


## Data Cleaning

#### To streamline the text processing and embedding phase, we began by cleaning our dataset. This involved dropping unnecessary columns and rows, renaming columns for clarity, and filtering the dataset to retain only the relevant rows.

In [78]:
# Drop the first two columns by their column names
occupation_cleaned = occupation.drop(columns=[occupation.columns[0], occupation.columns[1]])
occupation_cleaned.head()

Unnamed: 0,1,Managers
0,11,"Chief executives, senior officials and legisla..."
1,111,Legislators and senior officials
2,1111,Legislators
3,1112,Senior government officials
4,1113,Traditional chiefs and heads of village


In [79]:
occupation_data = occupation_cleaned

In [80]:
# Rename the first column to 'isco_code'
occupation_data.rename(columns={occupation_data.columns[0]: 'isco_code'}, inplace=True)

In [81]:
# Filter rows where the first column "1" contains exactly 4 digits
filtered_data = occupation_data[occupation_data['isco_code'].astype(str).str.match(r'^\d{4}$')]
filtered_data.head()

Unnamed: 0,isco_code,Managers
2,1111,Legislators
3,1112,Senior government officials
4,1113,Traditional chiefs and heads of village
5,1114,Senior officials of special-interest organizat...
10,1211,Finance managers


In [82]:
isco_df = filtered_data

In [83]:
isco_df.shape

(432, 2)

In [84]:
#renaming the managers column to occupation
isco_df.rename(columns = {'Managers': 'occupation'}, inplace = True)
isco_df.head(1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isco_df.rename(columns = {'Managers': 'occupation'}, inplace = True)


Unnamed: 0,isco_code,occupation
2,1111,Legislators


In [85]:
# changing the isco_code data type to string
isco_df['isco_code'] = isco_df['isco_code'].astype(str)
isco_df.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isco_df['isco_code'] = isco_df['isco_code'].astype(str)


isco_code     object
occupation    object
dtype: object

In [86]:
# joining the text from the columns with additional text in between
isco_df['Occupation_with_code'] = isco_df['occupation'] + ' have the occupational code ' + isco_df['isco_code']
isco_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isco_df['Occupation_with_code'] = isco_df['occupation'] + ' have the occupational code ' + isco_df['isco_code']


Unnamed: 0,isco_code,occupation,Occupation_with_code
2,1111,Legislators,Legislators have the occupational code 1111
3,1112,Senior government officials,Senior government officials have the occupatio...
4,1113,Traditional chiefs and heads of village,Traditional chiefs and heads of village have t...
5,1114,Senior officials of special-interest organizat...,Senior officials of special-interest organizat...
10,1211,Finance managers,Finance managers have the occupational code 1211


# PHASE 3

#### In Phase 3, we advanced to embedding the data to enable efficient text classification and retrieval.

In [88]:
# Function to generate embeddings
def get_embedding(text, model="text-embedding-ada-002"):
    # Replace newlines with spaces to avoid errors
    text = text.replace("\n", " ")
    # Calling the OpenAI API to create embeddings
    response = openai.Embedding.create(input=text, model=model)
    # Return the embedding vector
    return response['data'][0]['embedding']


In [90]:
# Getting the length of the embedding for the first row of `isco_df`
len(get_embedding(isco_df['Occupation_with_code'].iloc[0]))

1536

In [51]:
%%time

# Creating the Embeddings in the database
isco_df['embeddings'] = isco_df['Occupation_with_code'].apply(get_embedding)
isco_df.to_csv('/content/isco_data_with_embeddings.csv', index = False)
isco_df.to_pickle("C:/Users/user/Downloads/CODE_CLASSIFICATION_PROJECT/isco_data_with_embeddings.pkl")

CPU times: total: 0 ns
Wall time: 0 ns


In [52]:
isco_df.head()

Unnamed: 0,isco_code,occupation,Occupation_with_code
2,1111,Legislators,Legislators have the occupational code 1111
3,1112,Senior government officials,Senior government officials have the occupatio...
4,1113,Traditional chiefs and heads of village,Traditional chiefs and heads of village have t...
5,1114,Senior officials of special-interest organizat...,Senior officials of special-interest organizat...
10,1211,Finance managers,Finance managers have the occupational code 1211


In [61]:
%%time



CPU times: total: 15.6 ms
Wall time: 3 ms


In [91]:
%%time
isco_df_2 = pd.read_pickle("C:/Users/user/Downloads/CODE_CLASSIFICATION_PROJECT/isco_data_with_embeddings_2.pkl")
isco_df_2.head()

CPU times: total: 15.6 ms
Wall time: 172 ms


Unnamed: 0,isco_code,occupation,Occupation_with_code,embeddings
2,1111,Legislators,Legislators have the occupational code 1111,"[-0.005412097088992596, 0.005134553648531437, ..."
3,1112,Senior government officials,Senior government officials have the occupatio...,"[0.0018112791003659368, -0.00980456918478012, ..."
4,1113,Traditional chiefs and heads of village,Traditional chiefs and heads of village have t...,"[-0.0007852965500205755, 0.005625859834253788,..."
5,1114,Senior officials of special-interest organizat...,Senior officials of special-interest organizat...,"[-0.014540646225214005, -0.015147055499255657,..."
10,1211,Finance managers,Finance managers have the occupational code 1211,"[-0.0036192494444549084, -0.011559505946934223..."


# PHASE 4

**Building a RAG System**

In [92]:
# Step 1: Function to generate embeddings
def get_embedding(text, model="text-embedding-ada-002"):
    text = text.replace("\n", " ")
    response = client.Embedding.create(input=text, model=model)
    return response['data'][0]['embedding']

In [93]:
# Step 2: Function to calculate similarity (dot product)
def calculate_similarity(query_embedding, db_embeddings):
    similarities = np.dot(db_embeddings, query_embedding)
    return similarities

In [94]:
# Step 3: Query function to find and retrieve the top result
def query_system(question, df, model="text-embedding-ada-002"):
    # Generate embedding for the query
    query_embedding = get_embedding(question, model=model)

    # Calculate similarities
    db_embeddings = np.vstack(df['embeddings'])  # Convert list of embeddings to matrix
    similarities = calculate_similarity(query_embedding, db_embeddings)

    # Retrieve the most similar row
    top_idx = np.argmax(similarities)
    retrieved_row = df.iloc[top_idx]

    return retrieved_row

In [95]:
# Step 4: Generate response using GPT-3.5 Turbo
def generate_response(question, retrieved_row):
    # Format the context
    context = f"The closest match is: {retrieved_row['Occupation_with_code']}."

    # Use GPT-3.5 Turbo to generate a response
    response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {
            "role": "system",
            "content": (
                "You are a helpful assistant trained to classify job descriptions and provide occupation codes based on context. "
                "Always respond with: 'The closest match for [query] is the occupational code [code] for [occupation description].'"
            ),
        },
        {
            "role": "user",
            "content": f"Based on this context: {context}, answer the query: {question}"
        }
    ]
)


    return response['choices'][0]['message']['content']

In [68]:
# Input query
query = "I am a servant at hotel fatima"

# Retrieve the most relevant row
retrieved_row = query_system(query, isco_df_2)

# Generate the response
response = generate_response(query, retrieved_row)
print(response)

The closest match for "I am a servant at hotel fatima" is the occupational code 4319 for Other Hospitality Workers.


In [96]:
query = "This person works as a waiter at restaurant?"

# Retrieve the most relevant row
retrieved_row = query_system(query, isco_df_2)

# Generate the response
response = generate_response(query, retrieved_row)
print(response)

The closest match for this person is the occupational code 5131 for Waiters.


In [55]:
query = "This person works as a servant at restaurant?"

# Retrieve the most relevant row
retrieved_row = query_system(query, isco_df_2)

# Generate the response
response = generate_response(query, retrieved_row)
print(response)

The closest match for "This person works as a servant at restaurant?" is the occupational code 5131 for Waiters.


In [None]:
### This work is classified for It