In [76]:
# Welcome to your new notebook
# Type here in the cell editor to add code!
# Thank you

StatementMeta(, 0cb6d1d8-b39f-4d7b-b887-3dc665bff210, 78, Finished, Available)

## Evaluating PR Articles in AI Search with OpenAI GPT 

In [1]:
pip install python-dotenv

StatementMeta(, 8dc0afdd-2630-42eb-bbcd-38cb1ba63b85, 5, Finished, Available)

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
[33mDEPRECATION: notebookutils 3.4.1-20240110.4 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of notebookutils or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mInstalling collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
from pyspark.sql import Row, SparkSession
from dotenv import dotenv_values
import openai
from openai import OpenAI
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, FloatType
import json 
import requests

StatementMeta(, 8dc0afdd-2630-42eb-bbcd-38cb1ba63b85, 6, Finished, Available)

In [3]:
config = dotenv_values('/lakehouse/default/Files/Credentials.env')

StatementMeta(, 8dc0afdd-2630-42eb-bbcd-38cb1ba63b85, 7, Finished, Available)

In [4]:
Ai_search_key = config['Ai_search_key']
Ai_search_location = config['Ai_search_location']
Ai_search_endpoint = config['Ai_search_endpoint']
Ai_search_index = 'fabrichackathonindex'
Ai_search_name = 'fabric-hackathon'

translator_key = config['translator_api_key']
translator_location = config['translator_region']

openai_key = config['openai_api_key']
openai_deployment_name = "gpt-3.5-turbo"
openai_url = config['open_ai_endpoint']


StatementMeta(, 8dc0afdd-2630-42eb-bbcd-38cb1ba63b85, 8, Finished, Available)

In [5]:
df = spark.sql("SELECT * FROM CanadianPRScores.CanadaPRScores LIMIT 1000")
display(df)

StatementMeta(, 8dc0afdd-2630-42eb-bbcd-38cb1ba63b85, 9, Finished, Available)

SynapseWidget(Synapse.DataFrame, 7343001f-8989-48ff-a503-71226c95559b)

In [6]:
unique_rounds = df.groupBy('round_type').count().collect()
total_rounds = df.count()
rounds_count = [[row['round_type'], row['count']] for row in unique_rounds]
rounds_count.append(['total rounds', total_rounds])
for row in rounds_count:
    print(f'{row[0]} : {row[1]}')

StatementMeta(, 8dc0afdd-2630-42eb-bbcd-38cb1ba63b85, 10, Finished, Available)

French language proficiency (2024-1) : 1
Agriculture and agri-food occupations (2023-1) : 3
Healthcare occupations (2023-1) : 4
French language proficiency (2023-1) : 7
Transport occupations (2023-1) : 2
Trade occupations (2023-1) : 2
STEM occupations (2023-1) : 2
Federal Skilled Worker : 1
Canadian Experience Class : 28
Federal Skilled Trades : 7
Provincial Nominee Program : 57
General : 174
total rounds : 288


In [41]:
def get_embeddings(text):
    client = OpenAI(api_key = openai_key)

    response = client.embeddings.create(
        input = text,
        model= "text-embedding-ada-002"
    )
    embeddings = response.data[0].embedding  
    return embeddings

StatementMeta(, 8dc0afdd-2630-42eb-bbcd-38cb1ba63b85, 45, Finished, Available)

In [42]:
def gen_category_embedding(RoundType):
    """Generates embedding for each roundtype."""

    df_RoundType = spark.createDataFrame([(RoundType, 1)], ["RoundType", "dummy"])
    # Define a UDF (User Defined Function)
    get_embeddings_udf = udf(get_embeddings, ArrayType(FloatType()))
    # Apply the UDF to the 'chunk' column
    df_cat_embeddings = df_RoundType.withColumn('embeddings', get_embeddings_udf(df_RoundType['RoundType']))
    row = df_cat_embeddings.collect()[0]
    category_embedding = row.embeddings
    return category_embedding


StatementMeta(, 8dc0afdd-2630-42eb-bbcd-38cb1ba63b85, 46, Finished, Available)

In [43]:
print(gen_category_embedding("French language proficiency (2024-1)"))

StatementMeta(, 8dc0afdd-2630-42eb-bbcd-38cb1ba63b85, 47, Finished, Available)

[-0.0021424912847578526, 0.00845283642411232, 0.009884417057037354, -0.024154677987098694, -0.02373821847140789, 0.017439261078834534, -0.025521187111735344, 0.00180086400359869, -0.020133236423134804, -0.011289969086647034, 0.03100023977458477, 0.017113901674747467, 0.004408619366586208, -0.017348160967230797, -0.0038490011356770992, 0.010144704952836037, 0.021291516721248627, -0.03245784714818001, 0.02479238249361515, -0.02170797623693943, -0.01585150696337223, -0.007463743444532156, -0.007860681973397732, -0.00919465534389019, 0.00204488355666399, -0.01471925713121891, 0.004070245660841465, -0.02857956476509571, 0.010606714524328709, -0.012077338993549347, 0.03230167552828789, 0.007613408844918013, -0.015942608937621117, -0.022124435752630234, 0.008153505623340607, -0.032119475305080414, -0.020133236423134804, -0.00800384022295475, 0.02522185631096363, -0.0026337839663028717, 0.021109314635396004, 0.03292636573314667, 0.023998506367206573, -0.010554657317698002, -0.00592804746702313

In [44]:
def retrieve_top_chunks(k, category, category_embedding):
    """Retrieve the top K entries from Azure AI Search using hybrid search."""
    url = f"https://{Ai_search_name}.search.windows.net/indexes/{Ai_search_index}/docs/search?api-version=2023-11-01"

    payload = json.dumps({
        "search": category,
        "top": k,
        "vectorQueries": [
            {
                "vector": category_embedding,
                "k": k,
                "fields": "contentVector",
                "kind": "vector"
            }
        ]
    })

    headers = {
        "Content-Type": "application/json",
        "api-key": Ai_search_key,
    }

    response = requests.request("POST", url, headers=headers, data=payload)
    output = json.loads(response.text)
    return output



StatementMeta(, 8dc0afdd-2630-42eb-bbcd-38cb1ba63b85, 48, Finished, Available)

In [45]:
def get_context(category, retrieved_k = 5):
    # Generate embeddings for the categories
    category_embedding = gen_category_embedding(category)

    # Retrieve the top K entries
    output = retrieve_top_chunks(retrieved_k, category, category_embedding)

    # concatenate the content of the retrieved documents
    context = [chunk["Article_body"] for chunk in output["value"]]

    return context


StatementMeta(, 8dc0afdd-2630-42eb-bbcd-38cb1ba63b85, 49, Finished, Available)

In [46]:
context = get_context("French language proficiency (2024-1)", retrieved_k = 5)

StatementMeta(, 8dc0afdd-2630-42eb-bbcd-38cb1ba63b85, 50, Finished, Available)

In [47]:
print(context)

StatementMeta(, 8dc0afdd-2630-42eb-bbcd-38cb1ba63b85, 51, Finished, Available)

['Afghan resettlement programs in the world, and our work continues. The Levels Plan takes into account extensive engagement with provincial and territorial representatives, as well as public opinion research and consultations with various stakeholders, including businesses, community organizations, and educational institutions. The 2024 target of 485,000 new immigrants to Canada amounts to 1.2% of Canada’s current population. Canadians across the country can see how newcomers are benefiting local communities through Immigration, Refugees and Citizenship Canada’s Immigration Matters campaign. Under the Canada-Quebec Accord, Quebec establishes its own immigration levels. Related products Infographic – Immigration: a path to a stronger Canada 2024-2026 Immigration Levels Plan and Supplementary Information 2023 Annual Report to Parliament on Immigration 2023 Consultations on Immigration Levels – Final Report Canada-Quebec Accord An Immigration System for Canada’s Future Contacts Contacts 

## Create an evaluation model
feed the retrieved documents into the llm and have the llm evaluate the contents of the documents based on the following metrics . It would provide scores for each metrics evaluated 

1. Economic need: Is there a labour market or demographic need for this class
2. Sentiment: what is the current sentiment of the Canadian government for this class
3. Priority: Is this class a current priority for the Canadian government
4. Provincial need: Does this class address a provincial need
5. Recent: Has this class been mentioned recently  or created recently to fill up a need or gap
6. Goal of Canadian government: Is this stream a current goal for the Canadian government 
7. Frequency: How frequent is this stream called 


In [57]:


def prompt_engineering(context, class_frequency):
    total_sum = rounds_count[-1][-1]
    article = context
    chat_context_prompt = f"""

    article: {article} 

    Frquency of class: {class_frequency} out of {total_sum}

    You are an evaluator of Canadian permanent residence economic classes related to immgration in Canada.
    You are to use the articles above on immgration policies in Canada to evaluate the permanent resident economic class.

    Be very consistent with the format of the responses generated.

    Your final output should state the following for all evaluation metrics: Metric, Score and Explanation.

    Write each evaluation metric, the rating score and a explanation for the rating.

    Be very consistent with the format of the responses generated.

    Below are the evaluation criterias for the Canadian permanent residence economic classes.
    Use these metrics to determine how likely the class is to be selected by the Canadian government based on the contents of the articles you received. 
    Assign the scores for each metric.

    Write an explanation of 100 words for the reason of assigning that score to each metric in the Explanation column. 
    Use quotes from the articles you received to explain your rating scores for each evalutation meteric. 

    Use the metrics below
    1) Economic need: Is there a labour market or demographic need for this class? Rate Economic need out of 10. Be critical

    2) Canadan Government Sentiment: Assess the current sentiment of the Canadian government for this class? Rate Sentiment out of 10.Be critical

    3) Priority: Use both the article and frequency of class to assess priority. Is this class a current priority for the Canadian government? Rate Priority out of 10. Be critical

    4) Provincial need: Assess if this class is a Canadian provincial need. Does this class address a provincial need? Rate Provincial need out of 10. Be critical

    5) Recent: Assess how recent this class is. Has this class been mentioned recently or was it created recently to fill up a need or gap? Rate Recent out of 10. Be critical

    6) Goal of Canadian government: Use both the article and frequency of class to assess Canadian government's goal. Rate Goal of Canadian Government out of 10. Be critical

    7) Frequency: Assess how frequently this class is called given the frequency of the class is {class_frequency} and total number frequency of all the classes is {total_sum}?  How frequent is this class called. Rate Frequency out of 10. 

    Before you assign rating scores to each metrics ensure the rating score aligns with the content of the article

    Do not mention the article in your explanation. Make only quotations from the article.

    """
    one_shot_example = ''' 

    Metric: Economic need
    Score: 8
    Explanation: xxx

    Metric: Canadian Government Sentiment
    Score: 6
    Explanation: xxxx

    Metric: Priority
    Score: 7
    Explanation: xxxx 

    Metric: Provincial need
    Score: 9
    Explanation: xxxx

    Metric: Recent
    Score: 8
    Explanation: xxxx

    Metric: Goal of Canadian government
    Score: 7
    Explanation: xxxx 

    Metric: Frequency
    Score: 3
    Explanation: xxxx
    '''
    return chat_context_prompt, one_shot_example


StatementMeta(, 8dc0afdd-2630-42eb-bbcd-38cb1ba63b85, 61, Finished, Available)

In [58]:
client = OpenAI(api_key=openai_key)

def evaluator(text):
    MESSAGES = [
    {"role": "system", "content": chat_context_prompt},
    {"role": "user", "content": text},
    {"role": "assistant","content": one_shot_example },
    ]
    MESSAGES.append({"role": "user", "content": text})

    completion = client.chat.completions.create(model="gpt-4", messages=MESSAGES,temperature=0.9)
    return completion.choices[0].message.content




StatementMeta(, 8dc0afdd-2630-42eb-bbcd-38cb1ba63b85, 62, Finished, Available)

In [81]:
all_analysis = []
for i in range(len(rounds_count)-1):
    context = get_context(rounds_count[i][0], retrieved_k = 5)

    class_frequency = rounds_count[i][-1]

    context = context

    chat_context_prompt, one_shot_example = prompt_engineering(context, class_frequency)

    summary = evaluator(rounds_count[i][0])
    
    all_analysis.append([rounds_count[i][0], summary])


StatementMeta(, 8dc0afdd-2630-42eb-bbcd-38cb1ba63b85, 85, Finished, Available)

In [91]:
#Write analysis into list
dataframeList = []
for item in all_analysis:
    # Split the string into blocks
    summary = str(item[1])
    blocks = summary.strip().split("\n\n")

    # Parse each block into a dictionary
    data = {"Economic Class": item[0]}
    for block in blocks:
        lines = block.split("\n")
        metric = lines[0].split(": ")[1]
        score = int(lines[1].split(": ")[1])
        explanation = lines[2].split(": ")[1]
        data[f"{metric} Score"] = score
        data[f"{metric} Explanation"] = explanation
    dataframeList.append(data)



StatementMeta(, 8dc0afdd-2630-42eb-bbcd-38cb1ba63b85, 95, Finished, Available)

In [118]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Initialize Spark Session
spark = SparkSession.builder.getOrCreate()

# Define the schema
schema = StructType([
    StructField("Canadian_Government_Sentiment_Explanation", StringType(), True),
    StructField("Canadian_Government_Sentiment_Score", IntegerType(), True),
    StructField("Round_Type", StringType(), True),
    StructField("Economic_need_Explanation", StringType(), True),
    StructField("Economic_need_Score", IntegerType(), True),
    StructField("Frequency_Explanation", StringType(), True),
    StructField("Frequency_Score", IntegerType(), True),
    StructField("Goal_of_Canadian_government_Explanation", StringType(), True),
    StructField("Goal_of_Canadian_government_Score", IntegerType(), True),
    StructField("Priority_Explanation", StringType(), True),
    StructField("Priority_Score", IntegerType(), True),
    StructField("Provincial_need_Explanation", StringType(), True),
    StructField("Provincial_need_Score", IntegerType(), True),
    StructField("Recent_Explanation", StringType(), True),
    StructField("Recent_Score", IntegerType(), True),

])

# Create an empty DataFrame
df = spark.createDataFrame([], schema)


StatementMeta(, 8dc0afdd-2630-42eb-bbcd-38cb1ba63b85, 122, Finished, Available)

In [119]:
for item in dataframeList:
    data = item
    new_row = spark.createDataFrame([data])
    df = df.union(new_row)

StatementMeta(, 8dc0afdd-2630-42eb-bbcd-38cb1ba63b85, 123, Finished, Available)

In [120]:
display(df)

StatementMeta(, 8dc0afdd-2630-42eb-bbcd-38cb1ba63b85, 124, Finished, Available)

SynapseWidget(Synapse.DataFrame, 32b9d153-6640-49da-8bd6-88fdd31d5401)

In [121]:
delta_table_path = "Tables/CanadaPRQualitativeAnalysis"  
df.write.format("delta").mode("overwrite").save(delta_table_path)

StatementMeta(, 8dc0afdd-2630-42eb-bbcd-38cb1ba63b85, 125, Finished, Available)