In [2]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import datasets
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizerFast, LlamaForCausalLM, LlamaTokenizer
from sklearn.metrics import classification_report
from peft import PeftModel

from eval_utils import *

df_test = pd.read_excel("data/multipleChoice.xlsx" )
# only keep first 50 rows
df_test


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/sebastianquintero/Library/CloudStorage/OneDrive-QuinteroOrthodontics/MIT/MEng/rag-thesis/.venv/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/sebastianquintero/Library/CloudStorage/OneDrive-QuinteroOrthodontics/MIT/MEng/rag-thesis/.venv/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    ap

Unnamed: 0,Question,OptionA,OptionB,OptionC,OptionD,Answer,Feedback,theory,numerical,grouping,fill_in_blank,true_false
0,One of the reasons protectionists and governme...,quotas generate more revenue for the governmen...,quotas ensure that the quantities of imports a...,quotas create less market distortions than tar...,quotas give less power to politicians than tar...,B,,1,0,0,0,0
1,"In the case of a small country, the effects of...",the government allocates licenses for free to ...,the government auctions off import licenses to...,the government allocates licenses to importers...,the government allocates import licenses direc...,B,,0,1,0,0,0
2,A small country imports T-shirts. With free tr...,gain $5 million.,lose $5 million.,gain $25 million.,gain $30 million.,C,,0,1,0,0,0
3,A small country imports T-shirts. With free tr...,gain $7 million.,lose $7 million.,lose $70 million.,lose $77 million.,D,,0,1,0,0,0
4,A small country imports T-shirts. With free tr...,$30 million,$40 million,$70 million,$240 million,B,,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
218,Specific factors are more likely to favor trad...,True,False,,,A,Workers can move away from the sector whose pr...,0,0,0,0,1
219,Factor prices are more likely to be equalized ...,True,False,,,B,"When countries are not completely specialized,...",0,0,0,0,1
220,The European Common Agricultural Policy is a b...,True,False,,,B,The European Common Agricultural Policy entail...,0,0,0,0,1
221,Immigration necessarily lowers wages.,True,False,,,B,"For example, when factor price equalization ho...",0,0,0,0,1


In [3]:
# Sum up the dummy variables to get counts for each category
category_counts = df_test[['theory', 'numerical', 'grouping', 'fill_in_blank', 'true_false']].sum()

# Calculate percentages
total_questions = len(df_test)
category_percentages = (category_counts / total_questions * 100).round(2)

# Create a summary DataFrame
summary_df = pd.DataFrame({
    'Count': category_counts,
    'Percentage (%)': category_percentages
})

print("Question Type Distribution:")
print("==========================")
print(summary_df)
print(f"\nTotal number of questions: {total_questions}")

Question Type Distribution:
               Count  Percentage (%)
theory            36           16.14
numerical         11            4.93
grouping           5            2.24
fill_in_blank     82           36.77
true_false        89           39.91

Total number of questions: 223


In [8]:
prompt = "Answer the following multiple choice question with ONLY a single letter (A, B, C, or D). Do not include any other text, punctuation, or explanation - just the letter.\n\
Question: {}\n\
A) {}\n\
B) {}\n\
C) {}\n\
D) {}\n\
Answer:"

prompt_TF = "Answer the following true/false question. Respond only with the letter A if the statement is true, or B if the statement is false. Do not include any other text, punctuation, or explanation - just the letter.\n\
Question: {}\n\
A) {}\n\
B) {}\n\
Answer:"

for MODEL_NAME in ["gpt-35-turbo"]:
    y_pred = []
    y_true = []
    
    for row in tqdm(df_test.iterrows(), total=len(df_test)):
        time.sleep(1)
        question = row[1]["Question"]
        choice_a = row[1]["OptionA"]
        choice_b = row[1]["OptionB"]
        choice_c = row[1]["OptionC"]
        choice_d = row[1]["OptionD"]
        label = row[1]["Answer"].strip().upper()  # Clean the true label
        
        if MODEL_NAME == "gpt-35-turbo":
            if row[1]["true_false"]:
                prediction = prompt_chatgpt_with_backoff(prompt_TF.format(
                    question, choice_a, choice_b))
            else:
                prediction = prompt_chatgpt_with_backoff(prompt.format(
                    question, choice_a, choice_b, choice_c, choice_d))
        
        # Clean and extract the predicted answer letter more carefully
        pred_letter = prediction.strip().upper()
        # If the prediction contains more than just the letter, take first word
        if len(pred_letter) > 1:
            pred_letter = pred_letter.split()[0]
        
        # Validate prediction format
        if pred_letter not in ['A', 'B', 'C', 'D']:
            print(f"Warning: Invalid prediction format: '{prediction}' for question: {question}")
            pred_letter = 'INVALID'
        
        y_pred.append(pred_letter)
        y_true.append(label)

        # Print debugging info for all predictions
        if pred_letter != label:
            print("\nIncorrect Prediction:")
            print(f"Question: {question}")
            print(f"Choices:")
            print(f"A) {choice_a}")
            print(f"B) {choice_b}")
            if not row[1]["true_false"]:
                print(f"C) {choice_c}")
                print(f"D) {choice_d}")
            print(f"Raw model response: '{prediction}'")
            print(f"Processed prediction: '{pred_letter}'")
            print(f"Correct answer: '{label}'")
            print("=========================================================")

    # Calculate overall accuracy
    total_questions = len(y_true)
    correct_answers = sum(1 for pred, true in zip(y_pred, y_true) if pred == true)
    overall_accuracy = correct_answers / total_questions

    # Print detailed matching information
    print("\nDetailed Answer Matching:")
    print("------------------------")
    mismatches = [(i, pred, true) for i, (pred, true) in enumerate(zip(y_pred, y_true)) if pred != true]
    print(f"Found {len(mismatches)} mismatches out of {total_questions} questions")
    
    # Calculate per-category accuracies
    categories = ['theory', 'numerical', 'grouping', 'fill_in_blank', 'true_false']
    category_results = {}
    
    for category in categories:
        # Get questions belonging to this category
        category_indices = df_test[df_test[category] == 1].index
        
        if len(category_indices) > 0:
            category_correct = sum(1 for i in category_indices if y_pred[i] == y_true[i])
            category_accuracy = category_correct / len(category_indices)
            category_results[category] = {
                'accuracy': category_accuracy,
                'correct': category_correct,
                'total': len(category_indices)
            }

    # Print results
    print(f"\nResults for {MODEL_NAME}")
    print("=" * 50)
    print(f"Overall Accuracy: {overall_accuracy:.4f} ({correct_answers}/{total_questions} correct)")
    print("\nPer-Category Performance:")
    print("-" * 30)
    
    for category, results in category_results.items():
        print(f"{category.replace('_', ' ').title()}:")
        print(f"  Accuracy: {results['accuracy']:.4f}")
        print(f"  Correct: {results['correct']}/{results['total']}")
        print()

    # Save results to file
    with open("results/LLM_results.txt", "a") as f:
        f.write(f"\nResults for {MODEL_NAME}\n")
        f.write("=" * 50 + "\n")
        f.write(f"Overall Accuracy: {overall_accuracy:.4f} ({correct_answers}/{total_questions} correct)\n")
        f.write("\nPer-Category Performance:\n")
        f.write("-" * 30 + "\n")
        
        for category, results in category_results.items():
            f.write(f"{category.replace('_', ' ').title()}:\n")
            f.write(f"  Accuracy: {results['accuracy']:.4f}\n")
            f.write(f"  Correct: {results['correct']}/{results['total']}\n\n")
        f.write("\n")

  1%|          | 2/223 [00:11<24:38,  6.69s/it]


Incorrect Prediction:
Question: In the case of a small country, the effects of a quota and a tariff are (almost) identical if:
Choices:
A) the government allocates licenses for free to importers using a rule or process that involves (almost) no resource cost.
B) the government auctions off import licenses to the highest bidder.
C) the government allocates licenses to importers through application and selection procedures that require the use of substantial resources.
D) the government allocates import licenses directly to the public using a free lottery system.
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


  2%|▏         | 4/223 [00:32<33:28,  9.17s/it]


Incorrect Prediction:
Question: A small country imports T-shirts. With free trade at a world price of $10, domestic production is 10 million T-shirts and domestic consumption is 42 million T-shirts. The country's government now decides to impose a quota to limit T-shirt imports to 20 million per year. With the import quota in place, the domestic price rises to $12 per T-shirt and domestic production rises to 15 million T-shirts per year. The quota on T-shirts causes domestic consumers to:
Choices:
A) gain $7 million.
B) lose $7 million.
C) lose $70 million.
D) lose $77 million.
Raw model response: 'B'
Processed prediction: 'B'
Correct answer: 'D'


  2%|▏         | 5/223 [00:43<36:15,  9.98s/it]

Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-12-01-preview have exceeded call rate limit of your current AIServices S0 pricing tier. Please retry after 1 second. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit. For Free Account customers, upgrade to Pay as you Go here: https://aka.ms/429TrialUpgrade.'}}


  3%|▎         | 6/223 [01:28<1:17:58, 21.56s/it]


Incorrect Prediction:
Question: A small country imports T-shirts. With free trade at a world price of $10, domestic production is 10 million T-shirts and domestic consumption is 42 million T-shirts. The country's government now decides to impose a quota to limit T-shirt imports to 20 million per year. With the import quota in place, the domestic price rises to $12 per T-shirt and domestic production rises to 15 million T-shirts per year. If the government auctions the import licenses, the national well-being will ________ by:
Choices:
A) increase; $40 million.
B) decrease; $12 million.
C) increase; $65 million.
D) decrease; $5 million.
Raw model response: 'C'
Processed prediction: 'C'
Correct answer: 'B'


  4%|▎         | 8/223 [01:48<55:36, 15.52s/it]  


Incorrect Prediction:
Question: When a large country imposes an import quota:
Choices:
A) the product's world price rises.
B) the product's world price falls.
C) the product's domestic price falls.
D) domestic production of the product falls.
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


  4%|▍         | 10/223 [02:11<47:27, 13.37s/it]


Incorrect Prediction:
Question: With a voluntary export restraint (VER), the markup revenue (economic rent) created for the quantitatively limited trade is collected by:
Choices:
A) the government of the importing county.
B)  the consumers in the importing country.
C) the producers in the importing country.
D) the exporting firms in the foreign countries
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'D'


  6%|▌         | 13/223 [02:42<40:02, 11.44s/it]


Incorrect Prediction:
Question: The World Trade Organization has rules that try to limit the use of tariffs but not the use of nontariff barriers (NTBs).
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


  7%|▋         | 15/223 [03:05<39:20, 11.35s/it]


Incorrect Prediction:
Question: Unlike a tariff, a quota does not cause either a production effect or a consumption effect.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 13%|█▎        | 28/223 [05:28<36:45, 11.31s/it]


Incorrect Prediction:
Question:  If the government's goal is to induce early production, even when the new firms are not cost competitive by world standards, a barrier to the import of the product produced by these firms would be an ideal policy.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 13%|█▎        | 30/223 [05:49<34:59, 10.88s/it]


Incorrect Prediction:
Question: If the domestic firms do not supply anything at the world price, the government should lower the barriers to importing the product to spur domestic production.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 14%|█▍        | 31/223 [05:59<34:20, 10.73s/it]


Incorrect Prediction:
Question: In a first-best world, imports would not cause import-competing firms to go out of business and workers in these industries to lose their jobs.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 15%|█▍        | 33/223 [06:20<33:22, 10.54s/it]


Incorrect Prediction:
Question: The most efficient policy to maintain production in import-competing industries is to impose barriers on imports.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 15%|█▌        | 34/223 [06:30<33:01, 10.49s/it]


Incorrect Prediction:
Question: In a first-best world, if rising import competition is driving domestic producers out of business, the government must intervene to protect the domestic firms.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 16%|█▌        | 35/223 [06:41<32:45, 10.46s/it]


Incorrect Prediction:
Question: In the United States, trade adjustment assistance:
Choices:
A) provides workers who have been displaced from import-competing firms with additional months of unemployment compensation.
B) provides subsidies to firms who produce exportable commodities.
C) provides incentives for workers to search for new jobs outside an import-competing industry before they lose their jobs in this industry.
D) is often criticized on the ground that it provides benefits to millions of workers each year who have not actually been affected by increased imports.
Raw model response: 'D'
Processed prediction: 'D'
Correct answer: 'A'


 19%|█▉        | 42/223 [07:59<33:50, 11.22s/it]


Incorrect Prediction:
Question: A tariff on a good when the world price is lower than the domestic price leads to:
Choices:
A) tariff revenues that will be lower than under free trade.
B) domestic imports that will be higher than under free trade.
C) lower domestic consumption of the good than under free trade.
D) lower domestic production of the good than under free trade.
Raw model response: 'B'
Processed prediction: 'B'
Correct answer: 'C'


 22%|██▏       | 49/223 [09:15<31:40, 10.92s/it]


Incorrect Prediction:
Question: Which of the following statements is TRUE?
I. If the United States bans the importation of bananas, consumer surplus will decrease.
II. If the United States bans the importation of bananas, producer surplus will decrease.
III. If the United States bans the importation of bananas, it will produce bananas at a
cost exceeding their world purchase price.
Choices:
A) I, II, and III
B) I and II only
C) I and III only
D) II and III only
Raw model response: 'B'
Processed prediction: 'B'
Correct answer: 'C'


 23%|██▎       | 51/223 [09:36<31:18, 10.92s/it]


Incorrect Prediction:
Question: Suppose that a tariff increases domestic production of a good from 25 million units to 75 million units and raises the domestic price by $1.50. Assuming a linear domestic supply curve and a perfectly elastic world supply curve, what is the value of the resources wasted by increased domestic production?
Choices:
A) $37.5 million
B) $50 million
C) $75 million
D) $150 million
Raw model response: 'C'
Processed prediction: 'C'
Correct answer: 'A'


 23%|██▎       | 52/223 [09:47<30:47, 10.80s/it]


Incorrect Prediction:
Question: If a tariff decreases domestic consumption of a good from 230 million units to 150 million units and raises the domestic price by $1.50, given a linear domestic demand curve and a perfectly elastic world supply curve, what is the value of the unexploited gains from trade caused by decreased domestic consumption?
Choices:
A) $45 million
B) $60 million
C) $80 million
D) $120 million
Raw model response: 'C'
Processed prediction: 'C'
Correct answer: 'B'


 27%|██▋       | 61/223 [11:26<30:54, 11.45s/it]


Incorrect Prediction:
Question: Which statement provides an explanation for tariffs decreasing market efficiency?
Choices:
A) The supply of goods is not purchased by the buyers with the highest willingness to pay.
B) The supply of goods is not produced by the lowest-cost suppliers.
C) Prices are not equal to the equilibrium price.
D) Deadweight loss is equal to zero
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 33%|███▎      | 73/223 [13:37<27:33, 11.03s/it]


Incorrect Prediction:
Question: International Trade does the following:
Choices:
A) decreases the number of jobs.
B) increases the number of jobs.
C) moves jobs from export industries to import-competing industries.
D) moves jobs from import-competing industries to export industries.
Raw model response: 'B'
Processed prediction: 'B'
Correct answer: 'D'


 36%|███▌      | 80/223 [14:56<26:55, 11.30s/it]


Incorrect Prediction:
Question: History has shown that one of the most effective tools against child labor is:
Choices:
A) regulations
B) laws.
C) economic growth.
D) quotas.
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'C'


 40%|███▉      | 89/223 [16:33<23:57, 10.73s/it]


Incorrect Prediction:
Question: Consider the following two statements and select the best answer.
I. The national security argument might be a valid argument for trade protection.
II. Industries with spillover effects should be protected from foreign competition.
Choices:
A) I and II are both true.
B) I and II are both false.
C) I is likely to be true, and II is likely to be false.
D) I is likely to be false, and II is likely to be true.
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'C'


 41%|████▏     | 92/223 [17:05<23:05, 10.58s/it]


Incorrect Prediction:
Question: The flu pandemic of 1918 provides an example of:
Choices:
A) a situation for which it makes sense to protect a domestic industry from
international competition.
B) how trade restrictions lead to deaths and suffering.
C) how child labor affects trade flows between countries.
D) strategic trade protectionism.
Raw model response: 'B'
Processed prediction: 'B'
Correct answer: 'A'


 42%|████▏     | 93/223 [17:15<22:46, 10.51s/it]


Incorrect Prediction:
Question: Which statement is TRUE?
Choices:
A) Import restrictions may be the best policy if production in certain industries
generates positive spillover effects to other industries.
B) It is usually easy to identify industries that generate positive spillover effects.
C) It would be more efficient to subsidize industries that generate positive spillover
effects than to implement import restrictions.
D) Spillover effects from Silicon Valley were the biggest factor leading to increases in
productivity in the 1990s.
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'C'


 43%|████▎     | 97/223 [17:57<21:59, 10.48s/it]


Incorrect Prediction:
Question: The United States is not competitive with Brazil in sugar production partly because:
Choices:
A) the opportunity cost of land suitable for sugar production in the United States is
relatively high.
B) the opportunity cost of land suitable to sugar production in Brazil is relatively high.
C) the United States does not focus upon sugar production.
D) Brazil does not focus upon sugar production.
Raw model response: 'B'
Processed prediction: 'B'
Correct answer: 'A'


 44%|████▍     | 99/223 [18:18<21:33, 10.43s/it]


Incorrect Prediction:
Question: Governments can use tariffs to help domestic firms act like a cartel when selling to
international buyers:
Choices:
A) if it's unlikely that other governments would impose retaliatory tariffs.
B) and if all governments do this, greater gains are realized by all countries.
C) only if international buyers have few substitutes for the domestic good.
D) but there are no actual examples of governments trying to do this.
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'C'


 45%|████▌     | 101/223 [18:39<21:09, 10.41s/it]


Incorrect Prediction:
Question: If the U.S. government wanted to use strategic trade protectionism for U.S.-produced fertilizer it would:
Choices:
A) place high taxes on foreign-made fertilizer.
B) place a trade quota on foreign-made fertilizer.
C) subsidize U.S. producers of fertilizer.
D) place a tax or put a limit on the exports of U.S. fertilizer.
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'D'


 46%|████▌     | 103/223 [19:01<21:13, 10.61s/it]


Incorrect Prediction:
Question: The economics of international trade is substantially different from that of ordinary
trade.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 47%|████▋     | 104/223 [19:12<21:41, 10.94s/it]


Incorrect Prediction:
Question: International and intranational trade are very different in terms of economic analysis.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 48%|████▊     | 106/223 [19:34<21:15, 10.90s/it]


Incorrect Prediction:
Question: In the case of sugar, moving from a situation of no trade to free trade causes both
domestic consumption and production to increase.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 48%|████▊     | 108/223 [19:56<21:01, 10.97s/it]


Incorrect Prediction:
Question: Trade makes people better off when preferences are the same.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 50%|█████     | 112/223 [20:39<20:26, 11.05s/it]


Incorrect Prediction:
Question: If the United States imports teacups from other countries, then U.S. producers of teacups are better off, and U.S. consumers of teacups are worse off, as a result of trade.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 52%|█████▏    | 115/223 [21:11<19:24, 10.79s/it]


Incorrect Prediction:
Question: A quota is a stated quality standard that an imported good must reach before it can be
allowed into the borders of the importing country.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 52%|█████▏    | 117/223 [21:33<19:02, 10.78s/it]


Incorrect Prediction:
Question: If the world price of a good is greater than the domestic price in a country that can engage in international trade, then that country becomes an importer of that good.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 53%|█████▎    | 118/223 [21:45<19:08, 10.94s/it]


Incorrect Prediction:
Question: The tariff diagram illustrates that if the absolute value of the slopes of the demand and supply curves are equal, then the deadweight loss of any tariff always equals the wasted resources due to increased domestic production.
Choices:
A) True
B) False
Raw model response: 'B'
Processed prediction: 'B'
Correct answer: 'A'


 56%|█████▌    | 124/223 [22:50<17:35, 10.66s/it]


Incorrect Prediction:
Question: Protectionism policies restrain trade through price controls that burden foreign producers but not domestic producers.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 61%|██████    | 136/223 [25:00<15:55, 10.98s/it]


Incorrect Prediction:
Question: Free trade has reduced the number of jobs in U.S. manufacturing as well as the overall
number of jobs in the U.S. economy.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 64%|██████▍   | 143/223 [26:16<14:29, 10.87s/it]


Incorrect Prediction:
Question: Which statement below is correct?
Choices:
A) The HO model assumes that all resources can freely move between industries.
B) The specific-factors model assumes that all resources can freely move between industries.
C) Both the HO and the specific-factor models assume that all resources can freely move between industries.
D) Neither the HO nor the specific-factor model assumes that all resources can freely move between industries.
Raw model response: 'D'
Processed prediction: 'D'
Correct answer: 'A'


 65%|██████▍   | 144/223 [26:27<14:29, 11.01s/it]


Incorrect Prediction:
Question: The Heckscher-Ohlin model assumes that factors of production can move freely _______ , but cannot move _______.
Choices:
A) domestically; internationally
B) after they are fully trained; before the training period is over
C) internationally; domestically
D) within unskilled occupations; into high-skill jobs
Raw model response: 'C'
Processed prediction: 'C'
Correct answer: 'A'


 65%|██████▌   | 145/223 [26:37<14:03, 10.82s/it]


Incorrect Prediction:
Question: In a capital-intensive industry, the capital/labor ratio will:
Choices:
A) rise as the wage/rental ratio falls.
B) fall as the wage/rental ratio falls.
C) rise as the country's capital stock rises.
D) fall as the country's capital stock falls.
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 65%|██████▌   | 146/223 [26:49<14:11, 11.06s/it]


Incorrect Prediction:
Question: The Heckscher-Ohlin model assumes that production techniques within a nation use the factors of production:
Choices:
A) at different intensities depending on changing technology and which nation you are
discussing.
B) at different intensities for each industry, so that one is more or less intensive in that factor
than the other.
C) at the same intensity for each industry—for example, the ratio of capital to labor is the
same for every industry in the nation.
D) in no definite pattern.
Raw model response: 'C'
Processed prediction: 'C'
Correct answer: 'B'


 66%|██████▋   | 148/223 [27:11<13:49, 11.06s/it]


Incorrect Prediction:
Question: Which of the following is not an assumption that the Heckscher-Ohlin model makes?
Choices:
A) The quantity of capital and labor in two nations is different for each nation—so we have different “endowments” of capital and labor.
B) The quantity of capital and labor in two nations is relatively abundant in one nation and relatively scarce in the other.
C) The quantity of capital and labor in two nations is fixed in the short run.
D)  Labor and capital move between countries.
Raw model response: 'C'
Processed prediction: 'C'
Correct answer: 'D'


 67%|██████▋   | 150/223 [27:33<13:27, 11.06s/it]


Incorrect Prediction:
Question: The Possibilities Production Frontier is bowed out in the Heckscher-Ohlin model because:
Choices:
A) capital is specific to computer production.
B) labor is specific to shoe production.
C) capital is better suited to computer production than shoe production.
D) labor is specific to shoe production, capital is specific to computer production, and capital
is better suited to computer production than shoe production.
Raw model response: 'D'
Processed prediction: 'D'
Correct answer: 'C'
Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-12-01-preview have exceeded call rate limit of your current AIServices S0 pricing tier. Please retry after 4 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit. For Free Account customers, upgrade to Pay as you Go here: https://aka.ms/429TrialUpgrade.'}}


 69%|██████▊   | 153/223 [28:30<16:57, 14.53s/it]


Incorrect Prediction:
Question: Consider two products, automobiles and shoes. If shoes are labor intensive and automobiles are capital intensive, what can we expect in free-trade conditions?
Choices:
A) The relative price of automobiles in the auto-exporting country will decrease.
B) The relative price of shoes in the shoe-exporting country will increase.
C) More shoes will be produced by the capital-abundant country.
D) More automobiles will be produced by the labor-abundant country.
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 70%|███████   | 157/223 [29:12<12:32, 11.39s/it]


Incorrect Prediction:
Question: If there are only two nations, one nation's exports are the other's imports; which of the following is identical for both nations?
Choices:
A) equilibrium relative price
B) trade triangle
C) opportunity cost
D) equilibrium relative price, trade triangle, and opportunity cost
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'D'


 71%|███████   | 158/223 [29:22<12:01, 11.10s/it]


Incorrect Prediction:
Question: Suppose that Home is a labor-abundant country. When trade occurs with Foreign, a capital-
abundant country, the Hecksher-Ohlin model predicts that:
Choices:
A) the price of the labor-intensive good will rise in Home.
B) the price of the labor-intensive good will rise in Foreign.
C) the price of the capital-intensive good will fall in Foreign.
D) the price of the capital-intensive good will rise in Home.
Raw model response: 'B'
Processed prediction: 'B'
Correct answer: 'A'


 73%|███████▎  | 162/223 [30:16<13:27, 13.24s/it]


Incorrect Prediction:
Question: Compared with other countries, the United States' effective factor endowment is greatest for:
Choices:
A) capital
B) R&D scientists.
C) arable land.
D) unskilled labor.
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 73%|███████▎  | 163/223 [30:27<12:26, 12.44s/it]


Incorrect Prediction:
Question: If Japanese workers receive lower wages in the production of autos compared with American workers, then:
Choices:
A) Japan must have a comparative advantage in the production of autos.
B) Japan must have an absolute advantage in the production of autos.
C)  auto production costs must be lower in Japan than in the United States.
D) auto production costs could be lower in the United States if U.S. labor productivity is higher than Japanese labor productivity.
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'D'


 74%|███████▍  | 165/223 [30:48<11:02, 11.42s/it]


Incorrect Prediction:
Question: In a labor-abundant country, free trade will cause a(n) __________ in the rental of capital and a(n) _________ in the marginal product of capital.
Choices:
A) increase; increase
B) increase; decrease
C) decrease; decrease
D) decrease; increase
Raw model response: 'B'
Processed prediction: 'B'
Correct answer: 'C'


 75%|███████▍  | 167/223 [31:09<10:18, 11.05s/it]


Incorrect Prediction:
Question: With the “opening” of trade, the item exported experiences a ________ in demand and therefore a ________ in its relative (domestic) price, whereas the item imported experiences a(n) ________ in demand and therefore a(n) ________ in its relative (domestic) price.
Choices:
A) rise, rise; decrease, decrease
B) rise, fall; increase, decrease
C) fall, fall; increase, increase
D) fall, rise; decrease, increase
Raw model response: 'B'
Processed prediction: 'B'
Correct answer: 'A'


 76%|███████▌  | 169/223 [31:33<10:22, 11.53s/it]


Incorrect Prediction:
Question: The major difference between tariffs and import quotas is that:
Choices:
A) tariffs create deadweight losses, but import quotas do not.
B) tariffs help domestic consumers, and import quotas help domestic producers.
C) tariffs raise revenue for the government, but import quotas create a surplus for import license holders.
D) All of the above are correct.
Raw model response: 'D'
Processed prediction: 'D'
Correct answer: 'C'


 77%|███████▋  | 171/223 [31:55<09:45, 11.27s/it]


Incorrect Prediction:
Question: If the Japanese steel industry subsidizes the steel which it sells to the U.S.:
Choices:
A) the U.S. should protect its domestic steel industry from this unfair competition.
B) the harm done to U.S. steel producers from this unfair competition exceeds the gain to U.S. consumers of cheap Japanese steel
C) the harm done to U.S. steel producers is less than the benefit to U.S. consumers of steel.
D) the U.S. should subsidize the products it sells to Japan.
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'C'


 77%|███████▋  | 172/223 [32:05<09:20, 10.99s/it]


Incorrect Prediction:
Question: If the U.S. threatens to impose a tariff on German cars if Germany does not remove agricultural subsidies, what happens?
Choices:
A) the U.S. will be better off no matter how Germany responds.
B) the U.S. will be better off if Germany gives in, and will be no worse off if it doesn't.
C) the U.S. will be worse off if Germany doesn’t give in to the threat.
D) the U.S. will be worse off no matter how Germany responds.
Raw model response: 'B'
Processed prediction: 'B'
Correct answer: 'C'


 78%|███████▊  | 174/223 [32:27<08:54, 10.90s/it]


Incorrect Prediction:
Question: Trade decisions are based on the concept of absolute advantage.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 79%|███████▉  | 177/223 [33:00<08:25, 10.99s/it]


Incorrect Prediction:
Question: If the world price of a good is greater than the domestic price in a country that can engage in international trade, that country would become an importer of that good.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 81%|████████  | 180/223 [33:32<07:44, 10.81s/it]


Incorrect Prediction:
Question: If Peru exports coffee to the rest of the world, Peruvian producers of coffee are worse off as a result of trade, but Peruvian consumers of coffee are better off.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 82%|████████▏ | 182/223 [33:53<07:15, 10.62s/it]


Incorrect Prediction:
Question: In general, if a country allows trade and becomes an importer of a good, domestic producers of the good are worse off, domestic consumers of the good are better off, but the economic well-being of the country decreases.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 83%|████████▎ | 184/223 [34:16<07:06, 10.93s/it]


Incorrect Prediction:
Question: Suppose the Ivory Coast, a small country, imports wheat at the world price of $4 per bushel. If the Ivory Coast imposes a tariff of $1 per bushel on imported wheat, the price of wheat in Ivory Coast will increase, but by less than $1, ceteris paribus.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 84%|████████▍ | 188/223 [35:00<06:17, 10.79s/it]


Incorrect Prediction:
Question: Suppose France imposes a tariff on imported U.S. computers. The tariff will raise the price of computers, and will make both French producers and consumers of computers worse off.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 85%|████████▍ | 189/223 [35:10<06:02, 10.66s/it]


Incorrect Prediction:
Question: If a small country imposes a tariff on an imported good, domestic sellers will gain producer surplus, the government will gain tariff revenue, and domestic consumers will gain consumer surplus.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 88%|████████▊ | 196/223 [36:27<04:52, 10.82s/it]


Incorrect Prediction:
Question: If Canada were to subsidize the production of wool blankets, and sell them in the U.S. at artificially low prices, the U.S. economy would be worse off.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 89%|████████▉ | 199/223 [36:59<04:16, 10.68s/it]


Incorrect Prediction:
Question: The United States is a very open economy.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 91%|█████████ | 202/223 [37:32<03:49, 10.92s/it]


Incorrect Prediction:
Question: A country with higher demand for high-tech goods is more likely to have a comparative advantage in high-tech sectors.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 91%|█████████ | 203/223 [37:43<03:35, 10.76s/it]


Incorrect Prediction:
Question: Growth is more likely to increase welfare if it is export-biased.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 91%|█████████▏| 204/223 [37:53<03:21, 10.63s/it]


Incorrect Prediction:
Question: According to the Ricardian model, real income is only a function of relative produc-
tivity across sectors.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 92%|█████████▏| 206/223 [38:14<02:58, 10.49s/it]


Incorrect Prediction:
Question: Consider a world economy with two countries, France and Germany. There are two goods, Wine (W ) and Beer (B), with prices pW and pB under free trade. There are LF = 100 individuals in France, who can produce 1 bottle of Wine per person, whereas there are LG = 200 individuals in Germany, who can produce 2 bottles of beer
per person. Individuals in both countries spend 1/2 of their income on Wine and 1/2 on Beer. If the number of individuals in the Germany LG grows by 10%:
Choices:
A) Individuals in France are better oﬀ, but individuals in Germany are worse oﬀ.
B) Individuals in Germany are better oﬀ, but individuals in France are worse oﬀ.
C) Individuals in both countries are better oﬀ.
D) Individuals in France are better oﬀ, but individuals in the Germany are indiﬀerent.
Raw model response: 'B'
Processed prediction: 'B'
Correct answer: 'A'


 93%|█████████▎| 207/223 [38:24<02:47, 10.49s/it]


Incorrect Prediction:
Question: Two countries, Big and Small, with identical homothetic preferences, produce two goods, Aircrafts and Computers, using only labor, with constant returns to scale. Big has a labor supply of 200, whereas Small has a labor supply of 30. In Big, the available technology requires 10 units of labor to produce one Aircraft and 4 units of labor to produce one Computer. In Small, the unit labor requirements for Aircraft and Computer are 3 and 1, respectively. Then:
Choices:
A) Only workers in Big are strictly better oﬀ with free trade than in autarky.
B) Only workers in Small are strictly better oﬀ with free trade than in autarky.
C) Workers in both countries are strictly better oﬀ with free trade than in autarky.
D) We need more information to determine which workers are strictly better oﬀ.
Raw model response: 'C'
Processed prediction: 'C'
Correct answer: 'D'
Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operatio

 94%|█████████▎| 209/223 [39:14<03:52, 16.61s/it]


Incorrect Prediction:
Question: Consider a world economy with two countries, A and B, and three goods, X, Y and Z. The goods are produced with labor alone, with constant returns to scale. Let aIJ be the labor requirement to produce a unit of good I in country J. Suppose aXA/aXB > aY A/aY B > aZA/aZB . What is a true statment concerning  the pattern of trade?
Choices:
A) A might import goods X and Z.
B) A might export goods X and Z.
C) A might export goods Y and Z.
D) A might import goods Y and Z.
Raw model response: 'B'
Processed prediction: 'B'
Correct answer: 'C'


 95%|█████████▌| 212/223 [40:05<03:21, 18.30s/it]


Incorrect Prediction:
Question: The country Rich is relatively well endowed with skilled labor whereas its trade partner, Poor, is relatively well endowed with unskilled labor. The two countries produce and freely trade two goods using the same constant-returns-to-scale technolo-
gies. The countries have identical and homothetic preferences. In this setting, when trade opens:
Choices:
A) The real wage of skilled workers in Rich must rise, the real wage of unskilled
workers in Rich must fall, and the income rise for skilled workers need not exceed
the income fall for unskilled workers.
B) The real wage of unskilled workers in Rich must rise, the real wage of skilled
workers in Rich must fall, and the income rise for unskilled workers need not exceed the income fall for skilled workers
C) The real wage of unskilled workers in Rich must rise, the real wage of skilled
workers in Rich must fall, and the income rise for unskilled workers must exceed the income fall for skilled workers.
D) T

 96%|█████████▌| 213/223 [40:15<02:39, 15.93s/it]


Incorrect Prediction:
Question: A country is more likely to benefit if
Choices:
A) It is large and taxes imports.
B) It is small and subsidizes exports.
C) It is large and subsidizes exports.
D) It is small and taxes imports.
Raw model response: 'C'
Processed prediction: 'C'
Correct answer: 'A'


 96%|█████████▌| 214/223 [40:26<02:09, 14.41s/it]


Incorrect Prediction:
Question: A country imports chocolate. Imposing an import tariff on chocolate is more likely to be better than imposing an import quota if:
Choices:
A) The country is large.
B) The country is auctioning export licenses to foreigners.
C) The country has a single producer of chocolate.
D) None of the above.
Raw model response: 'D'
Processed prediction: 'D'
Correct answer: 'C'


 96%|█████████▋| 215/223 [40:37<01:45, 13.20s/it]


Incorrect Prediction:
Question: A small open economy exports bicycles. If it introduces a subsidy on exports of bicycles, this will:
Choices:
A) Decrease exports of bicycles.
B) Decrease domestic consumption of bicycles.
C) Decrease domestic production of bicycles.
D) None of the above.
Raw model response: 'D'
Processed prediction: 'D'
Correct answer: 'B'


 97%|█████████▋| 216/223 [40:47<01:26, 12.38s/it]


Incorrect Prediction:
Question: There are two large countries, the United States and China, and two goods, solar panels and soy bean. The United exports soy beans and imports solar panels. If the United States imposes a small import tariff on solar panels, whereas China imposes a small import tariff on soy beans, then:
Choices:
A) Both countries are better off than under free trade.
B) Both countries are worse off than under free trade.
C) The United States is better off than under free trade, but China is worse off.
D) China is better off than under free trade, but the United States is worse off.
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'
Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-12-01-preview have exceeded call rate limit of your current AIServices S0 pricing tier. Please retry after 2 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would lik

 97%|█████████▋| 217/223 [41:26<02:01, 20.28s/it]


Incorrect Prediction:
Question: The production of beef generates negative externalities caused by carbon emissions. If a small open economy exports beef, it will benefit from:
Choices:
A) A small tax on beef consumption.
B) A small subsidy on beef exports.
C) Neither of the above.
D) Both of the above.
Raw model response: 'B'
Processed prediction: 'B'
Correct answer: 'C'


 98%|█████████▊| 218/223 [41:36<01:26, 17.32s/it]


Incorrect Prediction:
Question: In a Ricardian model, workers employed in import-competing sectors are more likely to oppose trade.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


 99%|█████████▊| 220/223 [42:00<00:43, 14.38s/it]


Incorrect Prediction:
Question: Factor prices are more likely to be equalized across countries if countries are completely specialized.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'


100%|██████████| 223/223 [42:31<00:00, 11.44s/it]


Incorrect Prediction:
Question: Offshoring cannot raise the real wage of the workers whose jobs are being offshored.
Choices:
A) True
B) False
Raw model response: 'A'
Processed prediction: 'A'
Correct answer: 'B'

Detailed Answer Matching:
------------------------
Found 76 mismatches out of 223 questions

Results for gpt-35-turbo
Overall Accuracy: 0.6592 (147/223 correct)

Per-Category Performance:
------------------------------
Theory:
  Accuracy: 0.7500
  Correct: 27/36

Numerical:
  Accuracy: 0.5455
  Correct: 6/11

Grouping:
  Accuracy: 0.6000
  Correct: 3/5

Fill In Blank:
  Accuracy: 0.6585
  Correct: 54/82

True False:
  Accuracy: 0.6404
  Correct: 57/89






In [8]:
from openai import AzureOpenAI
import time

# Correct endpoint format (remove the extra path and query parameters)
endpoint = "https://sebas-m88z4ckk-eastus2.cognitiveservices.azure.com/"
deployment = "gpt-35-turbo"
api_version = "2024-12-01-preview"
subscription_key = "QcCAEjEb4AjlMq3HXAqQh6dOUB7Ft6A5sWVz1ODKUU5TsVlCtkaOJQQJ99BCACHYHv6XJ3w3AAAAACOGe30X"

client = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=subscription_key,
)

def make_request_with_retry(max_retries=3, retry_delay=60):
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=deployment,
                messages=[
                    {"role": "user", "content": "What is 3+3? Give a one word response."}
                ],
                max_tokens=10,
                temperature=0.0,
                top_p=1.0
            )
            return response
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {str(e)}")
            if attempt < max_retries - 1:
                print(f"Waiting {retry_delay} seconds before retrying...")
                time.sleep(retry_delay)
            else:
                print("All attempts failed")
                raise e

# Make the request with retry logic
try:
    response = make_request_with_retry()
    print(response.choices[0].message.content)
except Exception as e:
    print(f"Final error: {str(e)}")

Six.


In [2]:
def get_zero_shot_results_from_llama(df_test, model, tokenizer):
    y_true = []
    y_pred = []
    
    for row in tqdm(df_test.iterrows(), total=len(df_test)):
        question = row[1]["Question"]
        choice_a = row[1]["OptionA"]
        choice_b = row[1]["OptionB"]
        choice_c = row[1]["OptionC"]
        choice_d = row[1]["OptionD"]
        label = row[1]["Answer"]  # Assuming label is 'A', 'B', 'C', or 'D'
        
        prompt = f"""Please answer this multiple choice question. Only respond with the letter of your answer (A, B, C, or D).

Question: {question}

A) {choice_a}
B) {choice_b}
C) {choice_c}
D) {choice_d}

Answer: """
        
        model_answer = prompt_llama_like_model(prompt, model, tokenizer, max_new_tokens=10)
        # Extract just the letter answer from the model's response
        model_answer = model_answer.split("Answer:")[-1].strip().upper()
        
        # For debugging first few examples
        if row[0] < 5:
            print(f"Question: {question}")
            print(f"Model Answer: {model_answer}")
            print(f"Correct Answer: {label}")
            print("-------------------")
        
        # Convert letter answers to predictions
        y_pred.append(model_answer)
        y_true.append(label)
    
    return y_true, y_pred

# Remove comments on your desired model

# LLAMA2-7B-chat
model_name = "../llama/Llama-2-7b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(model_name,     
                                             device_map="auto",     
                                             max_memory={2:"24GB",3:"24GB"},      
                                             torch_dtype=torch.float16,
    )
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


y_true, y_pred = get_zero_shot_results_from_llama(df_test, model, tokenizer)
print(classification_report(y_true=y_true,y_pred=y_pred,digits=4))

with open("results/LLM_results.txt","a") as f:
    f.write(model_name+", Zero-shot \n")
    f.write(classification_report(y_true=y_true,y_pred=y_pred,digits=4))
    f.write("\n")

OSError: Incorrect path_or_model_id: '../llama/Llama-2-7b-chat-hf'. Please provide either the path to a local folder or the repo_id of a model on the Hub.