In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q huggingface_hub
from huggingface_hub import login

# Paste your Hugging Face token (go to https://huggingface.co/settings/tokens to generate it)
login("Your API Key")

In [None]:
# 1. Install and import requirements
!pip install -q google-generativeai
import google.generativeai as genai
import pandas as pd
import time

# 2. Configure Gemini API key
genai.configure(api_key="Your API Key")

# 3. Load your dataset from Google Drive
df = pd.read_csv('/content/drive/MyDrive/Mediacloud/artificial_intelligence_uk_2025-05-05_2025-05-06_mediacloud.csv')

# 4. Define the Gemini model
model = genai.GenerativeModel("gemini-2.0-flash")

# 5. Define the filtering prompt
def build_prompt(article_text):
    return f"""
The following article discusses artificial intelligence (AI).

Please read the article and identify **which of the following frames apply**. You can apply only one frame, choose the most fitting. Only choose from this list:

1. AI impacts on businesses, economy, and jobs
2. AI transformations in education and research
3. AI in national security and global partnerships
4. AI disruptions in media and creative industries
5. AI-based innovative solutions
6. AI regulations, ethics, and data privacy
7. AI competition and market dynamics in tech industries
8. AI in healthcare and climate change
9. AI in politics, elections, and public opinion
10. Other
11. Not AI related

### Article:
{article_text}

### Instructions:
Return your answer as the name of the frame, e.g. "AI impacts on businesses, economy, and jobs".
In case of other, return "Other - name of the frame".

Do not explain your reasoning.
"""

# 6. Ask Gemini for each article
def ask_gemini(prompt):
    try:
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        print(f"Error: {e}")
        return "error"

# 7. Loop through maintext column and classify
results = []

for i, row in df.iterrows():
    article = row.get("maintext", "")
    if not isinstance(article, str) or not article.strip():
        results.append("skipped")
        continue

    prompt = build_prompt(article)
    answer = ask_gemini(prompt)
    print(f"[{i}] → {answer}")
    results.append(answer)

    # Optional: pause to avoid quota limits
    time.sleep(4)

# 8. Save to DataFrame and Drive
df["frame"] = results
df.to_csv("/content/drive/MyDrive/Mediacloud/ai_articles_with_frames_gemini.csv", index=False)
print("✅ Saved as ai_articles_with_public_opinion.csv")

[0] → AI in healthcare and climate change
[1] → AI in healthcare and climate change
[2] → Not AI related
[3] → AI impacts on businesses, economy, and jobs
[4] → AI in healthcare and climate change
[5] → AI regulations, ethics, and data privacy
[6] → AI in healthcare and climate change
[7] → AI regulations, ethics, and data privacy
[8] → AI in healthcare and climate change
[9] → AI regulations, ethics, and data privacy
[10] → AI impacts on businesses, economy, and jobs
[11] → AI regulations, ethics, and data privacy
[12] → AI disruptions in media and creative industries
[14] → Other - Climate change policy and influence
[15] → AI regulations, ethics, and data privacy
[16] → AI-based innovative solutions
[17] → AI in national security and global partnerships
[18] → AI impacts on businesses, economy, and jobs
[19] → AI impacts on businesses, economy, and jobs
[20] → AI transformations in education and research
[21] → AI in healthcare and climate change
[22] → Other - Functional Beverage M

In [None]:
df.frame.value_counts()

Unnamed: 0_level_0,count
frame,Unnamed: 1_level_1
"AI impacts on businesses, economy, and jobs",19
AI in healthcare and climate change,15
"AI regulations, ethics, and data privacy",10
AI-based innovative solutions,7
Not AI related,3
AI transformations in education and research,3
AI disruptions in media and creative industries,3
AI in national security and global partnerships,2
skipped,2
Other - Climate change policy and influence,1


In [None]:
# Step 1: Install dependencies
#!pip install -q transformers accelerate sentencepiece

# Step 2: Import libraries
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import pandas as pd
from tqdm import tqdm

# Step 3: Load the Gemma model (7B is the largest you can use for free in Colab)
model_name = "google/gemma-2b-it"  # smaller, more Colab-friendly
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [None]:
# Step 4: Load your data
#df = pd.read_csv('/content/drive/MyDrive/Mediacloud/artificial_intelligence_uk_2025-05-05_2025-05-06_mediacloud.csv')
df = pd.read_csv('artificial_intelligence_uk_2025-05-05_2025-05-06_mediacloud.csv')

# Step 5: Define your prompt
def build_prompt(article_text):
    return f"""
The following article discusses artificial intelligence (AI).

Please identify the **most fitting frame** from the list below:
1. AI impacts on businesses, economy, and jobs
2. AI transformations in education and research
3. AI in national security and global partnerships
4. AI disruptions in media and creative industries
5. AI-based innovative solutions
6. AI regulations, ethics, and data privacy
7. AI competition and market dynamics in tech industries
8. AI in healthcare and climate change
9. AI in politics, elections, and public opinion
10. Other
11. Not AI related

### Article:
{article_text}

### Instructions:
Return only the frame name (e.g. "AI in politics, elections, and public opinion").
If it's unclear, return "Other" or "Not AI related".
"""

# Step 6: Run classification
results = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    text = row.get("maintext", "")
    if not isinstance(text, str) or not text.strip():
        results.append("skipped")
        continue

    prompt = build_prompt(text[:2000])  # truncate if article is too long
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=50)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    frame = generated_text.strip().split("\n")[-1]
    results.append(frame)

# Step 7: Save results
df["frame"] = results
#df.to_csv("/content/drive/MyDrive/Mediacloud/ai_articles_with_frames_gemma.csv", index=False)
print("✅ Done. Saved to ai_articles_with_frames_gemma.csv")

100%|██████████| 70/70 [00:10<00:00,  6.76it/s]

✅ Done. Saved to ai_articles_with_frames_gemma.csv





In [None]:
df.frame.value_counts()

Unnamed: 0_level_0,count
frame,Unnamed: 1_level_1
"If it's unclear, return ""Other"" or ""Not AI related"".",68
skipped,2


In [None]:
# Step 1: Install dependencies
!pip install -q transformers accelerate sentencepiece

# Step 2: Import libraries
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm

# Step 3: Load the Gemma 3-4B model and tokenizer (use float16 for T4 GPU)
model_id = "google/gemma-3-4b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # 🟢 compatible with Colab GPU
    device_map="auto"
)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

In [None]:
# Step 4: Load your data
#df = pd.read_csv('/content/drive/MyDrive/Mediacloud/artificial_intelligence_uk_2025-05-05_2025-05-06_mediacloud.csv')
df = pd.read_csv('artificial_intelligence_uk_2025-05-05_2025-05-06_mediacloud.csv')

# Step 5: Define your prompt
def build_prompt(article_text):
    return f"""
The following article discusses artificial intelligence (AI).

Please identify the **most fitting frame** from the list below:
1. AI impacts on businesses, economy, and jobs
2. AI transformations in education and research
3. AI in national security and global partnerships
4. AI disruptions in media and creative industries
5. AI-based innovative solutions
6. AI regulations, ethics, and data privacy
7. AI competition and market dynamics in tech industries
8. AI in healthcare and climate change
9. AI in politics, elections, and public opinion
10. Other
11. Not AI related

### Article:
{article_text}

### Instructions:
Return only the frame name (e.g. "AI in politics, elections, and public opinion").
If it's unclear, return "Other" or "Not AI related".
"""

# Step 6: Run frame classification
frames = []

for i, row in tqdm(df.iterrows(), total=len(df)):
    article = row.get("maintext", "")
    if not isinstance(article, str) or not article.strip():
        frames.append("skipped")
        continue

    prompt = build_prompt(article[:1500])  # truncate long articles
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    try:
        outputs = model.generate(
            **inputs,
            max_new_tokens=80,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id  # ✅ ensure end of generation
        )
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
        last_line = decoded.strip().split("\n")[-1]
        frames.append(last_line)
    except Exception as e:
        print(f"Error at index {i}: {e}")
        frames.append("error")

# Step 7: Save results
df["frame"] = frames
#df.to_csv('/content/drive/MyDrive/Mediacloud/ai_articles_with_frames_gemma_3_4b.csv', index=False)
print("✅ Saved to: ai_articles_with_frames_gemma_3_4b.csv")

  0%|          | 0/70 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
W0706 15:05:46.781000 731 torch/_inductor/utils.py:1137] [0/0] Not enough SMs to use max_autotune_gemm mode
  1%|▏         | 1/70 [01:57<2:15:36, 117.91s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  3%|▎         | 2/70 [02:01<57:37, 50.84s/it]   The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  4%|▍         | 3/70 [02:05<32:55, 29.49s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  6%|▌         | 4/70 [02:09<21:19, 19.38s/it]The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=in

✅ Saved to: ai_articles_with_frames_gemma_3_4b.csv


In [None]:
df.frame.value_counts()

Unnamed: 0_level_0,count
frame,Unnamed: 1_level_1
"If it's unclear, return ""Other"" or ""Not AI related"".",68
skipped,2


In [None]:
# Step 1: Install dependencies
#!pip install -q transformers accelerate sentencepiece

# Step 2: Import libraries
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm

# Step 3: Load Mistral model & tokenizer
model_id = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



In [None]:
# Step 4: Load your dataset
df = pd.read_csv('/content/drive/MyDrive/Mediacloud/artificial_intelligence_uk_2025-05-05_2025-05-06_mediacloud.csv')
#df = pd.read_csv('artificial_intelligence_uk_2025-05-05_2025-05-06_mediacloud.csv')

# Step 5: Define the prompt template
def build_prompt(article_text):
    return f"""<s>[INST] You are an expert in AI media analysis.

Given the following article about artificial intelligence (AI), classify it using the most fitting frame from this list:

1. AI impacts on businesses, economy, and jobs
2. AI transformations in education and research
3. AI in national security and global partnerships
4. AI disruptions in media and creative industries
5. AI-based innovative solutions
6. AI regulations, ethics, and data privacy
7. AI competition and market dynamics in tech industries
8. AI in healthcare and climate change
9. AI in politics, elections, and public opinion
10. Other
11. Not AI related

Return only the frame name. If no frame fits, return "Other" or "Not AI related".

### Article:
{article_text}

Answer: [/INST]
"""

# Step 6: Classify each article
frames = []

for i, row in tqdm(df.iterrows(), total=len(df)):
    article = row.get("maintext", "")
    if not isinstance(article, str) or not article.strip():
        frames.append("skipped")
        continue

    prompt = build_prompt(article[:1500])  # truncate long input
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    try:
        outputs = model.generate(
            **inputs,
            max_new_tokens=60,
            do_sample=False,
            temperature=0.0,
            eos_token_id=tokenizer.eos_token_id
        )
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract only what's generated after the prompt
        generated_answer = decoded[len(prompt):].strip().split("\n")[0]
        frames.append(generated_answer if generated_answer else "blank")
    except Exception as e:
        print(f"Error at index {i}: {e}")
        frames.append("error")

# Step 7: Save results
df["frame"] = frames
df.to_csv('/content/drive/MyDrive/Mediacloud/ai_articles_with_frames_mistral.csv', index=False)
print("✅ Done! Saved as ai_articles_with_frames_mistral.csv")

  0%|          | 0/70 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|▏         | 1/70 [00:05<06:49,  5.93s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  3%|▎         | 2/70 [00:10<05:57,  5.26s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  4%|▍         | 3/70 [00:31<13:41, 12.25s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  6%|▌         

✅ Done! Saved as ai_articles_with_frames_mistral.csv





In [None]:
df.frame.value_counts()

Unnamed: 0_level_0,count
frame,Unnamed: 1_level_1
"AI impacts on businesses, economy, and jobs",37
"e most fitting frame for this article is ""AI disruptions in media and creative industries"".",6
er,6
"e most fitting frame for this article is ""AI impacts on businesses, economy, and jobs"".",3
"e most fitting frame for this article is ""AI-based innovative solutions"".",3
"e article is not related to AI, so the most fitting frame would be ""Other"" or ""Not AI related"".",2
"e most fitting frame for this article is ""AI in healthcare and climate change"".",2
skipped,2
"e article is about the growth of the Digital Wound Management Devices Market and its potential for continued growth due to the increasing demand for advanced technologies that enhance wound monitoring, improve treatment outcomes, and streamline clinical workflows. The market is witnessing strong adoption across healthcare facilities, particularly in",1
"e most fitting frame for this article is ""AI in media and creative industries"".",1


In [None]:
# Step 7: Clean frame values to extract only the correct label
known_frames = [
    "AI impacts on businesses, economy, and jobs",
    "AI transformations in education and research",
    "AI in national security and global partnerships",
    "AI disruptions in media and creative industries",
    "AI-based innovative solutions",
    "AI regulations, ethics, and data privacy",
    "AI competition and market dynamics in tech industries",
    "AI in healthcare and climate change",
    "AI in politics, elections, and public opinion",
    "Other",
    "Not AI related"
]

def extract_clean_frame(raw_output):
    for frame in known_frames:
        if frame.lower() in raw_output.lower():
            return frame
    return "unmatched"  # fallback if nothing matched

df["frame"] = df["frame"].apply(extract_clean_frame)
df.to_csv('/content/drive/MyDrive/Mediacloud/ai_articles_with_frames_mistral_2.csv', index=False)

In [None]:
df.frame.value_counts()

Unnamed: 0_level_0,count
frame,Unnamed: 1_level_1
"AI impacts on businesses, economy, and jobs",40
unmatched,10
AI disruptions in media and creative industries,9
AI-based innovative solutions,3
Other,2
AI in healthcare and climate change,2
Not AI related,2
"AI regulations, ethics, and data privacy",1
AI transformations in education and research,1


In [None]:
# Step 1: Install dependencies
#!pip install -q transformers accelerate sentencepiece

# Step 2: Import libraries
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm

# Step 3: Load Mistral model & tokenizer
model_id = "deepseek-ai/deepseek-llm-7b-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/792 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/584 [00:00<?, ?B/s]

pytorch_model.bin.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

In [None]:
# Step 4: Load your dataset
#df = pd.read_csv('/content/drive/MyDrive/Mediacloud/artificial_intelligence_uk_2025-05-05_2025-05-06_mediacloud.csv')
df = pd.read_csv('artificial_intelligence_uk_2025-05-05_2025-05-06_mediacloud.csv')

# Step 5: Define the prompt template
def build_prompt(article_text):
    return f"""
The following article discusses artificial intelligence (AI).

Please read the article and identify **which of the following frames apply**. You can apply only one frame, choose the most fitting. Only choose from this list:

1. AI impacts on businesses, economy, and jobs
2. AI transformations in education and research
3. AI in national security and global partnerships
4. AI disruptions in media and creative industries
5. AI-based innovative solutions
6. AI regulations, ethics, and data privacy
7. AI competition and market dynamics in tech industries
8. AI in healthcare and climate change
9. AI in politics, elections, and public opinion
10. Other
11. Not AI related

### Article:
{article_text}

### Instructions:
Return your answer as the name of the frame, e.g. "AI impacts on businesses, economy, and jobs".
In case of other, return "Other - name of the frame".

Do not explain your reasoning.
"""

# Step 6: Classify each article
frames = []

for i, row in tqdm(df.iterrows(), total=len(df)):
    article = row.get("maintext", "")
    if not isinstance(article, str) or not article.strip():
        frames.append("skipped")
        continue

    prompt = build_prompt(article[:1500])  # truncate long input
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    try:
        outputs = model.generate(
            **inputs,
            max_new_tokens=60,
            do_sample=False,
            temperature=0.0,
            eos_token_id=tokenizer.eos_token_id
        )
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract only what's generated after the prompt
        generated_answer = decoded[len(prompt):].strip().split("\n")[0]
        frames.append(generated_answer if generated_answer else "blank")
    except Exception as e:
        print(f"Error at index {i}: {e}")
        frames.append("error")

# Step 7: Save results
df["frame"] = frames
#df.to_csv('/content/drive/MyDrive/Mediacloud/ai_articles_with_frames_mistral.csv', index=False)
print("✅ Done! Saved as ai_articles_with_frames_mistral.csv")

  0%|          | 0/70 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
  1%|▏         | 1/70 [00:06<07:14,  6.29s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
  3%|▎         | 2/70 [00:13<07:25,  6.55s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
  4%|▍         | 3/70 [00:16<05:59,  5.37s/it]The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generat

✅ Done! Saved as ai_articles_with_frames_mistral.csv





In [None]:
df.frame.value_counts()

Unnamed: 0_level_0,count
frame,Unnamed: 1_level_1
### Example:,55
### Scoring:,13
skipped,2


In [None]:
# Step 1: Install dependencies
#!pip install -q transformers accelerate sentencepiece

# Step 2: Import libraries
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm

# Step 3: Load Mistral model & tokenizer
model_id = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]



In [None]:
# Step 4: Load your dataset
#df = pd.read_csv('/content/drive/MyDrive/Mediacloud/artificial_intelligence_uk_2025-05-05_2025-05-06_mediacloud.csv')
df = pd.read_csv('artificial_intelligence_uk_2025-05-05_2025-05-06_mediacloud.csv')

# Step 5: Define the prompt template
def build_prompt(article_text):
    return f"""
The following article discusses artificial intelligence (AI).

Please read the article and identify **which of the following frames apply**. You can apply only one frame, choose the most fitting. Only choose from this list:

1. AI impacts on businesses, economy, and jobs
2. AI transformations in education and research
3. AI in national security and global partnerships
4. AI disruptions in media and creative industries
5. AI-based innovative solutions
6. AI regulations, ethics, and data privacy
7. AI competition and market dynamics in tech industries
8. AI in healthcare and climate change
9. AI in politics, elections, and public opinion
10. Other
11. Not AI related

### Article:
{article_text}

### Instructions:
Return your answer as the name of the frame, e.g. "AI impacts on businesses, economy, and jobs".
In case of other, return "Other - name of the frame".

Do not explain your reasoning.
"""

# Step 6: Classify each article
frames = []

for i, row in tqdm(df.iterrows(), total=len(df)):
    article = row.get("maintext", "")
    if not isinstance(article, str) or not article.strip():
        frames.append("skipped")
        continue

    prompt = build_prompt(article[:1500])  # truncate long input
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    try:
        outputs = model.generate(
            **inputs,
            max_new_tokens=60,
            do_sample=False,
            temperature=0.0,
            eos_token_id=tokenizer.eos_token_id
        )
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract only what's generated after the prompt
        generated_answer = decoded[len(prompt):].strip().split("\n")[0]
        frames.append(generated_answer if generated_answer else "blank")
    except Exception as e:
        print(f"Error at index {i}: {e}")
        frames.append("error")

# Step 7: Save results
df["frame"] = frames
#df.to_csv('/content/drive/MyDrive/Mediacloud/ai_articles_with_frames_mistral.csv', index=False)
print("✅ Done! Saved as ai_articles_with_frames_mistral.csv")

  0%|          | 0/70 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  1%|▏         | 1/70 [00:38<44:21, 38.57s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  3%|▎         | 2/70 [01:03<34:55, 30.81s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  4%|▍         | 3/70 [01:58<46:36, 41.74s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_t

✅ Done! Saved as ai_articles_with_frames_mistral.csv





In [None]:
df.frame.value_counts()

Unnamed: 0_level_0,count
frame,Unnamed: 1_level_1
Do not provide any additional information.,66
skipped,2
Do not give any additional information.,1
o not provide any additional information.,1


In [None]:
# Step 1: Install dependencies
#!pip install -q transformers accelerate sentencepiece

# Step 2: Import libraries
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm

# Step 3: Load Mistral model & tokenizer
model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/92.7k [00:00<?, ?B/s]

Fetching 19 files:   0%|          | 0/19 [00:00<?, ?it/s]

model-00005-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00001-of-00019.safetensors:   0%|          | 0.00/4.89G [00:00<?, ?B/s]

model-00003-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00006-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00008-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00007-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00009-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00010-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00011-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00013-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00012-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00014-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00015-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00016-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]



model-00017-of-00019.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]



model-00018-of-00019.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]



model-00019-of-00019.safetensors:   0%|          | 0.00/4.22G [00:00<?, ?B/s]

OSError: [Errno 28] No space left on device

In [None]:
# Step 1: Install dependencies
#!pip install -q transformers accelerate sentencepiece

# Step 2: Import libraries
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm

# Step 3: Load Mistral model & tokenizer
model_id = "deepseek-ai/DeepSeek-R1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

ValueError: FP8 quantized models is only supported on GPUs with compute capability >= 8.9 (e.g 4090/H100), actual = `7.5`

In [None]:
# Step 1: Install dependencies
#!pip install -q transformers accelerate sentencepiece

# Step 2: Import libraries
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm

# Step 3: Load Mistral model & tokenizer
model_id = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/7.39G [00:00<?, ?B/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]



In [None]:
# Step 4: Load your dataset
df = pd.read_csv('/content/drive/MyDrive/Mediacloud/artificial_intelligence_uk_2025-05-05_2025-05-06_mediacloud.csv')
#df = pd.read_csv('artificial_intelligence_uk_2025-05-05_2025-05-06_mediacloud.csv')

# Step 5: Define the prompt template
def build_prompt(article_text):
    return f"""
The following article discusses artificial intelligence (AI).

Please read the article and identify **which of the following frames apply**. You can apply only one frame, choose the most fitting. Only choose from this list:

1. AI impacts on businesses, economy, and jobs
2. AI transformations in education and research
3. AI in national security and global partnerships
4. AI disruptions in media and creative industries
5. AI-based innovative solutions
6. AI regulations, ethics, and data privacy
7. AI competition and market dynamics in tech industries
8. AI in healthcare and climate change
9. AI in politics, elections, and public opinion
10. Other
11. Not AI related

### Article:
{article_text}

### Instructions:
Return your answer as the name of the frame, e.g. "AI impacts on businesses, economy, and jobs".
In case of other, return "Other - name of the frame".

Do not explain your reasoning.
"""

# Step 6: Classify each article
frames = []

for i, row in tqdm(df.iterrows(), total=len(df)):
    article = row.get("maintext", "")
    if not isinstance(article, str) or not article.strip():
        frames.append("skipped")
        continue

    prompt = build_prompt(article[:1500])  # truncate long input
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    try:
        outputs = model.generate(
            **inputs,
            max_new_tokens=60,
            do_sample=False,
            temperature=0.0,
            eos_token_id=tokenizer.eos_token_id
        )
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract only what's generated after the prompt
        generated_answer = decoded[len(prompt):].strip().split("\n")[0]
        frames.append(generated_answer if generated_answer else "blank")
    except Exception as e:
        print(f"Error at index {i}: {e}")
        frames.append("error")

# Step 7: Save results
df["frame"] = frames
#df.to_csv('/content/drive/MyDrive/Mediacloud/ai_articles_with_frames_mistral.csv', index=False)
print("✅ Done! Saved as ai_articles_with_frames_mistral.csv")

  0%|          | 0/70 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|▏         | 1/70 [00:51<58:39, 51.01s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         | 2/70 [01:40<57:02, 50.32s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  4%|▍         | 3/70 [02:30<55:54, 50.07s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_t

✅ Done! Saved as ai_articles_with_frames_mistral.csv





In [None]:
df.frame.value_counts()

Unnamed: 0_level_0,count
frame,Unnamed: 1_level_1
Just provide the article's frame.,33
Just provide the frame number or name.,29
Just provide the answer.,3
skipped,2
Just provide the article and the frame.,2
Just provide the frame number.,1
