# Sampling Data

In [1]:
import os
import pandas as pd

In [2]:
# Root folder of the project data. The default is the original author's local
# path; set the DATA_MINING_ROOT environment variable to run the notebook on
# another machine without editing this cell.
ROOT_DATA_PATH = os.environ.get(
    "DATA_MINING_ROOT",
    r"D:\Studying\UoR\1. Data Mining\Final_Project\data",
)
# Folder the sampling code reads the cleaned article CSVs from.
CLEANED_DATA_PATH = os.path.join(ROOT_DATA_PATH, "cleaned")
# Folder holding the sampled prompt files used to build the fine-tuning set.
SAVE_DATA_PATH = os.path.join(ROOT_DATA_PATH, "llama2_finetune_data")

### Original Prompt

In [3]:
# def format(row):
#     text_format = """As a neutral news analyst, you evaluate articles based on sentiment and stance, each scored from 0.0 (negative/against) to 1.0 (positive/in favor), with 0.5 as neutral/impartial.

# Given the article snippet below
# Title: "{title}"
# Content: "{content}{dot}"

# Provide:
# 1. Sentiment 
#     * Score: <SENTIMENT_SCORE>
#     * Reason: <SENTIMENT_REASON>
# 2. Stance
#     * Score: <STANCE_SCORE>
#     * Reason: <STANCE_REASON>

# Replace <SENTIMENT_SCORE>/<STANCE_SCORE> with your scores, and <SENTIMENT_REASON>/<STANCE_REASON> with one sentence for your reasoning."""
#     title = row[1]
#     content = row[0][:1024]
#     dot = '...' if len(row[0]) > len(content) else ''
#     return text_format.format(title=title, content=content, dot=dot)

### [Prompt Perfect](https://promptperfect.jina.ai/prompts)

In [5]:
def format(row, country, max_chars=1024):
    """Build the sentiment/stance prompt for a single article.

    NOTE: this function shadows Python's builtin ``format`` for the rest of
    the notebook. The name is kept because other cells call it.

    Parameters
    ----------
    row : sequence
        ``row[0]`` is the article body (must support ``len`` and slicing) and
        ``row[1]`` is the title.
    country : str
        Name interpolated into the "pro-"/"against-" stance labels.
    max_chars : int, default 1024
        Maximum number of body characters copied into the prompt. A longer
        body is cut at this length and followed by ``...``.

    Returns
    -------
    str
        The complete prompt text.
    """
    title = row[1]
    body = row[0]
    content = body[:max_chars]
    # Only mark the excerpt as truncated when something was actually cut off.
    dot = '...' if len(body) > len(content) else ''
    return f"""As a neutral news analyst, assess the sentiment and stance of the news article excerpt and assign a score between -1.0 (completely negative/against-{country}) and 1.0 (completely positive/pro-{country}) for both sentiment and stance. Provide a single short sentence to justify your scores, drawing on the article's language, tone, and presentation to support your analysis.

Article Excerpt:
- Title: "{title}"
- Content: "{content}{dot}"

Output format: 
1. Sentiment: [Positive/Neutral/Negative]
    * Score: [Your Score]
    * Reason: [Your Reason] 
2. Stance: [Pro-{country}/Impartial/Against-{country}]
    * Score: [Your Score]
    * Reason: [Your Reason]"""

### Example Count Calculations

* Local Source: 50
* International Source: 50
    * Associated Press: 10
    * Reuters: 40

Total: 100 examples per country * 3 countries (Canada, China, Russia) = 300

In [6]:
def get_count(file_name):
    """Return how many articles to sample from the given file.

    The text before the first underscore of ``file_name`` selects the quota:
    "ap" -> 10, "reuters" -> 40, anything else -> 50.
    """
    quota_by_prefix = {"ap": 10, "reuters": 40}
    prefix = file_name.split("_")[0]
    return quota_by_prefix.get(prefix, 50)

def get_country(file_name):
    """Infer the target country from a data file name.

    The name is searched for the keywords below, in order, and the country of
    the first keyword found is returned.

    Parameters
    ----------
    file_name : str
        File name to inspect.

    Returns
    -------
    str
        One of "canada", "china" or "russia".

    Raises
    ------
    ValueError
        If none of the keywords occurs in ``file_name``. Previously this case
        failed with an UnboundLocalError on the return statement.
    """
    # Order matters: the explicit country names are checked before the
    # outlet-name keywords, exactly as in the original if/elif chain.
    keyword_to_country = (
        ("canada", "canada"),
        ("china", "china"),
        ("russia", "russia"),
        ("cbc", "canada"),
        ("global", "canada"),
        ("moscow", "russia"),
    )
    for keyword, country in keyword_to_country:
        if keyword in file_name:
            return country
    raise ValueError(f"Cannot infer country from file name: {file_name!r}")

def create_sample_df(file_name, df):
    """Sample articles from ``df`` and attach the prompt for each one.

    Parameters
    ----------
    file_name : str
        Source file name; passed to ``get_count`` (sample size) and
        ``get_country`` (country used in the prompt).
    df : pandas.DataFrame
        Articles; must contain the columns "maintext" and "title".

    Returns
    -------
    pandas.DataFrame
        The sampled rows with two added columns: "text" (the prompt) and
        "answer" (empty string, to be filled in later).

    Raises
    ------
    ValueError
        From ``DataFrame.sample`` when ``df`` has fewer rows than the
        requested sample size.
    """
    count = get_count(file_name)
    country = get_country(file_name)

    # Fixed seed so the same articles are drawn on every run.
    sample = df.sample(count, random_state=29)
    # format() reads row[0] as the body and row[1] as the title, so pick the
    # columns by name instead of relying on the CSV's column order.
    prompt_inputs = sample.loc[:, ["maintext", "title"]].values
    sample["text"] = [format(row, country) for row in prompt_inputs]
    sample["answer"] = ""
    return sample

# for file_name in os.listdir(CLEANED_DATA_PATH):
#     input_file_path = os.path.join(CLEANED_DATA_PATH, file_name)
#     df = pd.read_csv(input_file_path, usecols=["title", "maintext"])

#     sample = create_sample_df(file_name, df)
    
#     output_file_path = os.path.join(SAVE_DATA_PATH, 
#                                     file_name.split(".")[0] + ".txt")
#     output_file_path = os.path.join(SAVE_DATA_PATH, file_name)
#     sample.to_csv(output_file_path, columns=["text", "answer"], index=None)

# Fine-tune Data

In [3]:
# Combined fine-tuning CSV. Note that this file is placed inside SAVE_DATA_PATH.
OUTPUT_FILE_PATH = os.path.join(SAVE_DATA_PATH, "fine-tune-data.csv")

In [4]:
def fine_tune_row(row):
    """Render one training example in the ###Human / ###Assistant layout.

    ``row[0]`` is placed under the Human header and ``row[1]`` under the
    Assistant header.
    """
    prompt, answer = row[0], row[1]
    return f"###Human:\n{prompt}\n\n###Assistant:\n{answer}"

In [5]:
# Merge every sampled file in SAVE_DATA_PATH into a single fine-tuning CSV.
# The output is recreated on each run (first write truncates, later writes
# append), so re-running this cell no longer duplicates rows.
output_key = os.path.normcase(os.path.abspath(OUTPUT_FILE_PATH))
wrote_header = False
for file_name in os.listdir(SAVE_DATA_PATH):
    input_file_path = os.path.join(SAVE_DATA_PATH, file_name)
    # The combined file may sit in the directory being listed; never read a
    # previous run's output back in as if it were an input.
    if os.path.normcase(os.path.abspath(input_file_path)) == output_key:
        continue
    try:
        df = pd.read_csv(input_file_path).rename(columns={"text": "prompt"})
        # fine_tune_row uses the first two values of each row
        # (presumably prompt and answer - verify against the sampled files).
        df["text"] = [fine_tune_row(row) for row in df.values]
        df.to_csv(OUTPUT_FILE_PATH, index=None,
                  mode="a" if wrote_header else "w",
                  header=not wrote_header)
        wrote_header = True
    except Exception:
        # Report which file failed, then let the error propagate.
        print(input_file_path)
        raise

In [6]:
# Sanity check on the combined fine-tune file (rows, columns).
# The "Example Count Calculations" notes plan 300 rows, but the recorded
# output below shows 299 - TODO(review): find out which row is missing.
pd.read_csv(OUTPUT_FILE_PATH).shape

(299, 3)

# Inference Data

In [3]:
# Data root for the inference set. The default is the original author's local
# path; set the DATA_MINING_ROOT environment variable to run elsewhere.
_inference_data_root = os.environ.get(
    "DATA_MINING_ROOT",
    r"D:\Studying\UoR\1. Data Mining\Final_Project\data",
)
# Folder with the per-source article CSVs to build prompts for.
INPUT_DIR = os.path.join(_inference_data_root, "cleaned", "combined")
# Folder that receives the combined inference file.
ROOT_OUTPUT_PATH = os.path.join(_inference_data_root, "llama2_inference_data")
# NOTE: this rebinds OUTPUT_FILE_PATH, which the "Fine-tune Data" section also
# assigns; cells run after this one see the inference path.
OUTPUT_FILE_PATH = os.path.join(ROOT_OUTPUT_PATH, "inference-data.csv")

In [7]:
# Build the prompt for every article in INPUT_DIR and collect all files into
# one inference CSV. The output is recreated on each run (first write
# truncates, later writes append), so re-running this cell no longer
# duplicates rows.
wrote_header = False
for file_name in os.listdir(INPUT_DIR):
    input_file_path = os.path.join(INPUT_DIR, file_name)
    try:
        df = pd.read_csv(input_file_path)

        country = get_country(file_name)
        df["country"] = country
        # format() reads row[0] as the body and row[1] as the title, hence the
        # explicit [maintext, title] column order.
        prompt_inputs = df.loc[:, ["maintext", "title"]].values
        df["text"] = [format(row, country) for row in prompt_inputs]

        df.to_csv(OUTPUT_FILE_PATH, index=None,
                  mode="a" if wrote_header else "w",
                  header=not wrote_header)
        wrote_header = True
    except Exception:
        # Report which file failed, then let the error propagate.
        print(input_file_path)
        raise

In [11]:
# Read the combined inference CSV back for the inspection cells that follow.
# This rebinds `df`, which the loop above also used for each input file.
df = pd.read_csv(OUTPUT_FILE_PATH)

In [12]:
df.shape

(38622, 7)

In [14]:
df.country.value_counts()

russia    16072
canada    12566
china      9984
Name: country, dtype: int64

In [15]:
df.source_type.value_counts()

local            28977
international     9645
Name: source_type, dtype: int64

In [17]:
df.sample(3)

Unnamed: 0,date_publish,maintext,title,source_type,source_name,country,text
10521,2023-09-29 07:36:46,TORONTO Unifor has set Oct. 9 at 11:59 p.m. a...,Unifor sets Oct. 9 deadline for contract talks...,local,global-news,canada,"As a neutral news analyst, assess the sentimen..."
11585,2023-10-18 06:58:42,South Korea's Hanwha Ocean (042660.KS) has pit...,"Hanwha Ocean eyes submarine exports to Canada,...",international,reuters,canada,"As a neutral news analyst, assess the sentimen..."
13336,2021-10-23 00:00:00,"Chinese Vice Premier Sun Chunlan, also a membe...",Conference held in China to boost campus safety,local,china_daily,china,"As a neutral news analyst, assess the sentimen..."
