In [None]:
import pandas as pd
import random
import csv
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM # Import necessary modules
from google.colab import files


# Define sentiment-specific prompts for each group based on the content in the PDF
# (the PDF itself is not part of this file).
#
# Every group has three lists -- positive, negative and neutral -- of five seed
# headlines each. generate_synthetic_data picks one seed at random per call.
#
# Group 1: seeds mentioning Finnish corporations, stock, negotiations, capital
# and equipment.
group_1_positive = [
    "Finnish Corporation Announces Expansion of Operations with Strong Capital Investment",
    "Stock Surge as Oyj Negotiations Lead to Major Equipment Deal",
    "Corporation Group Secures Funding for New Product Release in April",
    "Industrial Equipment Maker Reports Continued Growth Despite Market Challenges",
    "Finnish Firm Hel Announces Major Partnership with Capital Investors"
]
group_1_negative = [
    "Finnish Oyj Faces Delays in Operations Due to Failed Negotiations",
    "Stock Drops as Corporation Announces Major Equipment Recall",
    "Negotiations Break Down Between Capital Investors and Finnish Maker",
    "Industrial Exchange Struggles Amid Capital Shortages in February",
    "Group Concerned Over Declining Stock Prices Amid Market Uncertainty"
]
group_1_neutral = [
    "Corporation Group Discusses Stock Performance at Annual Meeting",
    "Oyj Announces New Equipment Release in April",
    "Negotiations Concerning Capital Investment to Continue Next Month",
    "Finnish Corporation to Review Operations Strategy in Monday Meeting",
    "Group to Evaluate Stock Performance After Recent Market Trends"
]
# Group 2: seeds built around EUR-denominated figures (profit, loss, sales),
# split into positive, negative and neutral lists of five headlines each.
group_2_positive = [
    "EUR 500mn Profit Reported as Sales Increase in Latest Quarter",
    "Operating Profit Surges in EUR as Net Sales Jump 15% Year-over-Year",
    "EUR 200mn Net Profit Outpaces Expectations in a Strong Q1 Performance",
    "Sales and Profit Both Increase in Latest EUR Quarterly Report",
    "Company Reports EUR 300mn in Net Profit, Highlighting Strong Year-End Growth"
]

group_2_negative = [
    "EUR 100mn Net Loss Recorded Despite Strong Sales in Previous Quarter",
    "Operating Loss Increases as EUR Sales Fail to Offset Costs in Q3",
    "EUR Losses Total EUR 150mn Due to Weaker Than Expected Net Profit",
    "Negative Profit Growth and EUR Operating Declines Reported in Latest Period",
    "EUR 50mn Pretax Loss in Corresponding Period Marks a Weak Financial Quarter"
]

group_2_neutral = [
    "EUR Sales Steady at 500mn in Latest Operating Period",
    "Net Profit and Sales Totals in EUR Align with Corresponding Year",
    "Quarterly EUR Reports Released Showing No Significant Profit or Loss",
    "Company Maintains EUR 100mn in Net Profit Amid Stable Sales Growth",
    "Year-on-Year EUR Sales and Net Totals Report Steady Growth of 2%"
]
# Group 3: seeds mentioning the Helsinki OMX index, named companies (Nokia, UPM,
# Stora Enso), share prices and industry sectors, split into positive, negative
# and neutral lists of five headlines each.
group_3_positive = [
    "Helsinki OMX Index Surges as Nokia Announces New Contract for Technology Services",
    "UPM Reports Strong Recovery in Media and Electronics Unit Following Industry Growth",
    "Share Prices Increase as Business Development Plan Exceeds Expectations in Q2",
    "New Investments Drive Growth as Power Units and Technology Services Expand in Europe",
    "MLN Euro Project Drives Up Manufacturing Output, Boosting Stora Enso’s Annual Growth"
]

group_3_negative = [
    "Nokia Shares Fall as Technology Contract Expected to Underperform in OMX Market",
    "Industry Decline Sees UPM Media Unit Suffer Decrease in Revenue Amid Global Cuts",
    "Share Prices Drop as Paper Sector Suffers from Lower End-of-Year Performance",
    "Fourth Quarter Losses Total EUR 300mn Due to High Operating Costs and Decreased Turnover",
    "Manufacturing Sector Declines in September as Investment Falls Short of Expectations"
]

group_3_neutral = [
    "Helsinki Companies Post Mixed Results in Annual OMX Index Report for Technology Sector",
    "Quarterly Management Report Shows Steady Performance in Media and Solutions Units",
    "MLN-Euro Project Completed by December as Expected, With No Major Impact on Shares",
    "Total Revenue Remains Unchanged Despite Increased Costs in Construction Sector",
    "Fourth Quarter Review Shows No Major Fluctuations in Share Prices or Earnings"
]
# Group 4: seeds mentioning Finland and the Baltic region, mobile products and
# energy projects, split into positive, negative and neutral lists of five
# headlines each.
group_4_positive = [
    "Finnish Company Launches Innovative Mobile Products, Strengthening Market Position in the Baltic Region",
    "Energy Project in Finland Set to Boost Local Market, Creating Jobs and Driving Economic Growth",
    "Companies Based in the Baltic Region Report Record Profits Amid Strong Demand for Green Energy Solutions",
    "Finnish Energy Group Expands into Baltic Market, Strengthening Regional Energy Infrastructure",
    "Reports Show Mobile Product Sales in Finland Rise Sharply, Bolstering National Economy"
]

group_4_negative = [
    "Finnish Mobile Company Faces Declining Sales in Baltic Market, Struggling to Compete with Global Giants",
    "Energy Project in Finland Delayed Due to Regulatory Challenges, Affecting Investor Confidence",
    "Companies Based in the Baltic Region Report Losses as Economic Uncertainty Affects Market Performance",
    "Finnish Company to Scale Back Expansion Plans Amid Declining Demand for Local Products",
    "Mobile Product Launch in Finland Faces Backlash as Consumers Reject New Features"
]

group_4_neutral = [
    "Finnish Company Expands Product Line with New Mobile Offerings, Plans to Enter Baltic Markets",
    "Energy Project in Finland Moves Forward with Local Partnerships, Targeting Long-Term Market Growth",
    "Baltic Region Companies Consider Diversification Strategies Amid Changing Market Conditions",
    "According to Reports, Finnish Mobile Company Plans to Introduce New Products in Baltic Markets",
    "Companies in Finland and the Baltic Discuss Potential Collaborative Projects in Energy Sector"
]


# Four groups are defined above, each with positive, negative and neutral prompt lists.
# Further groups can be added by following the same three-list pattern.

# Load the tokenizer and the seq2seq model once at module level.
# generate_synthetic_data reads both names as globals, so they must be defined
# before that function is called.
# NOTE(review): the printed sample output at the end of this cell largely echoes
# fragments of the seed prompts -- the plain "t5-base" checkpoint may not be a
# good fit for producing new headlines; confirm the model choice.
model_name = "t5-base"  # Or any other suitable pre-trained model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Use the generate_synthetic_data function to create headlines for each sentiment and group
def generate_synthetic_data(sentiment, prompt_examples):
    """Generate one synthetic headline from a randomly chosen seed prompt.

    A seed is drawn from ``prompt_examples`` with ``random.choice``, encoded
    with the module-level ``tokenizer`` and passed to the module-level
    ``model``'s ``generate`` method. The first returned sequence is decoded
    with special tokens stripped.

    Args:
        sentiment: Label of the seed list (the callers in this file pass
            "positive", "negative" or "neutral"). It does not influence
            generation; it is kept so existing call sites keep working.
        prompt_examples: Non-empty sequence of seed headline strings.

    Returns:
        The decoded output text as a string.

    Raises:
        IndexError: If ``prompt_examples`` is empty (raised by
            ``random.choice``).
    """
    # Pick the seed headline for this call.
    prompt = random.choice(prompt_examples)

    # Keep the whole encoding so the attention mask can be handed to
    # generate() explicitly instead of being inferred from the input ids.
    encoded = tokenizer(prompt, return_tensors="pt")

    # Decoding uses beam search with multinomial sampling (num_beams > 1 and
    # do_sample=True). penalty_alpha is deliberately not passed: it belongs to
    # contrastive search, which is only selected when do_sample is False and
    # num_beams is 1, so it had no effect in this configuration.
    output_ids = model.generate(
        encoded.input_ids,
        attention_mask=encoded.attention_mask,
        max_length=30,            # Limiting length to concise headlines
        min_length=10,            # Minimum to avoid short, incomplete outputs
        do_sample=True,
        temperature=0.8,
        top_k=5,                  # Limit top-k for more targeted output
        num_beams=5,              # Beam search to improve focus
        no_repeat_ngram_size=3,
        repetition_penalty=1.8,   # Higher penalty to reduce redundancy
        early_stopping=True,
    )

    # Only the first returned sequence is used.
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Function to generate headlines and save to CSV
def generate_and_save_headlines(group_name, positive_examples, negative_examples, neutral_examples, num_examples=15):
    """Generate labelled headlines for one group and write them to a CSV file.

    For each sentiment, in the order positive, negative, neutral,
    ``generate_synthetic_data`` is called ``num_examples`` times with that
    sentiment's seed list. The results are collected into a DataFrame with the
    columns ``headline`` and ``sentiment``. Its first five rows are printed,
    and the frame is saved to ``<group_name>_synthetic_data.csv`` (relative
    path, UTF-8, no index column).

    Args:
        group_name: Prefix used for the output CSV file name.
        positive_examples: Seed headlines for the "positive" label.
        negative_examples: Seed headlines for the "negative" label.
        neutral_examples: Seed headlines for the "neutral" label.
        num_examples: Number of headlines to generate per sentiment.

    Returns:
        None. The results are printed and written to disk.
    """
    seeds_by_label = (
        ("positive", positive_examples),
        ("negative", negative_examples),
        ("neutral", neutral_examples),
    )

    # Outer loop walks the labels in order, inner loop repeats the generation,
    # so rows are grouped by sentiment exactly as they are produced.
    rows = [
        [generate_synthetic_data(label, seeds), label]
        for label, seeds in seeds_by_label
        for _ in range(num_examples)
    ]

    frame = pd.DataFrame(rows, columns=["headline", "sentiment"])
    print(frame.head(5))  # quick visual check of the first rows

    out_path = f"{group_name}_synthetic_data.csv"
    frame.to_csv(out_path, index=False, encoding='utf-8')
    print(f"Saved {out_path}")

# Run the generation for every defined group, in order; each call prints a
# preview and writes that group's CSV file.
for _group_name, _positive, _negative, _neutral in (
    ("group_1", group_1_positive, group_1_negative, group_1_neutral),
    ("group_2", group_2_positive, group_2_negative, group_2_neutral),
    ("group_3", group_3_positive, group_3_negative, group_3_neutral),
    ("group_4", group_4_positive, group_4_negative, group_4_neutral),
):
    generate_and_save_headlines(_group_name, _positive, _negative, _neutral)
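# All four groups defined in this cell are generated above.
# To cover an additional group, run generate_and_save_headlines with its name
# and its positive, negative and neutral prompt lists, for example:
# generate_and_save_headlines("group_5", group_5_positive, group_5_negative, group_5_neutral)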


                                            headline sentiment
0  Funding for New Product Release in April New P...  positive
1  Hel Announces Major Partnership with Capital I...  positive
2  Reports Continued Growth Despite Market Challe...  positive
3  Funding for New Product Release in April New P...  positive
4  Hel Announces Major Partnership with Capital I...  positive
Saved group_1_synthetic_data.csv
                                            headline sentiment
0  EUR 300mn in Net Profit, Highlighting Strong Y...  positive
1  In Latest EUR Quarterly Report, Sales and Prof...  positive
2  Operating Profit Surges in EUR as Net Sales Ju...  positive
3  Operating Profit Surges in EUR as Net Sales Ju...  positive
4  EUR 300mn in Net Profit, Highlighting Strong Y...  positive
Saved group_2_synthetic_data.csv
                                            headline sentiment
0  As Power Units and Technology Services Expand ...  positive
1  As Share Prices Rise as Business Development P...