## Label Articles as 'Fact' or 'Opinion'

In [None]:
import pandas as pd
import numpy as np
import os
import json
from datetime import datetime, timedelta

import requests
import re
import openai
from google.cloud import bigquery
from bs4 import BeautifulSoup
from newspaper import Article, Config, ArticleException

from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains import SimpleSequentialChain

from apikey import apikey_news_source
from apikey import apikey_openai


In [None]:
# Load the previously collected articles from the local CSV export
articles_csv_path = r'C:\Users\samir\OneDrive\Desktop\News Stock Relevance Project\all_articles.csv'
all_articles_df = pd.read_csv(articles_csv_path)

In [None]:
# Discard rows that are missing a value in either of these columns
required_columns = ['Description', 'Content']
all_articles_df = all_articles_df.dropna(subset=required_columns)

In [None]:
# Upper bound applied when slicing the prompt string before it is sent to the
# completion endpoint (`prompt[:max_prompt_length]` in the classification loop).
# NOTE(review): the slice counts characters, not tokens -- confirm that 4097
# is meant as a character budget.
max_prompt_length = 4097

In [None]:
# Work on the first 10 rows only, renumbered from 0
first_rows = all_articles_df.head(10)
all_articles_df = first_rows.reset_index(drop=True)  # REMOVE IN PRODUCTION

In [None]:
# Authenticate the OpenAI client with the key imported from `apikey`
openai.api_key = apikey_openai

# User-Agent string presented to news sites when downloading articles
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'

# Newspaper3k settings shared by every Article created below:
# a request timeout of 15 and the browser-like User-Agent above
config = Config()
config.request_timeout = 15
config.browser_user_agent = user_agent

# Download every article, ask the OpenAI completion endpoint whether it reads
# as 'Fact' or 'Opinion', and store the prompt text and the raw reply on the
# DataFrame. Articles that cannot be downloaded or parsed are skipped.

# Create the output columns up front so they exist even when every download
# fails; otherwise later cells that read 'Classification' raise KeyError.
for output_column in ('Full Content', 'Classification'):
    if output_column not in all_articles_df.columns:
        all_articles_df[output_column] = None

# Iterate through each article
for index, row in all_articles_df.iterrows():
    print(index)
    link = row['URL']

    # Use Newspaper3k to fetch and parse the article. Only these two calls can
    # raise ArticleException, so the try block is limited to them.
    article = Article(link, config=config)
    try:
        article.download()
        article.parse()
    except ArticleException as e:
        print(f"Skipping article at {link} due to error: {str(e)}")
        continue

    full_content = article.text
    prompt_content = f"Title: {row['Title']}  Description: {row['Description']}  Full Content: {full_content}"

    # Ask for a one-word classification of the article's full contents
    prompt = f"Do not mention any other details, please classify full contents of this article as only 'Fact' or 'Opinion':\n\n{prompt_content}"

    # Make API request to OpenAI API for classification.
    # The prompt is cut to max_prompt_length characters before sending.
    # NOTE(review): `openai.Completion` / 'text-davinci-003' look like the
    # legacy (pre-1.0) SDK interface -- confirm the installed version supports them.
    response = openai.Completion.create(
        engine='text-davinci-003',
        prompt=prompt[:max_prompt_length],
        max_tokens=800,  # Adjust as needed
        temperature=0,  # Adjust as needed
        n=1,  # Adjust as needed
    )

    # Keep the raw reply (trimmed, lower-cased) alongside the text that was sent
    classification = response.choices[0].text.strip().lower()
    all_articles_df.loc[index, 'Full Content'] = prompt_content
    all_articles_df.loc[index, 'Classification'] = classification


In [None]:
# Collapse each free-text model reply into the canonical label "Fact" or
# "Opinion". Replies that contain neither word are left untouched.
for i, row in all_articles_df.iterrows():
    raw_label = row.get('Classification')
    # Articles skipped during download have no reply (NaN/None, or the column
    # may be missing entirely); re.sub would raise TypeError on such values.
    if not isinstance(raw_label, str):
        continue
    # Strip punctuation and compare case-insensitively so "Fact.", "FACT" and
    # "fact" are all recognised.
    words = set(re.sub(r'\W+', ' ', raw_label).lower().split())
    if 'fact' in words:
        all_articles_df.at[i, 'Classification'] = "Fact"
    elif 'opinion' in words:
        all_articles_df.at[i, 'Classification'] = "Opinion"

In [None]:
# Show the labelled DataFrame as this cell's output
all_articles_df

In [None]:
# Write the labelled articles to a CSV file, leaving out the index column
output_csv_path = "articles.csv"
all_articles_df.to_csv(output_csv_path, index=False)