## Label Articles as 'Fact' or 'Opinion'

In [3]:
import pandas as pd
import numpy as np
import os
import json
from datetime import datetime, timedelta

import requests
import re
import openai
from google.cloud import bigquery
from bs4 import BeautifulSoup
from newspaper import Article, Config

from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains import SimpleSequentialChain

from apikey import apikey_news_source
from apikey import apikey_openai


In [4]:
# Location of the scraped-articles CSV. The author's original local path stays
# the default; set the ALL_ARTICLES_CSV environment variable to run the
# notebook on another machine without editing this cell.
all_articles_csv = os.environ.get(
    'ALL_ARTICLES_CSV',
    r'C:\Users\samir\OneDrive\Desktop\News Stock Relevance Project\all_articles.csv',
)
all_articles_df = pd.read_csv(all_articles_csv)

In [5]:
# Discard every row that is missing either of the text fields listed here.
required_text_columns = ['Description', 'Content']
all_articles_df = all_articles_df.dropna(axis='index', how='any', subset=required_text_columns)

In [6]:
# Upper bound used when the prompt string is sliced (`prompt[:max_prompt_length]`)
# before it is sent to the completion endpoint, i.e. it caps the prompt in
# characters.
# NOTE(review): 4097 looks like a model token limit; confirm that applying it
# as a character count is intended.
max_prompt_length = 4097

In [7]:
# Development shortcut: keep only the first ten rows and renumber them from 0.
all_articles_df = all_articles_df.iloc[:10].reset_index(drop=True)  # REMOVE IN PRODUCTION

In [8]:
# Browser-like User-Agent string sent with every article download
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'

# Newspaper configuration shared by every Article created below
config = Config()
config.browser_user_agent = user_agent
config.request_timeout = 15  # per-request timeout (seconds per the newspaper3k docs; confirm)

# Classify Each Article as Fact or Opinion

# Set up OpenAI API credentials
openai.api_key = apikey_openai

# For each article: scrape the full text, ask the model for a one-word label,
# and store both the text that was sent and the raw (lower-cased) reply.
for index, row in all_articles_df.iterrows():
    link = row['URL']

    # Use Newspaper3k to fetch and parse the article. A dead link, blocked
    # request or timeout must not abort the whole run (and waste the API calls
    # already made), so failures are reported and handled below.
    try:
        article = Article(link, config=config)
        article.download()
        article.parse()
        full_content = article.text
    except Exception as exc:
        print(f"Could not retrieve {link!r} ({exc}); falling back to the stored 'Content' field.")
        full_content = ''

    # A failed or empty scrape falls back to the snippet already in the CSV
    # so the model still has some body text to judge.
    if not full_content:
        full_content = row['Content']

    prompt_content = f"Title: {row['Title']}  Description: {row['Description']}  Full Content: {full_content}"

    # Prompt 2: Classify the article based on its full contents
    prompt = f"Do not mention any other details, please classify full contents of this article as only 'Fact' or 'Opinion':\n\n{prompt_content}"

    # Make API request to OpenAI API for classification. The instruction is at
    # the start of the prompt, so the slice only ever drops the article's tail.
    response2 = openai.Completion.create(
        engine='text-davinci-003',
        prompt=prompt[:max_prompt_length],
        max_tokens=800,  # Adjust as needed
        temperature=0,  # Adjust as needed
        n=1  # Adjust as needed
    )
    classification = response2.choices[0].text.strip().lower()
    all_articles_df.loc[index, 'Full Content'] = prompt_content
    all_articles_df.loc[index, 'Classification'] = classification

In [9]:
# Reduce each free-text model reply to a bare label. A reply that mentions
# "fact" becomes "Fact"; otherwise one that mentions "opinion" becomes
# "Opinion"; anything else is left exactly as the model returned it.
# The capitalised spellings are accepted too, so re-running this cell is harmless.
raw_replies = all_articles_df['Classification'].tolist()
for row_label, raw_reply in zip(all_articles_df.index, raw_replies):
    # Replace punctuation runs with spaces, then compare whole words only
    tokens = set(re.sub(r'\W+', ' ', raw_reply).split())
    if tokens & {'Fact', 'fact'}:
        label = "Fact"
    elif tokens & {'Opinion', 'opinion'}:
        label = "Opinion"
    else:
        continue
    all_articles_df.at[row_label, 'Classification'] = label

In [10]:
# Bare expression: renders the labelled frame as this cell's output.
all_articles_df

Unnamed: 0,Source,Author,Title,Description,URL,Published At,Content,Full Content,Classification
0,The Washington Post,Dan Stillman,D.C.-area forecast: Sunshine and some smoke to...,Shower and storm chances return late Friday an...,https://www.washingtonpost.com/weather/2023/06...,2023-06-28T12:00:00Z,Comment on this story\r\nComment\r\n* Code Ora...,D.C.-area forecast: Sunshine and some smoke to...,Fact
1,CBS News,Emmet Lyons,Kevin Spacey's U.K. trial on sexual assault ch...,Oscar-winning actor Kevin Spacey faces a dozen...,https://www.cbsnews.com/news/kevin-spacey-uk-t...,2023-06-28T11:34:00Z,London – Kevin Spacey's trial began Wednesday ...,Kevin Spacey's U.K. trial on sexual assault ch...,Fact
2,CNN,Julia Buckley,Air passenger gets plane all to himself after ...,Everyone else made other plans when the flight...,https://www.cnn.com/travel/solo-air-passenger-...,2023-06-28T11:19:00Z,If youve ever thought that having an empty sea...,Air passenger gets plane all to himself after ...,Fact
3,WABC-TV,,Travel woes worsen for passengers stranded at ...,Hundreds of flights are already canceled as of...,https://abc7ny.com/flight-cancellations-lga-ne...,2023-06-28T09:11:15Z,"EAST ELMHURST, Queens (WABC) -- Hundreds of fl...",Travel woes worsen for passengers stranded at ...,Fact
4,DW (English),Deutsche Welle,South Koreans become a year or two younger ove...,A new law has scrapped South Korea's tradition...,https://www.dw.com/en/south-koreans-become-a-y...,2023-06-28T08:48:52Z,People in South Korea woke up on Wednesday to ...,South Koreans become a year or two younger ove...,Fact
5,SciTechDaily,,Intermittent Fasting vs. Calorie Counting: Wha...,A study by the University of Illinois Chicago ...,https://scitechdaily.com/?p=288129,2023-06-28T08:33:24Z,"ByAmerican College of PhysiciansJune 28, 2023\...",Intermittent Fasting vs. Calorie Counting: Wha...,Fact
6,ABC News,ÉDGAR H. CLEMENTE Associated Press,Gunmen abduct 14 state police officers in the ...,Armed men have abducted 14 state police office...,https://abcnews.go.com/International/wireStory...,2023-06-28T08:04:26Z,MEXICO CITY -- Armed men abducted 14 state pol...,Gunmen abduct 14 state police officers in the ...,Fact
7,CBS Sports,,"2023 MLB Mock Draft: LSU teammates, College Wo...","The 2023 draft begins Sunday, July 9",https://www.cbssports.com/mlb/news/2023-mlb-mo...,2023-06-28T06:32:00Z,The 2023 MLB Draft is a little more than two w...,"2023 MLB Mock Draft: LSU teammates, College Wo...",Fact
8,The Athletic,"Nicole Auerbach, Bruce Feldman, Jeff Zrebiec","Ryan Mallett, former Arkansas star and NFL QB,...",A Michigan source confirmed to The Athletic th...,https://theathletic.com/4646091/2023/06/27/rya...,2023-06-28T05:31:31Z,"By Nicole Auerbach, Bruce Feldman and Jeff Zre...","Ryan Mallett, former Arkansas star and NFL QB,...",Fact
9,ESPN,Alden Gonzalez,"Shohei Ohtani hits 2 HRs, K's 10 in start cut ...",Shohei Ohtani hit two home runs and struck out...,https://www.espn.com/mlb/story/_/id/37925744/s...,2023-06-28T05:09:00Z,"ANAHEIM, Calif. -- The diagnosis blared throug...","Shohei Ohtani hits 2 HRs, K's 10 in start cut ...",Fact


In [11]:
# Write the labelled articles to disk, leaving the row index out of the file.
labelled_articles_csv = "articles.csv"
all_articles_df.to_csv(path_or_buf=labelled_articles_csv, index=False)