## Get Sector Relevance Data from News Articles

In [16]:
import pandas as pd
import requests
import re
import openai

from apikey import apikey_news_source
from apikey import apikey_openai

In [17]:
max_prompt_length = 4097

In [18]:
# Import user's watchlist
watchlist = pd.read_csv(r'C:\Users\samir\OneDrive\Desktop\News Stock Relevance Project\User data\watchlist.csv')

# Import dataframe with stocks and micro-sectors
stocks_data = pd.read_csv(r'C:\Users\samir\OneDrive\Desktop\News Stock Relevance Project\data\full stock list.csv')

In [19]:
# Merge user watchlist with stock data
merged_data = pd.merge(watchlist, stocks_data, on='Stock', how='inner')

# Extract micro-sectors from merged dataframe
micro_sectors = merged_data['Micro-Sectors'].tolist()

# Remove duplicate micro-sectors
unique_micro_sectors = list(set([sector.strip() for sectors in micro_sectors for sector in sectors.split(',')]))

# Print the list of unique micro-sectors
#print("Unique Micro-Sectors in User's Watchlist:")
#for sector in unique_micro_sectors:
#    print(sector)

In [20]:
# Create an empty DataFrame to store the news articles
articles_df = pd.DataFrame(columns=['Micro-Sector', 'Source', 'Author', 'Title', 'Description', 'URL', 'Published At', 'Content'])

# Iterate over unique micro-sectors
for sector in unique_micro_sectors:
    # Format the API URL for the specific micro-sector
    #api_url = f"https://newsapi.org/v2/everything?q={sector}&apiKey={apikey_news_source}"
    api_url = f"https://newsapi.org/v2/everything?q={sector}&from=2023-06-02&to=2023-05-18&apiKey={apikey_news_source}"
    
    # Make the API request
    response = requests.get(api_url)
    
    # Process the response and extract relevant information
    articles = response.json().get('articles')
    if articles:
        # Extract article information and add micro-sector column
        articles_data = [(sector, article.get('source').get('name'), article.get('author'), 
                          article.get('title'), article.get('description'), 
                          article.get('url'), article.get('publishedAt'), article.get('content'))  for article in articles]
        
        # Convert the list of articles to a DataFrame
        articles_sector_df = pd.DataFrame(articles_data, columns=['Micro-Sector', 'Source', 'Author', 'Title',
                                                                   'Description', 'URL', 'Published At', 'Content'])
        
        # Append the articles for the current micro-sector to the main DataFrame
        articles_df = pd.concat([articles_df, articles_sector_df], ignore_index=True)

In [21]:
articles_df = articles_df.tail(50)

In [22]:
# Set up OpenAI API credentials
openai.api_key = apikey_openai

# Iterate through each article
for index, article in articles_df.iterrows():
    # Extract the content to analyze
    content = f"{article['Title']} {article['Description']} {article['Content']}"
    micro_sector = article['Micro-Sector']
    
    # Prompt 1: Classify if the article is fact or opinion and assign a confidence score (1-5)
    prompt1 = f"Please classify the following article as 'Fact' or 'Opinion' and assign a confidence score (1-5):\n\n{content}"
    
    # Make API request to OpenAI API for classification
    response1 = openai.Completion.create(
        engine='text-davinci-003',
        prompt=prompt1,
        max_tokens=50,  # Adjust as needed
        temperature=0.5,  # Adjust as needed
        n=1  # Adjust as needed
    )
    
    # Parse the response and extract the classification and confidence score
    classification = response1.choices[0].text.strip().lower()
    confidence_score = int(''.join([token for token in response1.choices[0].text if token.isdigit()]))
    
    # Prompt 2: If confidence score is less than 4, classify the article based on its full contents
    if confidence_score < 4:
        link = article['URL']
        
        # Retrieve the full contents of the article from the provided link
        # Fetch the full contents of the article from the link
        response = requests.get(link)
        if response.status_code == 200:
            full_content = response.text
        else:
            full_content = ""
    else:
        full_content = content
        
        
    # Update the content variable with the full contents of the article if available
    #if full_content:
    content = f"{article['Title']} {article['Description']} {full_content}"
        
    # Prompt 2: Classify the article based on its full contents
    prompt2 = f"Please classify the following article based on its full contents as 'Fact' or 'Opinion':\n\n{full_content}"
        
    # Make API request to OpenAI API for classification
    response2 = openai.Completion.create(
    engine='text-davinci-003',
    prompt=prompt2[:max_prompt_length],
    max_tokens=50,  # Adjust as needed
    temperature=0.5,  # Adjust as needed
    n=1  # Adjust as needed
    )
        
    # Parse the response and extract the classification
    classification = response2.choices[0].text.strip().lower()
    
    # Prompt 3: Rank the relevance of the content to the micro-sector from an investment perspective
    prompt3 = f"Micro-Sector: {micro_sector}\nContent: {content}\nPlease rank the relevance of the content to the micro-sector from an investment perspective on a scale of 1 to 10."
    
    # Make API request to OpenAI API for relevance analysis
    response3 = openai.Completion.create(
        engine='text-davinci-003',
        prompt=prompt3[:max_prompt_length],
        max_tokens=50,  # Adjust as needed
        temperature=0.5,  # Adjust as needed
        n=1  # Adjust as needed
    )

   # Parse the response and extract the relevance score
    relevance_text = response3.choices[0].text.strip()
    relevance_score_match = re.search(r'\b(\d+)\b', relevance_text)

    if relevance_score_match:
        relevance_score = int(relevance_score_match.group())
        # Ensure the relevance score is within the range of 1 to 10
        relevance_score = max(1, min(10, relevance_score))
    else:
        # Handle the case when no relevance score is found
        relevance_score = 0  # or any default value you prefer
    
    # Parse the response and extract the relevance score
    #relevance_score = ''.join([token for token in response3.choices[0].text if token.isdigit()])
    print(relevance_score)
    
    # Assign the relevance score, classification, and confidence score to the corresponding micro-sector or update the DataFrame
    
    # Update the relevance score, classification, and confidence score in the DataFrame
    articles_df.loc[index, 'Relevance Score'] = relevance_score
    articles_df.loc[index, 'Classification'] = classification
    articles_df.loc[index, 'Confidence Score'] = confidence_score
    
    # Print the updated DataFrame with relevance scores
print("News Articles DataFrame with Relevance Scores, Classification, and Confidence Scores:")
print(articles_df)

9
8
6
9
9
8
9
9
9
9
8
9
9
8
8
8
7
8
9
9
7
9
8
9
8
8
9
8
9
9
9
8
8
9
8
8
9
9
9
7
9
8
8
7
7
8
9
7
9
8
News Articles DataFrame with Relevance Scores, Classification, and Confidence Scores:
           Micro-Sector                        Source  \
1475  Electric Vehicles                      Autoblog   
1476  Electric Vehicles                      Autoblog   
1477  Electric Vehicles                  The Guardian   
1478  Electric Vehicles                 Science Daily   
1479  Electric Vehicles                Digital Trends   
1480  Electric Vehicles                      Autoblog   
1481  Electric Vehicles                      Autoblog   
1482  Electric Vehicles                      Autoblog   
1483  Electric Vehicles                      Autoblog   
1484  Electric Vehicles                      Autoblog   
1485  Electric Vehicles                      Autoblog   
1486  Electric Vehicles                      Autoblog   
1487  Electric Vehicles                      Autoblog   
1488  Electric V

In [None]:
# Set up OpenAI API credentials
openai.api_key = apikey_openai

# Iterate through each article
for index, article in articles_df.iterrows():
    # Extract the content to analyze
    content = f"{article['Title']} {article['Description']} {article['Content']}"
    micro_sector = article['Micro-Sector']
    
    # Define the prompt using the micro-sector
    prompt = f"Micro-Sector: {micro_sector}\nContent: {content}\nAct as a Financial and Investment Advisor and using a very high standard of relevance from 1 to 10 please rate the relevance of the content to the micro-sector from an investment perspective. Please do not provide any text explanation of the relevance, please only respond with an integer between 1 to 10 to reflect a high standard of relevance of the content and the micro-sector"
    
    # Make API request to OpenAI API for relevance analysis
    response = openai.Completion.create(
        engine='text-davinci-003',
        prompt=prompt,
        max_tokens=50,  # Adjust as needed
        temperature=0.5,  # Adjust as needed
        n=1  # Adjust as needed
    )
    
    # Parse the response and extract the relevance score or other relevant information
    relevance_score = ''.join([token for token in response.choices[0].text if token.isdigit()])
    #relevance_score = response.choices[0].text
    print(relevance_score)
    
    # Assign the relevance score to the corresponding micro-sector or update the DataFrame
    
    # Update the relevance score in the DataFrame
    articles_df.loc[index, 'Relevance Score'] = relevance_score

# Print the updated DataFrame with relevance scores
print("News Articles DataFrame with Relevance Scores:")
print(articles_df)

In [23]:
articles_df

Unnamed: 0,Micro-Sector,Source,Author,Title,Description,URL,Published At,Content,Relevance Score,Classification,Confidence Score
1475,Electric Vehicles,Autoblog,Reuters,Senate Republicans urge EPA to drop plan to cu...,"Filed under:\n Government/Legal,Green,Electric...",https://www.autoblog.com/2023/05/25/senate-rep...,2023-05-25T16:51:00Z,WASHINGTON A group of 27 Senate Republicans on...,9.0,fact.,5.0
1476,Electric Vehicles,Autoblog,Associated Press,Toyota to invest $2.1 billion more in N.C. bat...,"Filed under:\n Green,Plants/Manufacturing,Toyo...",https://www.autoblog.com/2023/05/31/toyota-to-...,2023-05-31T16:15:00Z,"New Toyota President Koji Sato, left, and his ...",8.0,fact.,5.0
1477,Electric Vehicles,The Guardian,Guardian Staff,"On the climate crisis, we can’t afford to look...",Michael Penney and Mike Swinn respond to Gaby ...,https://www.theguardian.com/environment/2023/m...,2023-05-26T17:02:02Z,Gaby Hinsliff is right to draw attention to th...,6.0,opinion,4.0
1478,Electric Vehicles,Science Daily,,Metal shortage could put the brakes on electri...,As more and more electric cars are traveling o...,https://www.sciencedaily.com/releases/2023/05/...,2023-05-31T19:02:00Z,As more and more electric cars are travelling ...,9.0,fact,4.0
1479,Electric Vehicles,Digital Trends,Trevor Mogg,Waymo’s robotaxis are coming to Uber’s ridesha...,"In a partnership announced on Tuesday, Uber wi...",https://www.digitaltrends.com/cars/waymos-robo...,2023-05-24T02:00:43Z,Uber will soon offer rides in Waymo’s autonomo...,9.0,fact.,5.0
1480,Electric Vehicles,Autoblog,Jonathon Ramsey,2024 VW ID. Buzz debuts June 2 in Huntington B...,"Filed under:\n Green,Volkswagen,Minivan/Van,El...",https://www.autoblog.com/2023/05/19/2024-vw-id...,2023-05-19T13:10:00Z,"In February, we got spy shots of a practically...",8.0,fact.,5.0
1481,Electric Vehicles,Autoblog,James Riswick,"Here's $32,000. Which small SUV would you buy?","Filed under:\n Car Buying,Crossover,SUV,Electr...",https://www.autoblog.com/2023/05/25/what-would...,2023-05-25T14:00:00Z,After stepping into our DeLoreans last week to...,9.0,fact.,4.0
1482,Electric Vehicles,Autoblog,Jonathon Ramsey,Volvo EX30 interior is minimalism with clever ...,"Filed under:\n Green,Volvo,Crossover,Electric,...",https://www.autoblog.com/2023/05/31/volvo-ex30...,2023-05-31T22:14:00Z,Volvo's given us another piece of the battery-...,9.0,fact.,4.0
1483,Electric Vehicles,Autoblog,Reuters,Nasdaq warns truck maker Nikola it's about to ...,"Filed under:\n Green,Commercial Vehicles,Elect...",https://www.autoblog.com/2023/05/25/nasdaq-war...,2023-05-25T17:27:00Z,Nikola Corp said on Thursday that Nasdaq gave ...,9.0,fact.,5.0
1484,Electric Vehicles,Autoblog,Byron Hurd,Two high-performance Corvettes caught testing ...,"Filed under:\n Rumormill,Spy Photos,Chevrolet,...",https://www.autoblog.com/2023/05/23/two-high-p...,2023-05-23T21:51:00Z,A caravan of heavily camouflaged high-performa...,9.0,fact.,5.0


In [None]:
articles_df.to_csv("article_classify.csv", index=False)