In [27]:
import os
from dotenv import load_dotenv

# Dynamically get the path to the .env file for Jupyter/IPython
current_directory = os.getcwd()
env_path = os.path.join(current_directory, 'API_KEY.env')

# Load the .env file
load_dotenv(dotenv_path=env_path)

# Access API keys
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
NINJA_API_KEY = os.getenv("NINJA_API_KEY")
ALPHA_VANTAGE_API_KEY = os.getenv("ALPHA_VANTAGE_API_KEY")

In [47]:
import requests
import csv
import os
import time

# Replace with your actual API key
API_KEY = 'YOUR_API_KEY'
BASE_URL = 'https://api.api-ninjas.com/v1/earningstranscript'

# GICS Sector Tickers
tickers = [
    "AAPL", "MSFT",  # Information Technology
    "JNJ", "PFE",    # Healthcare
    "JPM", "BAC",    # Financials
    "AMZN", "TSLA",  # Consumer Discretionary
    "PG", "KO",      # Consumer Staples
    "XOM", "CVX",    # Energy
    "BA", "CAT",     # Industrials
    "LIN", "DOW",    # Materials
    "AMT", "SPG",    # Real Estate
    "NEE", "DUK",    # Utilities
    "GOOGL", "META"  # Communication Services
]

# Define the range of years and quarters
start_year = 2019
end_year = 2023

# CSV file to save transcripts
output_file = "gics_transcripts.csv"

# Initialize the CSV file
if not os.path.exists(output_file):
    with open(output_file, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["ticker", "year", "quarter", "transcript"])  # Header row

# Function to fetch and save transcripts
def fetch_and_save_transcripts():
    for ticker in tickers:
        for year in range(start_year, end_year + 1):
            for quarter in range(1, 5):
                params = {
                    'ticker': ticker,
                    'year': year,
                    'quarter': quarter
                }
                headers = {
                    'X-Api-Key': API_KEY
                }

                try:
                    response = requests.get(BASE_URL, headers=headers, params=params)
                    if response.status_code == 200:
                        data = response.json()
                        transcript = data.get("transcript", "")
                        if transcript:
                            # Save to CSV
                            with open(output_file, "a", newline="") as f:
                                writer = csv.writer(f)
                                writer.writerow([ticker, year, quarter, transcript])
                            print(f"Saved: {ticker} {year} Q{quarter}")
                        else:
                            print(f"No transcript available for {ticker} {year} Q{quarter}")
                    else:
                        print(f"Failed to fetch {ticker} {year} Q{quarter}: {response.status_code} - {response.text}")

                    time.sleep(1)  # Pause to respect API rate limits

                except Exception as e:
                    print(f"Error fetching {ticker} {year} Q{quarter}: {e}")

# Fetch and save transcripts
fetch_and_save_transcripts()


Saved: AAPL 2019 Q1
Saved: AAPL 2019 Q2
Saved: AAPL 2019 Q3
Saved: AAPL 2019 Q4
Saved: AAPL 2020 Q1
Saved: AAPL 2020 Q2
Saved: AAPL 2020 Q3
Saved: AAPL 2020 Q4
Saved: AAPL 2021 Q1
Saved: AAPL 2021 Q2
Saved: AAPL 2021 Q3
Saved: AAPL 2021 Q4
Saved: AAPL 2022 Q1
Saved: AAPL 2022 Q2
Saved: AAPL 2022 Q3
Saved: AAPL 2022 Q4
Saved: AAPL 2023 Q1
Saved: AAPL 2023 Q2
Saved: AAPL 2023 Q3
Saved: AAPL 2023 Q4
Saved: MSFT 2019 Q1
Saved: MSFT 2019 Q2
Saved: MSFT 2019 Q3
Saved: MSFT 2019 Q4
Saved: MSFT 2020 Q1
Saved: MSFT 2020 Q2
Saved: MSFT 2020 Q3
Saved: MSFT 2020 Q4
Saved: MSFT 2021 Q1
Saved: MSFT 2021 Q2
Saved: MSFT 2021 Q3
Saved: MSFT 2021 Q4
Saved: MSFT 2022 Q1
Saved: MSFT 2022 Q2
Saved: MSFT 2022 Q3
Saved: MSFT 2022 Q4
Saved: MSFT 2023 Q1
Saved: MSFT 2023 Q2
Saved: MSFT 2023 Q3
Saved: MSFT 2023 Q4
Saved: JNJ 2019 Q1
Saved: JNJ 2019 Q2
Saved: JNJ 2019 Q3
Saved: JNJ 2019 Q4
Saved: JNJ 2020 Q1
Saved: JNJ 2020 Q2
Saved: JNJ 2020 Q3
Saved: JNJ 2020 Q4
Saved: JNJ 2021 Q1
Saved: JNJ 2021 Q2
Saved: JNJ

In [1]:
import re
import pandas as pd

def clean_transcript_with_titles(text):
    """
    Cleans a transcript by removing names while preserving titles and other essential information.
    """
    if not isinstance(text, str) or text.strip() == "":
        return ""  # Return empty string for invalid input

    # Step 1: Start from "CEO"
    match = re.search(r"CEO:.*", text, flags=re.IGNORECASE | re.DOTALL)
    text = match.group(0) if match else text

    # Step 2: Remove ending formalities
    text = re.sub(r"(a replay of today.*?available.*?|members of the press.*?|financial analysts.*?|thanks again.*?|that does conclude.*?today\.).*", "", text, flags=re.IGNORECASE | re.DOTALL)

    # Step 3: Remove names while keeping titles
    text = re.sub(r"\b([A-Z][a-z]+ [A-Z][a-z]+)\b(,| -| - | is)?\s*(CEO|CFO|COO|Chairman|President|VP|Director|Manager|Analyst|Consultant)", r"\3", text, flags=re.IGNORECASE)

    # Step 4: Remove common formalities
    patterns_to_remove = [
        r"\b(operator|thank you|good (morning|afternoon|evening|day)|please note.*)\b",  # Formal greetings
        r"\bwe are excited\b",  # Overly promotional phrases
        r"\bcan you repeat\b",  # Redundant operator exchanges
    ]
    for pattern in patterns_to_remove:
        text = re.sub(pattern, "", text, flags=re.IGNORECASE)

    # Step 5: Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Load the transcripts dataset
file_path = "gics_transcripts.csv"  # Replace with your actual file path
try:
    transcripts_df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"File {file_path} not found. Ensure the file exists.")
    raise

# Apply the cleaning function
transcripts_df['cleaned_transcript'] = transcripts_df['transcript'].apply(clean_transcript_with_titles)

# Save the cleaned transcripts
output_file = "cleaned_transcripts_final.csv"
transcripts_df.to_csv(output_file, index=False)

# Display a sample of the cleaned data
print(transcripts_df[['transcript', 'cleaned_transcript']].head(10))


                                          transcript  \
0  Operator: Good day and welcome to the Apple In...   
1  Operator: Good day, and welcome to the Apple I...   
2  Operator: Good day, and welcome to the Apple I...   
3  Operator: Good day, everyone. Welcome to the A...   
4  Operator: Good day, everyone. Welcome to the A...   
5  Operator: Good day everyone. Welcome to the Ap...   
6  Operator: Good day, everyone. Welcome to the A...   
7  Operator: Good day everyone and welcome to the...   
8  Operator: Good day, and welcome to the Apple Q...   
9  Operator: Good day, and welcome to the Apple Q...   

                                  cleaned_transcript  
0  : and welcome to the Apple Incorporated First ...  
1  : , and welcome to the Apple Incorporated Seco...  
2  : , and welcome to the Apple Incorporated Thir...  
3  : , everyone. Welcome to the Apple Incorporate...  
4  : , everyone. Welcome to the Apple Incorporate...  
5  : everyone. Welcome to the Apple Incorporated ... 

In [None]:
import pandas as pd
import requests
import json
import pandas as pd
import re
import tiktoken
import time
import os
import openai
import requests


API_KEY 


headers = {
    "Authorization": f"Bearer {API_KEY}",
    "Content-Type": "application/json"
}

OPENAI_API_URL = "https://api.openai.com/v1/chat/completions"

# Function to analyze sentiment
def analyze_sentiment_with_openai(text):
    headers = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"}
    try:
        payload = {
            "model": "gpt-3.5-turbo",
            "messages": [
                {"role": "system", "content": "You are a sentiment analysis assistant."},
                {"role": "user", "content": f"Analyze the sentiment of this text and provide weights for positive, neutral, and negative sentiment in JSON format. Text: {text}"}
            
            ]
        }
        response = requests.post(OPENAI_API_URL, headers=headers, json=payload)
        response.raise_for_status()
        result = response.json()
        return result['choices'][0]['message']['content'].strip()
    except Exception as e:
        print(f"Error analyzing sentiment: {e}")
        return {"positive": 0, "neutral": 0, "negative": 0}  # Default weights if error occurs


# Load the dataset
file_path = "cleaned_transcripts_final.csv"  # Update with the correct path to your file
try:
    transcripts_df = pd.read_csv(file_path)
    print("File loaded successfully.")
except FileNotFoundError:
    print(f"File {file_path} not found.")
    raise

# Perform sentiment analysis for each transcript
print("Starting sentiment analysis...")
transcripts_df['sentiment'] = transcripts_df['cleaned_transcript'].apply(analyze_sentiment_with_openai)

# Extract sentiment weights into separate columns
transcripts_df['positive_weight'] = transcripts_df['sentiment'].apply(lambda x: x.get('positive', 0))
transcripts_df['neutral_weight'] = transcripts_df['sentiment'].apply(lambda x: x.get('neutral', 0))
transcripts_df['negative_weight'] = transcripts_df['sentiment'].apply(lambda x: x.get('negative', 0))

# Save the results
output_file = "final_sentimental_analysis.csv"
transcripts_df.to_csv(output_file, index=False)
print(f"Sentiment analysis completed. Results saved to {output_file}.")

# Display a sample of the processed data
print(transcripts_df[['cleaned_transcript', 'positive_weight', 'neutral_weight', 'negative_weight']].head())

In [3]:
# Let's load the final_sentimental_analysis.csv file and check its information and missing/zero values.
import pandas as pd

# Load the file
file_path = "final_sentimental_analysis.csv"
final_sentimental_analysis = pd.read_csv(file_path)

# Display the file info
file_info = final_sentimental_analysis.info()

# Check for missing or zero values in the Positive, Neutral, and Negative columns
missing_values = final_sentimental_analysis[['Positive', 'Neutral', 'Negative']].isnull().sum()
zero_values = (final_sentimental_analysis[['Positive', 'Neutral', 'Negative']] == 0).sum()

file_info, missing_values, zero_values


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ticker              440 non-null    object 
 1   year                440 non-null    int64  
 2   quarter             440 non-null    int64  
 3   transcript          440 non-null    object 
 4   cleaned_transcript  440 non-null    object 
 5   Transcript Number   440 non-null    int64  
 6   Positive            440 non-null    float64
 7   Neutral             440 non-null    float64
 8   Negative            440 non-null    float64
dtypes: float64(3), int64(3), object(3)
memory usage: 31.1+ KB


(None,
 Positive    0
 Neutral     0
 Negative    0
 dtype: int64,
 Positive      0
 Neutral       0
 Negative    133
 dtype: int64)