In [None]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta

# Define the top 5 tech companies' stock tickers
tickers = ["GOOGL", "AMZN","META", "UNH","JNJ","PFE"]

# Calculate the date range for the past 5 years
start_date = "2019-12-01"
end_date = datetime.today().strftime('%Y-%m-%d')

# Initialize an empty DataFrame to store all the data
all_data = pd.DataFrame(columns=["Date", "Adj Close", "Close", "High", "Low", "Open", "Volume", "Ticker"])
# Loop through each ticker to fetch data
for ticker in tickers:
    print(f"Fetching data for {ticker}...")
    data = yf.download(ticker, start=start_date, end=end_date, interval="1d")
    data['Ticker'] = ticker  # Add the ticker column
    data.reset_index(inplace=True)  # Reset the index to include the date column
    data.columns = ["Date","Adj Close", "Close", "High", "Low", "Open", "Volume", "Ticker"]
    if ticker in ["GOOGL", "AMZN","META"]:
        data["Industry"] = "Tech"
    else:
        data["Industry"] = "Health"
    all_data = pd.concat([data, all_data], ignore_index=True)


all_data.to_csv("/Users/amily/Desktop/stock.csv", index=False)
all_data



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Fetching data for GOOGL...
Fetching data for AMZN...
Fetching data for META...
Fetching data for UNH...
Fetching data for JNJ...



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Fetching data for PFE...


Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,Ticker,Industry
0,2019-12-02 00:00:00+00:00,29.327114,36.328274,36.812145,36.290321,36.802658,15645998,PFE,Health
1,2019-12-03 00:00:00+00:00,29.143291,36.100571,36.242886,35.901329,36.015179,20869832,PFE,Health
2,2019-12-04 00:00:00+00:00,29.212223,36.185959,36.299809,35.948769,36.271347,14501248,PFE,Health
3,2019-12-05 00:00:00+00:00,29.127970,36.081593,36.404175,35.853889,36.404175,13098058,PFE,Health
4,2019-12-06 00:00:00+00:00,29.327114,36.328274,36.546490,36.261860,36.318787,12868708,PFE,Health
...,...,...,...,...,...,...,...,...,...
7699,2025-01-02 00:00:00+00:00,189.429993,189.429993,192.000000,187.500000,190.649994,20370800,GOOGL,Tech
7700,2025-01-03 00:00:00+00:00,191.789993,191.789993,193.210007,189.979996,191.369995,18596200,GOOGL,Tech
7701,2025-01-06 00:00:00+00:00,196.869995,196.869995,198.220001,193.850006,193.979996,29563600,GOOGL,Tech
7702,2025-01-07 00:00:00+00:00,195.490005,195.490005,201.000000,194.600006,197.110001,26487200,GOOGL,Tech


In [3]:
stock = yf.Ticker("AAPL")
    
# Fetch and transpose the financial statements
bs = stock.balance_sheet.T
f = stock.financials.T
cf = stock.cashflow.T
    
# Ensure unique columns for each DataFrame
bs.columns = [f"BS_{col}" for col in bs.columns]
f.columns = [f"IS_{col}" for col in f.columns]
cf.columns = [f"CF_{col}" for col in cf.columns]
combined = pd.concat([bs, f, cf], axis=1)


In [4]:

# Initialize an empty DataFrame
company_info = pd.DataFrame(columns = combined.columns)



# Loop through each ticker
for i in tickers:
    stock = yf.Ticker(i)
    
    # Fetch and transpose the financial statements
    bs = stock.balance_sheet.T
    f = stock.financials.T
    cf = stock.cashflow.T
    
    # Ensure unique columns for each DataFrame
    bs.columns = [f"BS_{col}" for col in bs.columns]
    f.columns = [f"IS_{col}" for col in f.columns]
    cf.columns = [f"CF_{col}" for col in cf.columns]

    # Concatenate horizontally
    combined = pd.concat([bs, f, cf], axis=1)
    combined["Ticker"] = i  
    if i in ["GOOGL", "AMZN","META"]:
        combined["Industry"] = "Tech"
    else:
        combined["Industry"] = "Health"
    
    # Append to the master DataFrame
    company_info = pd.concat([company_info, combined], ignore_index=False)

company_info.reset_index(inplace = True)
company_info





## 2024

import yfinance as yf
import pandas as pd

# Initialize an empty DataFrame
company_info_2024 = pd.DataFrame(columns = combined.columns)

# List of tickers
tickers = ["GOOGL", "AMZN", "META", "UNH", "JNJ", "PFE"]

# Loop through each ticker
for i in tickers:
    stock = yf.Ticker(i)
    
    # Fetch and transpose the financial statements
    bs = stock.quarterly_balance_sheet.T
    f = stock.quarterly_financials.T
    cf = stock.quarterly_cashflow.T
    
    # Ensure unique columns for each DataFrame
    bs.columns = [f"BS_{col}" for col in bs.columns]
    f.columns = [f"IS_{col}" for col in f.columns]
    cf.columns = [f"CF_{col}" for col in cf.columns]
    
    # Concatenate horizontally
    combined = pd.concat([bs, f, cf], axis=1)
    combined["Ticker"] = i  
    if i in ["GOOGL", "AMZN", "META"]:
        combined["Industry"] = "Tech"
    else:
        combined["Industry"] = "Health"
    
    # Ensure the index is a datetime object
    combined.index = pd.to_datetime(combined.index)

    # Filter for rows where the year is 2024 and month > 5
    combined = combined[(combined.index.year == 2024) & (combined.index.month > 8)]
    
    # Append to the master DataFrame
    company_info_2024 = pd.concat([company_info_2024, combined], ignore_index=False)
company_info_2024.reset_index(inplace=True)
company_info_2024



company_info = pd.concat([company_info_2024, company_info])
company_info


  company_info.reset_index(inplace = True)
  company_info_2024.reset_index(inplace=True)


Unnamed: 0,index,BS_Treasury Shares Number,BS_Ordinary Shares Number,BS_Share Issued,BS_Net Debt,BS_Total Debt,BS_Tangible Book Value,BS_Invested Capital,BS_Working Capital,BS_Net Tangible Assets,...,CF_Gain Loss On Sale Of Business,BS_Liabilities Heldfor Sale Non Current,BS_Prepaid Assets,IS_Salaries And Wages,CF_Provisionand Write Offof Assets,CF_Sale Of Business,BS_Other Payable,IS_Otherunder Preferred Stock Dividend,CF_Unrealized Gain Loss On Investment Securities,BS_Preferred Securities Outside Stock Equity
0,2024-09-30,,12264000000.0,12264000000.0,,26922000000.0,282184000000.0,324999000000.0,76738000000.0,282184000000.0,...,,,,,,,,,,
1,2024-09-30,515000000.0,10511000000.0,11026000000.0,,134692000000.0,236070000000.0,314041000000.0,14315000000.0,236070000000.0,...,,,,,,,,,,
2,2024-09-30,,2524000000.0,2524000000.0,,49047000000.0,143875000000.0,193352000000.0,57737000000.0,143875000000.0,...,,,,,,,,,,
3,2024-09-30,,923000000.0,923000000.0,45610000000.0,78010000000.0,-35037000000.0,172545000000.0,-9307000000.0,-35037000000.0,...,20000000.0,,,,,,,,,
4,2024-09-30,712545000.0,2407298000.0,3119843000.0,15771000000.0,35751000000.0,-14131000000.0,105909000000.0,1490000000.0,-14131000000.0,...,,,,-200000000.0,-11000000.0,,,,,
5,2024-09-30,3926000000.0,5666000000.0,9592000000.0,65525000000.0,66617000000.0,-36270000000.0,158903000000.0,12000000.0,-36270000000.0,...,,,,,,0.0,,,,
0,2023-12-31,0.0,12460000000.0,12460000000.0,,28504000000.0,254181000000.0,295249000000.0,89716000000.0,254181000000.0,...,,,,,,,,,,
1,2022-12-31,,12849000000.0,12849000000.0,,29679000000.0,227184000000.0,269001000000.0,95495000000.0,227184000000.0,...,,,,,,,,,,
2,2021-12-31,,13242420000.0,13242420000.0,,28395000000.0,227262000000.0,264479000000.0,123889000000.0,227262000000.0,...,,,,,,,397000000.0,,,
3,2020-12-31,,13504440000.0,13504440000.0,,26772000000.0,199924000000.0,235376000000.0,117462000000.0,199924000000.0,...,,,,,,,754000000.0,,,


In [5]:
# Calculate the number of missing values per column
missing_values = company_info.isnull().sum()

# Calculate the percentage of missing values per column
missing_percentage = (company_info.isnull().sum() / len(company_info)) * 100

# Combine the results into a single DataFrame for better readability
missing_summary = pd.DataFrame({
    "Missing Values": missing_values,
    "Missing Percentage (%)": missing_percentage
})

# Display the summary
print(missing_summary)


# Filter for columns with missing percentage greater than 90%
missing_high = missing_summary[missing_summary["Missing Percentage (%)"] ==0]

remain = company_info[missing_high.index]
year = []
month = []
for i in company_info["index"]:
    year.append(i.year)
    month.append(i.month)
remain["index"] = year 
remain.sort_values(by=["Ticker","index"], inplace= True)
remain.to_csv("/Users/amily/Desktop/company_info.csv")



                                                  Missing Values  \
index                                                          0   
BS_Treasury Shares Number                                     13   
BS_Ordinary Shares Number                                      0   
BS_Share Issued                                                0   
BS_Net Debt                                                   13   
...                                                          ...   
CF_Sale Of Business                                           29   
BS_Other Payable                                              26   
IS_Otherunder Preferred Stock Dividend                        29   
CF_Unrealized Gain Loss On Investment Securities              27   
BS_Preferred Securities Outside Stock Equity                  29   

                                                  Missing Percentage (%)  
index                                                           0.000000  
BS_Treasury Shares Number        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  remain["index"] = year
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [6]:
remain

Unnamed: 0,index,BS_Ordinary Shares Number,BS_Share Issued,BS_Total Debt,BS_Tangible Book Value,BS_Invested Capital,BS_Working Capital,BS_Net Tangible Assets,BS_Common Stock Equity,BS_Total Capitalization,...,CF_Cash Flow From Continuing Operating Activities,CF_Change In Working Capital,CF_Stock Based Compensation,CF_Deferred Tax,CF_Deferred Income Tax,CF_Depreciation Amortization Depletion,CF_Depreciation And Amortization,CF_Net Income From Continuing Operations,Ticker,Industry
7,2020,10060000000.0,10540000000.0,84389000000.0,73406000000.0,125220000000.0,6348000000.0,73406000000.0,93404000000.0,125220000000.0,...,66064000000.0,13481000000.0,9208000000.0,-554000000.0,-554000000.0,25180000000.0,25180000000.0,21331000000.0,AMZN,Tech
6,2021,10180000000.0,10640000000.0,116395000000.0,117767000000.0,186989000000.0,19314000000.0,117767000000.0,138245000000.0,186989000000.0,...,46327000000.0,-19611000000.0,12757000000.0,-310000000.0,-310000000.0,34433000000.0,34433000000.0,33364000000.0,AMZN,Tech
5,2022,10242000000.0,10757000000.0,140118000000.0,119658000000.0,213193000000.0,-8602000000.0,119658000000.0,146043000000.0,213193000000.0,...,46752000000.0,-20886000000.0,19621000000.0,-8148000000.0,-8148000000.0,41921000000.0,41921000000.0,-2722000000.0,AMZN,Tech
4,2023,10383000000.0,10898000000.0,135611000000.0,171399000000.0,260189000000.0,7434000000.0,171399000000.0,201875000000.0,260189000000.0,...,84946000000.0,-11541000000.0,24023000000.0,-5876000000.0,-5876000000.0,48663000000.0,48663000000.0,30425000000.0,AMZN,Tech
1,2024,10511000000.0,11026000000.0,134692000000.0,236070000000.0,314041000000.0,14315000000.0,236070000000.0,259151000000.0,314041000000.0,...,25971000000.0,-6674000000.0,5333000000.0,-1317000000.0,-1317000000.0,13442000000.0,13442000000.0,15328000000.0,AMZN,Tech
3,2020,13504440000.0,13504440000.0,26772000000.0,199924000000.0,235376000000.0,117462000000.0,199924000000.0,222544000000.0,235376000000.0,...,65124000000.0,1827000000.0,12991000000.0,1390000000.0,1390000000.0,13697000000.0,13697000000.0,40269000000.0,GOOGL,Tech
2,2021,13242420000.0,13242420000.0,28395000000.0,227262000000.0,264479000000.0,123889000000.0,227262000000.0,251635000000.0,264479000000.0,...,91652000000.0,-1523000000.0,15376000000.0,1808000000.0,1808000000.0,12441000000.0,12441000000.0,76033000000.0,GOOGL,Tech
1,2022,12849000000.0,12849000000.0,29679000000.0,227184000000.0,269001000000.0,95495000000.0,227184000000.0,256144000000.0,269001000000.0,...,91495000000.0,-2235000000.0,19362000000.0,-8081000000.0,-8081000000.0,13475000000.0,13475000000.0,59972000000.0,GOOGL,Tech
0,2023,12460000000.0,12460000000.0,28504000000.0,254181000000.0,295249000000.0,89716000000.0,254181000000.0,283379000000.0,295249000000.0,...,101746000000.0,-3845000000.0,22460000000.0,-7763000000.0,-7763000000.0,11946000000.0,11946000000.0,73795000000.0,GOOGL,Tech
0,2024,12264000000.0,12264000000.0,26922000000.0,282184000000.0,324999000000.0,76738000000.0,282184000000.0,314119000000.0,324999000000.0,...,30698000000.0,-3789000000.0,5846000000.0,-1071000000.0,-1071000000.0,3985000000.0,3985000000.0,26301000000.0,GOOGL,Tech
