# Historical stock data fetch

In [1]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta

# Define the top 5 tech companies' stock tickers
tickers = ["AMZN","GOOG","T","ABT","CVS","AMGN"]

# Calculate the date range for the past 5 years
start_date = "2019-12-01"
end_date = datetime.today().strftime('%Y-%m-%d')

# Initialize an empty DataFrame to store all the data
all_data = pd.DataFrame(columns=["Date", "Adj Close", "Close", "High", "Low", "Open", "Volume", "Ticker"])
# Loop through each ticker to fetch data
for ticker in tickers:
    print(f"Fetching data for {ticker}...")
    data = yf.download(ticker, start=start_date, end=end_date, interval="1d")
    data['Ticker'] = ticker  # Add the ticker column
    data.reset_index(inplace=True)  # Reset the index to include the date column
    data.columns = ["Date","Adj Close", "Close", "High", "Low", "Open", "Volume", "Ticker"]
    if ticker in ["AMZN","GOOG","T"]:
        data["Industry"] = "Tech"
    else:
        data["Industry"] = "Health"
    all_data = pd.concat([data, all_data], ignore_index=True)


all_data.to_csv("/Users/amily/Desktop/stock.csv", index=False)
all_data



Fetching data for AMZN...


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Fetching data for GOOG...
Fetching data for T...


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Fetching data for ABT...
Fetching data for CVS...


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Fetching data for AMGN...


Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,Ticker,Industry
0,2019-12-02 00:00:00+00:00,200.422028,233.479996,235.800003,232.630005,234.539993,1633600,AMGN,Health
1,2019-12-03 00:00:00+00:00,199.812546,232.770004,233.179993,230.779999,231.479996,1825400,AMGN,Health
2,2019-12-04 00:00:00+00:00,200.765396,233.880005,234.889999,232.000000,232.729996,1908500,AMGN,Health
3,2019-12-05 00:00:00+00:00,200.361954,233.410004,233.770004,231.460007,233.690002,2083600,AMGN,Health
4,2019-12-06 00:00:00+00:00,200.670990,233.770004,235.000000,233.190002,234.679993,1860900,AMGN,Health
...,...,...,...,...,...,...,...,...,...
7735,2025-01-13 00:00:00+00:00,218.460007,218.460007,219.399994,216.470001,218.059998,27262700,AMZN,Tech
7736,2025-01-14 00:00:00+00:00,217.759995,217.759995,221.820007,216.199997,220.440002,24711700,AMZN,Tech
7737,2025-01-15 00:00:00+00:00,223.350006,223.350006,223.570007,220.750000,222.830002,31291300,AMZN,Tech
7738,2025-01-16 00:00:00+00:00,220.660004,220.660004,224.649994,220.309998,224.419998,24757300,AMZN,Tech


# Company data fetch

In [96]:
stock = yf.Ticker(ticker[0])
    
# Fetch and transpose the financial statements
bs = stock.balance_sheet.T
f = stock.financials.T
cf = stock.cashflow.T
    
# Ensure unique columns for each DataFrame
bs.columns = [f"BS_{col}" for col in bs.columns]
f.columns = [f"IS_{col}" for col in f.columns]
cf.columns = [f"CF_{col}" for col in cf.columns]
combined = pd.concat([bs, f, cf], axis=1)


In [97]:

# Initialize an empty DataFrame
company_info = pd.DataFrame(columns = combined.columns)



# Loop through each ticker
for i in tickers:
    stock = yf.Ticker(i)
    
    # Fetch and transpose the financial statements
    bs = stock.balance_sheet.T
    f = stock.financials.T
    cf = stock.cashflow.T
    
    # Ensure unique columns for each DataFrame
    bs.columns = [f"BS_{col}" for col in bs.columns]
    f.columns = [f"IS_{col}" for col in f.columns]
    cf.columns = [f"CF_{col}" for col in cf.columns]

    # Concatenate horizontally
    combined = pd.concat([bs, f, cf], axis=1)
    combined["Ticker"] = i  
    if i in ["AMZN","GOOG","T"]:
        combined["Industry"] = "Tech"
    else:
        combined["Industry"] = "Health"
    
    # Append to the master DataFrame
    company_info = pd.concat([company_info, combined], ignore_index=False)

company_info.reset_index(inplace = True)
company_info





## 2024

import yfinance as yf
import pandas as pd

# Initialize an empty DataFrame
company_info_2024 = pd.DataFrame(columns = combined.columns)


# Loop through each ticker
for i in tickers:
    stock = yf.Ticker(i)
    
    # Fetch and transpose the financial statements
    bs = stock.quarterly_balance_sheet.T
    f = stock.quarterly_financials.T
    cf = stock.quarterly_cashflow.T
    
    # Ensure unique columns for each DataFrame
    bs.columns = [f"BS_{col}" for col in bs.columns]
    f.columns = [f"IS_{col}" for col in f.columns]
    cf.columns = [f"CF_{col}" for col in cf.columns]
    
    # Concatenate horizontally
    combined = pd.concat([bs, f, cf], axis=1)
    combined["Ticker"] = i  
    if i in ["GOOGL", "AMZN", "T"]:
        combined["Industry"] = "Tech"
    else:
        combined["Industry"] = "Health"
    
    # Ensure the index is a datetime object
    combined.index = pd.to_datetime(combined.index)

    # Filter for rows where the year is 2024 and month > 5
    combined = combined[(combined.index.year == 2024) & (combined.index.month > 8)]
    
    # Append to the master DataFrame
    company_info_2024 = pd.concat([company_info_2024, combined], ignore_index=False)
company_info_2024.reset_index(inplace=True)
company_info_2024



company_info = pd.concat([company_info_2024, company_info])
company_info


  company_info.reset_index(inplace = True)
  company_info_2024.reset_index(inplace=True)


Unnamed: 0,index,BS_Treasury Shares Number,BS_Ordinary Shares Number,BS_Share Issued,BS_Net Debt,BS_Total Debt,BS_Tangible Book Value,BS_Invested Capital,BS_Working Capital,BS_Net Tangible Assets,...,BS_Held To Maturity Securities,BS_Assets Held For Sale Current,BS_Restricted Cash,IS_Gain On Sale Of Ppe,IS_Restructuring And Mergern Acquisition,BS_Line Of Credit,BS_Investmentsin Associatesat Cost,IS_Otherunder Preferred Stock Dividend,CF_Preferred Stock Dividend Paid,CF_Change In Prepaid Assets
0,2024-09-30,515000000.0,10511000000.0,11026000000.0,,134692000000.0,236070000000.0,314041000000.0,14315000000.0,236070000000.0,...,,,,,,,,,,
1,2024-09-30,,12264000000.0,12264000000.0,,26922000000.0,282184000000.0,324999000000.0,76738000000.0,282184000000.0,...,,,,,,,,,,
2,2024-09-30,446348901.0,7174399697.0,7620748598.0,126426000000.0,146343000000.0,-93471000000.0,231363000000.0,-10859000000.0,-93471000000.0,...,,,,,,,,,,
3,2024-09-30,256595476.0,1734455938.0,1991051414.0,7421000000.0,14979000000.0,8786000000.0,54775000000.0,8900000000.0,8786000000.0,...,,,,,,,,,,
4,2024-09-30,519000000.0,1258000000.0,1777000000.0,58659000000.0,82704000000.0,-44145000000.0,140478000000.0,-17284000000.0,-44145000000.0,...,1315000000.0,,,0.0,1169000000.0,,,,,
5,2024-09-30,,537500000.0,537500000.0,51387000000.0,60398000000.0,-40051000000.0,67925000000.0,6454000000.0,-40051000000.0,...,,,,,,,,,,
0,2023-12-31,515000000.0,10383000000.0,10898000000.0,,135611000000.0,171399000000.0,260189000000.0,7434000000.0,171399000000.0,...,,,,,,,,,,
1,2022-12-31,515000000.0,10242000000.0,10757000000.0,13262000000.0,140118000000.0,119658000000.0,213193000000.0,-8602000000.0,119658000000.0,...,,,,,,,,,,
2,2021-12-31,460000000.0,10180000000.0,10640000000.0,12524000000.0,116395000000.0,117767000000.0,186989000000.0,19314000000.0,117767000000.0,...,,,,,,,,,,
3,2020-12-31,480000000.0,10060000000.0,10540000000.0,,84389000000.0,73406000000.0,125220000000.0,6348000000.0,73406000000.0,...,,,,,,,,,,


In [98]:
# Calculate the number of missing values per column
missing_values = company_info.isnull().sum()

# Calculate the percentage of missing values per column
missing_percentage = (company_info.isnull().sum() / len(company_info)) * 100

# Combine the results into a single DataFrame for better readability
missing_summary = pd.DataFrame({
    "Missing Values": missing_values,
    "Missing Percentage (%)": missing_percentage
})

# Display the summary
print(missing_summary)


# Filter for columns with missing percentage greater than 90%
missing_high = missing_summary[missing_summary["Missing Percentage (%)"] ==0]

remain = company_info[missing_high.index]
year = []
month = []
for i in company_info["index"]:
    year.append(i.year)
    month.append(i.month)
remain["index"] = year 
remain.sort_values(by=["Ticker","index"], inplace= True)
remain.to_csv("/Users/amily/Desktop/company_info.csv")



                                        Missing Values  Missing Percentage (%)
index                                                0                0.000000
BS_Treasury Shares Number                            8               26.666667
BS_Ordinary Shares Number                            0                0.000000
BS_Share Issued                                      0                0.000000
BS_Net Debt                                          8               26.666667
...                                                ...                     ...
BS_Line Of Credit                                   27               90.000000
BS_Investmentsin Associatesat Cost                  27               90.000000
IS_Otherunder Preferred Stock Dividend              27               90.000000
CF_Preferred Stock Dividend Paid                    29               96.666667
CF_Change In Prepaid Assets                         26               86.666667

[258 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  remain["index"] = year
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [99]:
remain

Unnamed: 0,index,BS_Ordinary Shares Number,BS_Share Issued,BS_Total Debt,BS_Tangible Book Value,BS_Invested Capital,BS_Working Capital,BS_Net Tangible Assets,BS_Common Stock Equity,BS_Total Capitalization,...,CF_Operating Cash Flow,CF_Cash Flow From Continuing Operating Activities,CF_Change In Working Capital,CF_Change In Receivables,CF_Changes In Account Receivables,CF_Depreciation Amortization Depletion,CF_Depreciation And Amortization,CF_Net Income From Continuing Operations,Ticker,Industry
15,2020,1771230274.0,1981156896.0,19649000000.0,-5744000000.0,51531000000.0,8534000000.0,-5744000000.0,32784000000.0,51311000000.0,...,7901000000.0,7901000000.0,-892000000.0,-924000000.0,-924000000.0,3327000000.0,3327000000.0,4495000000.0,ABT,Health
14,2021,1764082193.0,1985273421.0,19006000000.0,-168000000.0,53852000000.0,11134000000.0,-168000000.0,35802000000.0,53098000000.0,...,10533000000.0,10533000000.0,-771000000.0,-383000000.0,-383000000.0,3538000000.0,3538000000.0,7071000000.0,ABT,Health
13,2022,1737795021.0,1986519278.0,17716000000.0,3433000000.0,53459000000.0,9735000000.0,3433000000.0,36686000000.0,51208000000.0,...,9581000000.0,9581000000.0,-1519000000.0,-68000000.0,-68000000.0,3267000000.0,3267000000.0,6933000000.0,ABT,Health
12,2023,1734076358.0,1987883852.0,15628000000.0,6109000000.0,53282000000.0,8829000000.0,6109000000.0,38603000000.0,52202000000.0,...,7261000000.0,7261000000.0,-2475000000.0,-356000000.0,-356000000.0,3243000000.0,3243000000.0,5723000000.0,ABT,Health
3,2024,1734455938.0,1991051414.0,14979000000.0,8786000000.0,54775000000.0,8900000000.0,8786000000.0,39796000000.0,52621000000.0,...,2705000000.0,2705000000.0,163000000.0,-57000000.0,-57000000.0,801000000.0,801000000.0,1646000000.0,ABT,Health
23,2020,578300000.0,578300000.0,32986000000.0,-21867000000.0,42395000000.0,9491000000.0,-21867000000.0,9409000000.0,42304000000.0,...,10497000000.0,10497000000.0,-216000000.0,-427000000.0,-427000000.0,3601000000.0,3601000000.0,7264000000.0,AMGN,Health
22,2021,558300000.0,558300000.0,33309000000.0,-23372000000.0,40009000000.0,7201000000.0,-23372000000.0,6700000000.0,39922000000.0,...,9261000000.0,9261000000.0,-1194000000.0,-429000000.0,-429000000.0,3398000000.0,3398000000.0,5893000000.0,AMGN,Health
21,2022,534000000.0,534000000.0,38945000000.0,-27948000000.0,42606000000.0,6499000000.0,-27948000000.0,3661000000.0,41015000000.0,...,9721000000.0,9721000000.0,-733000000.0,-746000000.0,-746000000.0,3417000000.0,3417000000.0,6552000000.0,AMGN,Health
20,2023,535400000.0,535400000.0,64613000000.0,-45038000000.0,70845000000.0,11940000000.0,-45038000000.0,6232000000.0,69402000000.0,...,8471000000.0,8471000000.0,-484000000.0,-1015000000.0,-1015000000.0,4071000000.0,4071000000.0,6717000000.0,AMGN,Health
5,2024,537500000.0,537500000.0,60398000000.0,-40051000000.0,67925000000.0,6454000000.0,-40051000000.0,7527000000.0,64381000000.0,...,3571000000.0,3571000000.0,856000000.0,-342000000.0,-342000000.0,1396000000.0,1396000000.0,2830000000.0,AMGN,Health
