Importing the necessary libraries

In [44]:
import pandas as pd
import datetime as dt
from datetime import datetime
import pandas_market_calendars as mcal
import requests
from bs4 import BeautifulSoup
import re
import csv

For each sector, we create two dictionaries: one mapping each ticker to its stock time-series DataFrame, and another mapping each ticker to its company name.

Sector to ticker mapping data has been obtained using web scraping from https://www.stockmonitor.com/sectors/

In [45]:
def create_dfs(ticker_csv, data_dir="Data/Stocks"):
    """Load the price history of every ticker listed in a sector CSV.

    Parameters
    ----------
    ticker_csv : str
        Path to a CSV file with ``Ticker`` and ``Company_Name`` columns.
    data_dir : str, optional
        Directory holding the per-ticker price files, each named
        ``<lowercase ticker>.us.txt``. Defaults to ``"Data/Stocks"``.

    Returns
    -------
    tuple of (dict, dict)
        ``(df_dict, ticker_cmpny)`` where ``df_dict`` maps ticker -> price
        DataFrame indexed by a ``DatetimeIndex`` built from the ``Date``
        column, and ``ticker_cmpny`` maps ticker -> company name. Tickers
        whose price file is missing or empty appear in neither dictionary.
    """
    df_dict = {}
    ticker_cmpny = {}
    # Keep the ticker table under its own name so the loop variable does not
    # shadow it.
    tickers = pd.read_csv(ticker_csv)
    for ticker_name, company_name in zip(tickers['Ticker'], tickers['Company_Name']):
        filename = data_dir + "/" + ticker_name.lower() + ".us.txt"
        try:
            company_df = pd.read_csv(filename)
        except FileNotFoundError:
            # No price history shipped for this ticker: skip it silently.
            continue
        except pd.errors.EmptyDataError:
            # EmptyDataError lives in pandas.errors; the bare name is not
            # imported anywhere, so referencing it would raise NameError.
            print ("file " + filename + " is empty !!!")
            continue

        company_df = company_df.set_index('Date')
        company_df.index = pd.to_datetime(company_df.index)
        df_dict[ticker_name] = company_df
        ticker_cmpny[ticker_name] = company_name

    return (df_dict, ticker_cmpny)

# The triple-quoted string below is an inert expression, not executed code: it
# holds disabled per-sector loading calls.
# NOTE(review): if re-enabled, each call must unpack the 2-tuple returned by
# create_dfs; the first line's single-target unpacking (`x, = ...`) would fail.
"""    
communication_sector_dfs, = create_dfs("communication_service_ticker.csv")
materials_sector_dfs = create_dfs("materials_ticker.csv")
consumer_cyclical_dfs = create_dfs("consumer_cyclical_ticker.csv")
consumer_defensive_dfs = create_dfs("consumer_defensive_ticker.csv")
energy_sector_dfs = create_dfs("energy_ticker.csv")
financial_service_dfs = create_dfs("financial_ticker.csv")
healthcare_sector_dfs = create_dfs("healthcare_ticker.csv")
industrial_sector_dfs=create_dfs("industrial_ticker.csv")
technology_sector_dfs=create_dfs("technology_ticker.csv")
utility_sector_dfs=create_dfs("utility_ticker.csv")
"""

# Load the consumer-cyclical sector: ticker -> price DataFrame, and
# ticker -> company name.
consumer_cyclical_dfs, ticker_cmpny_dict = create_dfs("consumer_cyclical_ticker.csv")

# Prints both dictionaries in full (every DataFrame's repr), so the output is long.
print(consumer_cyclical_dfs, ticker_cmpny_dict)

{'AAN':               Open    High     Low   Close   Volume  OpenInt
Date                                                        
2005-02-25  12.920  13.470  12.920  13.295   933743        0
2005-02-28  13.371  13.470  13.118  13.381   683911        0
2005-03-01  13.371  13.794  13.342  13.734   624585        0
2005-03-02  13.607  14.038  13.568  13.822   387289        0
2005-03-03  13.891  13.891  13.450  13.627   362061        0
...            ...     ...     ...     ...      ...      ...
2017-11-06  36.280  36.570  35.570  35.590  1029506        0
2017-11-07  35.720  35.720  34.290  34.440  1149793        0
2017-11-08  35.510  36.090  34.800  35.990  1805497        0
2017-11-09  35.680  35.860  35.230  35.620   979951        0
2017-11-10  37.600  38.060  36.260  36.590  1214968        0

[3201 rows x 6 columns], 'ANF':               Open    High      Low   Close   Volume  OpenInt
Date                                                         
2005-02-25  44.731  44.928  44.3330  44.71

Cleaning the data by finding the days missing from each stock's time series. We compare the time series dates with the New York Stock Exchange (NYSE) open dates. If a company's number of missing days reaches the limit passed to the function, we drop that company from further analysis; otherwise we merge its time series DataFrame with the NYSE open-days DataFrame using a right join and impute the missing days with the 'ffill' (forward-fill) method.



In [46]:
def extract_missing_days(df1, df2, missing_days_dict, key=None):
    """Record the index labels present in ``df1`` but absent from ``df2``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Reference frame (e.g. one row per exchange trading day).
    df2 : pandas.DataFrame
        Frame to check against the reference.
    missing_days_dict : dict
        Updated in place: ``missing_days_dict[key]`` is set to the list of
        labels found in ``df1.index`` but not in ``df2.index``.
    key : hashable
        Dictionary key (typically the ticker) under which to store the list.
        It defaults to ``None`` only to keep the original three-argument
        signature importable; a value is required.

    Raises
    ------
    ValueError
        If ``key`` is not supplied.
    """
    if key is None:
        # The earlier implementation read an undefined name here and raised
        # NameError; fail with an explicit message instead.
        raise ValueError("extract_missing_days needs a `key` to store the missing days under")
    missing_days_dict[key] = list(df1.index.difference(df2.index))


def probe_missing_data(df, missing_days_limit, missing_days_dict=None):
    """Align every stock series with the NYSE trading calendar.

    For each ticker, the NYSE schedule between the series' first and last date
    is compared with the series' index:

    * series that already match the calendar are left untouched;
    * series missing ``missing_days_limit`` or more trading days are removed;
    * remaining series are right-joined onto the NYSE trading days and the
      gaps are forward-filled.

    Series with no rows at all cannot be checked against a calendar and are
    removed as well.

    Parameters
    ----------
    df : dict
        Maps ticker -> price DataFrame with a ``DatetimeIndex``.
        **Modified in place** (entries replaced or deleted).
    missing_days_limit : int
        Number of missing trading days at which a ticker is dropped.
    missing_days_dict : dict, optional
        When given (even if empty), it receives ticker -> list of missing
        trading days for every ticker that did not match the calendar.

    Returns
    -------
    dict
        The same ``df`` object that was passed in.
    """
    tickers_to_drop = []
    nyse = mcal.get_calendar('NYSE')
    for key in list(df):
        stock_df = df[key]
        if stock_df.empty:
            # min()/max() of an empty index is NaT, which cannot be formatted.
            tickers_to_drop.append(key)
            continue

        start_date = pd.to_datetime(stock_df.index.min())
        end_date = pd.to_datetime(stock_df.index.max())

        # Frame with no columns whose index is the NYSE open days in range.
        nyse_open_days = nyse.schedule(start_date=start_date.strftime('%Y-%m-%d'),
                                       end_date=end_date.strftime('%Y-%m-%d'))
        df_nyse = pd.DataFrame(index=nyse_open_days.index)

        # Count the days that are actually absent rather than comparing row
        # counts: duplicate or non-trading-day rows would otherwise hide gaps.
        missing_days = df_nyse.index.difference(stock_df.index)
        if len(missing_days) == 0 and len(df_nyse) == stock_df.shape[0]:
            continue

        # `is not None` so that an empty dict supplied by the caller is filled.
        if missing_days_dict is not None:
            extract_missing_days(df_nyse, stock_df, missing_days_dict, key)

        if len(missing_days) >= missing_days_limit:
            tickers_to_drop.append(key)
        else:
            merged_df = stock_df.merge(df_nyse, how='right', right_index=True, left_index=True)
            # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
            df[key] = merged_df.ffill()

    for ticker in tickers_to_drop:
        del df[ticker]
    return df

# Clean the consumer-cyclical series with a limit of 100 missing trading days.
# probe_missing_data modifies the dictionary in place and returns that same
# dictionary, so the cell output below is the cleaned data.
probe_missing_data(consumer_cyclical_dfs, 100)
      

{'AAN':               Open    High     Low   Close     Volume  OpenInt
 2005-02-25  12.920  13.470  12.920  13.295   933743.0      0.0
 2005-02-28  13.371  13.470  13.118  13.381   683911.0      0.0
 2005-03-01  13.371  13.794  13.342  13.734   624585.0      0.0
 2005-03-02  13.607  14.038  13.568  13.822   387289.0      0.0
 2005-03-03  13.891  13.891  13.450  13.627   362061.0      0.0
 ...            ...     ...     ...     ...        ...      ...
 2017-11-06  36.280  36.570  35.570  35.590  1029506.0      0.0
 2017-11-07  35.720  35.720  34.290  34.440  1149793.0      0.0
 2017-11-08  35.510  36.090  34.800  35.990  1805497.0      0.0
 2017-11-09  35.680  35.860  35.230  35.620   979951.0      0.0
 2017-11-10  37.600  38.060  36.260  36.590  1214968.0      0.0
 
 [3202 rows x 6 columns],
 'ANF':               Open    High      Low   Close     Volume  OpenInt
 2005-02-25  44.731  44.928  44.3330  44.712   883668.0      0.0
 2005-02-28  44.722  44.936  43.8890  44.274  1057227.0     

Filtering data by years, deleting 'OpenInt' column since it has no information and adding a new column that computes daily returns

In [47]:
def filter_data_by_year(df_dict, start_year, end_year):
    """Keep only the tickers that cover a year range, trimmed to that range.

    A ticker is kept when its data starts in or before ``start_year`` and ends
    in or after ``end_year``; its frame is then restricted to rows whose year
    lies in ``[start_year, end_year]`` (both inclusive).

    Parameters
    ----------
    df_dict : dict
        Maps ticker -> DataFrame with a ``DatetimeIndex``. Not modified.
    start_year, end_year : int
        First and last calendar year to keep.

    Returns
    -------
    dict
        New dictionary mapping each retained ticker to an independent copy of
        the filtered rows.
    """
    new_df = {}
    for key, frame in df_dict.items():
        if frame.empty:
            # Nothing to compare against; an empty frame covers no year.
            continue
        years = frame.index.year
        if start_year >= years.min() and end_year <= years.max():
            # Filter on the year itself: a label slice ending at
            # Jan 1 of end_year + 1 is inclusive and would keep that day too.
            in_range = (years >= start_year) & (years <= end_year)
            # Copy so that later column assignments do not act on a view.
            new_df[key] = frame.loc[in_range].copy()
    return (new_df)
            
def data_modified_cols(new_df):
    """Drop the ``OpenInt`` column and add ``Daily_Returns`` to every frame.

    ``Daily_Returns`` is the percentage change of ``Close`` from the previous
    row, so the first row of each frame is NaN.

    Parameters
    ----------
    new_df : dict
        Maps ticker -> price DataFrame containing a ``Close`` column.
        Updated in place: each value is replaced by the transformed frame.

    Returns
    -------
    dict
        The same ``new_df`` object, returned for convenience.
    """
    for key in list(new_df):
        # drop() returns a new frame; its result must be kept, otherwise the
        # column is never removed. errors='ignore' keeps re-runs harmless.
        cleaned = new_df[key].drop(columns='OpenInt', errors='ignore')
        cleaned['Daily_Returns'] = cleaned['Close'].pct_change()
        new_df[key] = cleaned
    return new_df
        

# Restrict the consumer-cyclical data to 2005-2017.
# NOTE(review): the returned dictionary is only displayed, not assigned, and
# data_modified_cols is not called in this cell.
filter_data_by_year(consumer_cyclical_dfs, 2005,2017)


{'AAN':               Open    High     Low   Close     Volume  OpenInt
 2005-02-25  12.920  13.470  12.920  13.295   933743.0      0.0
 2005-02-28  13.371  13.470  13.118  13.381   683911.0      0.0
 2005-03-01  13.371  13.794  13.342  13.734   624585.0      0.0
 2005-03-02  13.607  14.038  13.568  13.822   387289.0      0.0
 2005-03-03  13.891  13.891  13.450  13.627   362061.0      0.0
 ...            ...     ...     ...     ...        ...      ...
 2017-11-06  36.280  36.570  35.570  35.590  1029506.0      0.0
 2017-11-07  35.720  35.720  34.290  34.440  1149793.0      0.0
 2017-11-08  35.510  36.090  34.800  35.990  1805497.0      0.0
 2017-11-09  35.680  35.860  35.230  35.620   979951.0      0.0
 2017-11-10  37.600  38.060  36.260  36.590  1214968.0      0.0
 
 [3202 rows x 6 columns],
 'ANF':               Open    High      Low   Close     Volume  OpenInt
 2005-02-25  44.731  44.928  44.3330  44.712   883668.0      0.0
 2005-02-28  44.722  44.936  43.8890  44.274  1057227.0     

Pulling revenue data for each company from https://www.macrotrends.net/stocks/ using web scraping 
and creating a dictionary with ticker name as key and revenue data as value.  

In [54]:
from urllib.error import HTTPError
def _macrotrends_slug(company_name):
    """Lower-case a company name and join its words with hyphens."""
    # NOTE(review): punctuation such as apostrophes is kept as-is; confirm
    # whether Macrotrends expects it stripped (e.g. "guess'-inc" fails).
    words = [word.lower() for word in re.split(' |, ', company_name) if word]
    return "-".join(words)


def fetchRevenueData(ticker_csv, missing_days_limit=100, request_timeout=10):
    """Scrape the quarterly revenue table of each ticker from Macrotrends.

    The tickers are loaded with ``create_dfs`` and cleaned with
    ``probe_missing_data``; for each remaining ticker one Macrotrends revenue
    URL is built from the ticker and its company name, probed with a HEAD
    request, and parsed with ``pandas.read_html``.

    Parameters
    ----------
    ticker_csv : str
        Path to the sector CSV passed on to ``create_dfs``.
    missing_days_limit : int, optional
        Limit forwarded to ``probe_missing_data`` (default 100).
    request_timeout : float, optional
        Timeout in seconds for the HEAD probe (default 10).

    Returns
    -------
    dict
        Maps ticker -> DataFrame holding the first table on the page that
        matches ``'Quarterly Revenue'``. Tickers whose page is missing,
        unreachable, or has no such table are omitted; failures are printed.
    """
    stock_df, cmpny_name = create_dfs(ticker_csv)
    stock_df = probe_missing_data(stock_df, missing_days_limit)
    ticker_revenue_dicts = {}

    for ticker_name in stock_df:
        # Build the complete slug first, then query once per ticker. Probing
        # inside the word loop skipped single-word names entirely and hit the
        # site once per name prefix for multi-word names.
        macrotrends_name = _macrotrends_slug(str(cmpny_name[ticker_name]))
        macrotrends_url = "https://www.macrotrends.net/stocks/charts/{}/{}/revenue".format(ticker_name, macrotrends_name)

        # Test whether the ticker/name combination exists on Macrotrends.
        try:
            response = requests.head(macrotrends_url, timeout=request_timeout)
        except requests.RequestException as e:
            print ("Error while retriving URL \"" + macrotrends_url + "\": " + str(e))
            continue
        if response.status_code == 404:
            continue

        # Fetch the revenue data using read_html().
        try:
            revenue = pd.read_html(macrotrends_url , match = 'Quarterly Revenue', flavor = 'bs4')[0]
        except HTTPError as e:
            print ("Error while retriving URL \"" + macrotrends_url + "\": " + str(e))
            continue
        except ValueError as e:
            # read_html raises ValueError when no table matches `match`.
            print ("No revenue table found at \"" + macrotrends_url + "\": " + str(e))
            continue
        ticker_revenue_dicts[ticker_name] = revenue

    return (ticker_revenue_dicts)

# Scrape revenue tables for the consumer-cyclical tickers; this issues HTTP
# requests to macrotrends.net, and the resulting dict is only displayed.
fetchRevenueData("consumer_cyclical_ticker.csv")

Error while retriving URL "https://www.macrotrends.net/stocks/charts/GES/guess'-inc/revenue": HTTP Error 301: The HTTP server returned a redirect error that would lead to an infinite loop.
The last 30x error message was:
Moved Permanently


{'AAN':   Aaron's Quarterly Revenue(Millions of US $)  \
 0                                  2021-09-30   
 1                                  2021-06-30   
 2                                  2021-03-31   
 3                                  2020-12-31   
 4                                  2020-09-30   
 5                                  2020-06-30   
 6                                  2020-03-31   
 7                                  2019-12-31   
 8                                  2018-12-31   
 9                                  2017-12-31   
 
   Aaron's Quarterly Revenue(Millions of US $).1  
 0                                          $452  
 1                                          $467  
 2                                          $481  
 3                                          $430  
 4                                          $441  
 5                                          $431  
 6                                          $433  
 7                               