# Data Scraping
Retrieve Stock Data from Yahoo Finance
+ The company list comes from NASDAQ.com
+ Number of IT firms: 617

In [None]:
# !pip install pandas-datareader
# !pip install fix-yahoo-finance

In [1]:
import pandas as pd
from pandas_datareader import data as pdr
import fix_yahoo_finance
import os

# Section 1: Read in list of stock company details

In [64]:
# Folder that holds the per-symbol CSV files of retrieved stock data
path = './stock_data/'

# Load the company details and pull out the ticker symbols that will be
# requested from Yahoo Finance
stock_symbols = pd.read_csv('companylist.csv')
list_of_stocks = stock_symbols['Symbol'].tolist()

# Section 2: Scrape stock data from Yahoo Finance

In [None]:
# Create the folder that stores the retrieved stock data. makedirs with
# exist_ok=True tests and creates the same `path` in a single call, instead of
# testing 'stock_data' and then creating `path` as two separate steps.
os.makedirs(path, exist_ok=True)

# Scraping code for Yahoo Finance: download the price history of every symbol
# and save it as <path><symbol>.csv. A symbol whose download or save fails is
# counted and reported together with the underlying error, then skipped, so
# one failing symbol does not abort the whole run.
dont_exist = 0
counter = 0  # stays 0 when list_of_stocks is empty

# Printed once before the loop, mirroring the single "end" printed after it.
print("beginning...")
for counter, symbol in enumerate(list_of_stocks, start=1):
    # Progress heartbeat every 50 symbols
    if counter % 50 == 0:
        print(f"still running... currently at {counter}th row, {symbol}")

    try:
        data = pdr.get_data_yahoo(symbol, start='2016-02-18', end='2019-02-22')  # returns 1 additional day before start_date
        data.to_csv(path + symbol + '.csv')
    except Exception as err:
        # Deliberately best-effort: keep going, but show why the symbol failed
        # so that e.g. a network error is not mistaken for a missing symbol.
        dont_exist += 1
        print(f"{counter} {symbol} does not exist ({type(err).__name__}: {err})")

print("end")
print(f"{dont_exist} stock symbols do not exist")  # 7

In [76]:
# Look at items in the folder: collect the symbol of every CSV file in `path`
import glob
file_paths = glob.glob(path + '*.csv')

# Derive each symbol from its file name with os.path so the result does not
# depend on the platform's path separator. splitext strips only the trailing
# '.csv', so a symbol that itself contains a dot is kept whole and still
# matches the file name when it is re-read as path + symbol + '.csv'.
filtered_list = [os.path.splitext(os.path.basename(f))[0] for f in file_paths]

print(len(filtered_list))

610


# Section 3: Data Cleaning
Remove the CSV files of companies that do not have a full 3 years' worth of stock data

+ Date Range: 2016-02-17 to 2019-02-22
+ NASDAQ has an average of 253 trading days per year, so each company is expected to have 253 * 3 + 1 (leap day in 2016) = 760 rows

In [90]:
# Row count a complete history must have: an average of 253 trading days per
# year * 3 years + 1 leap day in 2016 = 760
expected_rows = 760

removed = 0
remained = 0

# Keep only the CSV files that hold exactly the expected number of rows;
# delete every other file from the data folder.
for symbol in filtered_list:
    csv_file = path + symbol + '.csv'
    curr = pd.read_csv(csv_file)

    if len(curr) == expected_rows:
        remained += 1
        continue

    removed += 1
    os.remove(csv_file)

print(f"{removed} companies removed")
print(f"{remained} companies remaining")

121 companies removed
489 companies remaining


In [93]:
# Look at items in the folder. update list_of_stocks
file_paths = glob.glob(path + '*.csv' )
len(file_paths)

filtered_list = []

for f in file_paths:
    filtered_list.append( (f.split("\\")[1]).split(".")[0] )
    
print(len(filtered_list))
# print(filtered_list)

489


### Export remaining list of stock company symbols
This list will be used in the 'Stock Prediction Model' Jupyter notebook.

In [94]:
# write the updated stock company list into a csv file for easy future access
file = open('updated_companylist.csv','w')

for each in filtered_list:
    file.write(each)
    file.write('\n')

file.close()
print("done")

done
