# Capstone: Data Collection

## Datasets:
- 10-K from SEC
- Stock price from Yahoo Finance
- list of S&P500 tickers

In [1]:
# importing libraries
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# web scraping libraries
import requests
import time
import random
from bs4 import BeautifulSoup
import re

# library used to scrape the SEC for 10-k annual reports
from edgar import Company, TXTML, Edgar

#library for historical stock prices
import yfinance as yf
from datetime import timedelta
from dateutil.relativedelta import relativedelta

## Data collection

In [2]:
# loading dataset of S&P500 companies
# source: https://github.com/datasets/s-and-p-500-companies/blob/master/data/constituents.csv
df_sp = pd.read_csv('./datasets/S&P500.csv')

In [3]:
df_sp.head()

Unnamed: 0,Symbol,Name,Sector
0,MMM,3M Company,Industrials
1,AOS,A.O. Smith Corp,Industrials
2,ABT,Abbott Laboratories,Health Care
3,ABBV,AbbVie Inc.,Health Care
4,ABMD,ABIOMED Inc,Health Care


In [4]:
df_sp.columns = [x.lower() for x in df_sp.columns]

In [5]:
df_sp.head()

Unnamed: 0,symbol,name,sector
0,MMM,3M Company,Industrials
1,AOS,A.O. Smith Corp,Industrials
2,ABT,Abbott Laboratories,Health Care
3,ABBV,AbbVie Inc.,Health Care
4,ABMD,ABIOMED Inc,Health Care


In [6]:
len(df_sp)

505

In [7]:
# Checking for Companies that issue different classes of shares
# removing them as their 10-k would be the same
df_sp[df_sp['name'].str.contains('Class')]

Unnamed: 0,symbol,name,sector
24,GOOGL,Alphabet Inc. (Class A),Communication Services
25,GOOG,Alphabet Inc. (Class C),Communication Services
144,DISCA,Discovery Inc. (Class A),Communication Services
145,DISCK,Discovery Inc. (Class C),Communication Services
200,FOXA,Fox Corporation (Class A),Communication Services
201,FOX,Fox Corporation (Class B),Communication Services
333,NWSA,News Corp. Class A,Communication Services
334,NWS,News Corp. Class B,Communication Services
457,UAA,Under Armour (Class A),Consumer Discretionary
458,UA,Under Armour (Class C),Consumer Discretionary


In [8]:
# Rows to drop
# the SEC is not consistent with the ticker names,
rows_to_drop = [24,145,201,334,458]

In [9]:
# dropping in place
df_sp.drop(rows_to_drop, inplace=True)

In [10]:
# Reconciling the difference in length
drop = df_sp[(df_sp['name']=='Berkshire Hathaway') |
      (df_sp['name']=='Brown-Forman Corp.') | 
      (df_sp['name']=='CBRE Group') | 
      (df_sp['name']=='Evergy') | 
      (df_sp['name']=='First Republic Bank') | 
      (df_sp['name']=='ViacomCBS') |
      (df_sp['name']=='Ingersoll Rand')]

In [11]:
# index of rows to drop
drop.index

Int64Index([66, 78, 90, 175, 189, 246, 474], dtype='int64')

In [12]:
#remove companies not in the list
df_sp.drop(drop.index, inplace=True)

In [13]:
df_sp.shape

(493, 3)

In [14]:
# Scraping through the sec to get the CIK and name of each company
# using the ticker. names do not match the proper sec database names

list_name = []
list_cik = []

for x in df_sp['symbol']:
    url = 'https://www.sec.gov/cgi-bin/browse-edgar?CIK=' + str(x) + '&type=10-K&dateb=&owner=include&count=40'
    res = requests.get(url, headers={'User-agent': 'Pony Inc 1.0'})
    res.status_code
    soup = BeautifulSoup(res.text, 'lxml')    
            
    for row in soup.find_all('span', {'class':'companyName'}):
        list_name.append(row.text.split(' CIK#: ')[0])
        list_cik.append(row.text.split(' CIK#: ')[1][:10])

In [15]:
# Reconciling the difference in length
# Due to companies delisting or merging etc.
print(len(df_sp))
print(len(list_name))
print(len(list_cik))

493
493
493


In [16]:
df_sp.shape

(493, 3)

In [17]:
# appending list_name and list_cik to df_sp
df_sp['name_clean'] = list_name
df_sp['cik'] = list_cik

df_sp.head()

Unnamed: 0,symbol,name,sector,name_clean,cik
0,MMM,3M Company,Industrials,3M CO,66740
1,AOS,A.O. Smith Corp,Industrials,SMITH A O CORP,91142
2,ABT,Abbott Laboratories,Health Care,ABBOTT LABORATORIES,1800
3,ABBV,AbbVie Inc.,Health Care,AbbVie Inc.,1551152
4,ABMD,ABIOMED Inc,Health Care,ABIOMED INC,815094


In [18]:
#checking the shape of the new df_sp
df_sp.shape

(493, 5)

## Scraping the dates of each 10-K

In [19]:
#Scraping through the sec to get each date of 10-K filed for each company
ticker = []
dates = []


for x in df_sp['symbol']:
    url = 'https://www.sec.gov/cgi-bin/browse-edgar?CIK=' + str(x) + '&type=10-K&dateb=&owner=include&count=40'
    res = requests.get(url, headers={'User-agent': 'Pony Inc 1.0'})
    res.status_code
    soup = BeautifulSoup(res.text, 'lxml')
    
    table = soup.find('table', {'class':'tableFile2'})
    rows= table.find_all('tr')[1:]
    i = 0
    for row in rows:
        if len(rows) < 6:
            break
        elif len(rows) >= 6:    
            cells = row.find_all('td')
            filing = {}

            if i < 6:
                for cell in cells[3]:
                    ticker.append(x)
                    dates.append(cell)
                    i += 1
            elif i > 6:
                break

In [20]:
# getting rid of companies without 6 annual reports on the SEC
df_sp = df_sp[df_sp['symbol'].isin(np.unique(ticker))]

In [21]:
#checking the shape of df_sp
df_sp.shape

(468, 5)

In [22]:
#checking the first few rows of df_sp
df_sp.head()

Unnamed: 0,symbol,name,sector,name_clean,cik
0,MMM,3M Company,Industrials,3M CO,66740
1,AOS,A.O. Smith Corp,Industrials,SMITH A O CORP,91142
2,ABT,Abbott Laboratories,Health Care,ABBOTT LABORATORIES,1800
3,ABBV,AbbVie Inc.,Health Care,AbbVie Inc.,1551152
4,ABMD,ABIOMED Inc,Health Care,ABIOMED INC,815094


In [23]:
#checking first 10 tickers
ticker[:10]

['MMM', 'MMM', 'MMM', 'MMM', 'MMM', 'MMM', 'AOS', 'AOS', 'AOS', 'AOS']

In [24]:
#checking first 10 dates
dates[:10]

['2020-02-06',
 '2019-02-07',
 '2018-02-08',
 '2017-02-09',
 '2016-02-11',
 '2015-02-12',
 '2020-02-24',
 '2019-02-15',
 '2018-02-16',
 '2017-02-17']

In [25]:
df_sp_1 = df_sp[:int(len(df_sp)/2)]
df_sp_2 = df_sp[int(len(df_sp)/2):]

## Scraping 10-K reports from the SEC

In [26]:
#scraping the SEC for the 10-K of each company
reports_10K = []
reports_name = []

for name, cik in zip(df_sp_1['name_clean'], df_sp_1['cik']):
    company = Company(name, cik)
    docs = company.get_10Ks(no_of_documents=6)
    #appending a blank row because the latest 2020 report will not be used
    reports_10K.append('')
    reports_name.append(name)
    
    # generate a random sleep duration to look more 'natural'
    sleep_duration = random.randint(15,30)
    print(sleep_duration)
    time.sleep(sleep_duration) #program sleeps for the sleep_duration
    
    for doc in docs[1:]:
        text = TXTML.parse_full_10K(doc)    
        reports_10K.append(text)
        reports_name.append(name)
        print(name)

24
3M CO
3M CO
3M CO
3M CO
3M CO
20
SMITH A O CORP
SMITH A O CORP
SMITH A O CORP
SMITH A O CORP
SMITH A O CORP
26
ABBOTT LABORATORIES
ABBOTT LABORATORIES
ABBOTT LABORATORIES
ABBOTT LABORATORIES
ABBOTT LABORATORIES
18
AbbVie Inc.
AbbVie Inc.
AbbVie Inc.
AbbVie Inc.
AbbVie Inc.
24
ABIOMED INC
ABIOMED INC
ABIOMED INC
ABIOMED INC
ABIOMED INC
19
Accenture plc
Accenture plc
Accenture plc
Accenture plc
Accenture plc
26
Activision Blizzard, Inc.
Activision Blizzard, Inc.
Activision Blizzard, Inc.
Activision Blizzard, Inc.
Activision Blizzard, Inc.
28
ADOBE INC.
ADOBE INC.
ADOBE INC.
ADOBE INC.
ADOBE INC.
18
ADVANCE AUTO PARTS INC
ADVANCE AUTO PARTS INC
ADVANCE AUTO PARTS INC
ADVANCE AUTO PARTS INC
ADVANCE AUTO PARTS INC
26
ADVANCED MICRO DEVICES INC
ADVANCED MICRO DEVICES INC
ADVANCED MICRO DEVICES INC
ADVANCED MICRO DEVICES INC
ADVANCED MICRO DEVICES INC
16
AES CORP
AES CORP
AES CORP
AES CORP
AES CORP
29
AFLAC INC
AFLAC INC
AFLAC INC
AFLAC INC
AFLAC INC
25
AGILENT TECHNOLOGIES, INC.
AGILENT T

17
CARNIVAL CORP
CARNIVAL CORP
CARNIVAL CORP
CARNIVAL CORP
CARNIVAL CORP
30
CATERPILLAR INC
CATERPILLAR INC
CATERPILLAR INC
CATERPILLAR INC
CATERPILLAR INC
21
Cboe Global Markets, Inc.
Cboe Global Markets, Inc.
Cboe Global Markets, Inc.
Cboe Global Markets, Inc.
Cboe Global Markets, Inc.
20
CDW Corp
CDW Corp
CDW Corp
CDW Corp
CDW Corp
22
Celanese Corp
Celanese Corp
Celanese Corp
Celanese Corp
Celanese Corp
30
CENTENE CORP
CENTENE CORP
CENTENE CORP
CENTENE CORP
CENTENE CORP
23
CENTERPOINT ENERGY INC
CENTERPOINT ENERGY INC
CENTERPOINT ENERGY INC
CENTERPOINT ENERGY INC
CENTERPOINT ENERGY INC
16
CENTURYLINK, INC
CENTURYLINK, INC
CENTURYLINK, INC
CENTURYLINK, INC
CENTURYLINK, INC
25
CERNER Corp
CERNER Corp
CERNER Corp
CERNER Corp
CERNER Corp
30
CF Industries Holdings, Inc.
CF Industries Holdings, Inc.
CF Industries Holdings, Inc.
CF Industries Holdings, Inc.
CF Industries Holdings, Inc.
15
SCHWAB CHARLES CORP
SCHWAB CHARLES CORP
SCHWAB CHARLES CORP
SCHWAB CHARLES CORP
SCHWAB CHARLES CORP
19

19
Extra Space Storage Inc.
Extra Space Storage Inc.
Extra Space Storage Inc.
Extra Space Storage Inc.
Extra Space Storage Inc.
20
EXXON MOBIL CORP
EXXON MOBIL CORP
EXXON MOBIL CORP
EXXON MOBIL CORP
EXXON MOBIL CORP
18
F5 NETWORKS, INC.
F5 NETWORKS, INC.
F5 NETWORKS, INC.
F5 NETWORKS, INC.
F5 NETWORKS, INC.
28
Facebook Inc
Facebook Inc
Facebook Inc
Facebook Inc
Facebook Inc
25
FASTENAL CO
FASTENAL CO
FASTENAL CO
FASTENAL CO
FASTENAL CO
16
FEDERAL REALTY INVESTMENT TRUST
FEDERAL REALTY INVESTMENT TRUST
FEDERAL REALTY INVESTMENT TRUST
FEDERAL REALTY INVESTMENT TRUST
FEDERAL REALTY INVESTMENT TRUST
24
FEDEX CORP
FEDEX CORP
FEDEX CORP
FEDEX CORP
FEDEX CORP
15
Fidelity National Information Services, Inc.
Fidelity National Information Services, Inc.
Fidelity National Information Services, Inc.
Fidelity National Information Services, Inc.
Fidelity National Information Services, Inc.
25
FIFTH THIRD BANCORP
FIFTH THIRD BANCORP
FIFTH THIRD BANCORP
FIFTH THIRD BANCORP
FIFTH THIRD BANCORP
15
FIRST

In [27]:
#scraping the SEC for the 10-K of each company part 2
for name, cik in zip(df_sp_2['name_clean'], df_sp_2['cik']):
    company = Company(name, cik)
    docs = company.get_10Ks(no_of_documents=6)
    #appending a blank row because the latest 2020 report will not be used
    reports_10K.append('')
    reports_name.append(name)
    
    # generate a random sleep duration to look more 'natural'
    sleep_duration = random.randint(15,30)
    print(sleep_duration)
    time.sleep(sleep_duration) #program sleeps for the sleep_duration
    
    for doc in docs[1:]:
        text = TXTML.parse_full_10K(doc)    
        reports_10K.append(text)
        reports_name.append(name)
        print(name)

17
IPG PHOTONICS CORP
IPG PHOTONICS CORP
IPG PHOTONICS CORP
IPG PHOTONICS CORP
IPG PHOTONICS CORP
16
IQVIA HOLDINGS INC.
IQVIA HOLDINGS INC.
IQVIA HOLDINGS INC.
IQVIA HOLDINGS INC.
IQVIA HOLDINGS INC.
25
IRON MOUNTAIN INC
IRON MOUNTAIN INC
IRON MOUNTAIN INC
IRON MOUNTAIN INC
IRON MOUNTAIN INC
25
HUNT J B TRANSPORT SERVICES INC
HUNT J B TRANSPORT SERVICES INC
HUNT J B TRANSPORT SERVICES INC
HUNT J B TRANSPORT SERVICES INC
HUNT J B TRANSPORT SERVICES INC
16
HENRY JACK & ASSOCIATES INC
HENRY JACK & ASSOCIATES INC
HENRY JACK & ASSOCIATES INC
HENRY JACK & ASSOCIATES INC
HENRY JACK & ASSOCIATES INC
15
JACOBS ENGINEERING GROUP INC /DE/
JACOBS ENGINEERING GROUP INC /DE/
JACOBS ENGINEERING GROUP INC /DE/
JACOBS ENGINEERING GROUP INC /DE/
JACOBS ENGINEERING GROUP INC /DE/
16
J M SMUCKER Co
J M SMUCKER Co
J M SMUCKER Co
J M SMUCKER Co
J M SMUCKER Co
16
JOHNSON & JOHNSON
JOHNSON & JOHNSON
JOHNSON & JOHNSON
JOHNSON & JOHNSON
JOHNSON & JOHNSON
29
Johnson Controls International plc
Johnson Controls I

19
NOBLE ENERGY INC
NOBLE ENERGY INC
NOBLE ENERGY INC
NOBLE ENERGY INC
NOBLE ENERGY INC
27
NORDSTROM INC
NORDSTROM INC
NORDSTROM INC
NORDSTROM INC
NORDSTROM INC
28
NORFOLK SOUTHERN CORP
NORFOLK SOUTHERN CORP
NORFOLK SOUTHERN CORP
NORFOLK SOUTHERN CORP
NORFOLK SOUTHERN CORP
28
NORTHERN TRUST CORP
NORTHERN TRUST CORP
NORTHERN TRUST CORP
NORTHERN TRUST CORP
NORTHERN TRUST CORP
19
NORTHROP GRUMMAN CORP /DE/
NORTHROP GRUMMAN CORP /DE/
NORTHROP GRUMMAN CORP /DE/
NORTHROP GRUMMAN CORP /DE/
NORTHROP GRUMMAN CORP /DE/
20
NortonLifeLock Inc.
NortonLifeLock Inc.
NortonLifeLock Inc.
NortonLifeLock Inc.
NortonLifeLock Inc.
26
Norwegian Cruise Line Holdings Ltd.
Norwegian Cruise Line Holdings Ltd.
Norwegian Cruise Line Holdings Ltd.
Norwegian Cruise Line Holdings Ltd.
Norwegian Cruise Line Holdings Ltd.
19
NRG ENERGY, INC.
NRG ENERGY, INC.
NRG ENERGY, INC.
NRG ENERGY, INC.
NRG ENERGY, INC.
23
NUCOR CORP
NUCOR CORP
NUCOR CORP
NUCOR CORP
NUCOR CORP
15
NVIDIA CORP
NVIDIA CORP
NVIDIA CORP
NVIDIA CORP
NV

17
STANLEY BLACK & DECKER, INC.
STANLEY BLACK & DECKER, INC.
STANLEY BLACK & DECKER, INC.
STANLEY BLACK & DECKER, INC.
STANLEY BLACK & DECKER, INC.
24
STARBUCKS CORP
STARBUCKS CORP
STARBUCKS CORP
STARBUCKS CORP
STARBUCKS CORP
24
STATE STREET CORP
STATE STREET CORP
STATE STREET CORP
STATE STREET CORP
STATE STREET CORP
27
STRYKER CORP
STRYKER CORP
STRYKER CORP
STRYKER CORP
STRYKER CORP
18
SVB FINANCIAL GROUP
SVB FINANCIAL GROUP
SVB FINANCIAL GROUP
SVB FINANCIAL GROUP
SVB FINANCIAL GROUP
23
Synchrony Financial
Synchrony Financial
Synchrony Financial
Synchrony Financial
Synchrony Financial
23
SYNOPSYS INC
SYNOPSYS INC
SYNOPSYS INC
SYNOPSYS INC
SYNOPSYS INC
24
SYSCO CORP
SYSCO CORP
SYSCO CORP
SYSCO CORP
SYSCO CORP
27
T-Mobile US, Inc.
T-Mobile US, Inc.
T-Mobile US, Inc.
T-Mobile US, Inc.
T-Mobile US, Inc.
19
PRICE T ROWE GROUP INC
PRICE T ROWE GROUP INC
PRICE T ROWE GROUP INC
PRICE T ROWE GROUP INC
PRICE T ROWE GROUP INC
25
TAKE TWO INTERACTIVE SOFTWARE INC
TAKE TWO INTERACTIVE SOFTWARE INC

In [28]:
len(reports_name)

2808

In [32]:
# Cleaning the reports
reports_10K = [x.replace('\t','').replace('\n','').replace('\xa0','').replace('x97', '') for x in reports_10K]

In [33]:
pd.DataFrame(reports_name).to_csv('./datasets/reports_name.csv')

In [34]:
pd.DataFrame(reports_10K).to_csv('./datasets/reports_10K.csv')

In [35]:
# creating a new df
df = pd.DataFrame(reports_name, columns=['name'])
df['ticker'] = ticker
df['dates'] = dates

In [36]:
df['dates'] = pd.to_datetime(df['dates'].astype('str'))

In [37]:
df.head(50)

Unnamed: 0,name,ticker,dates
0,3M CO,MMM,2020-02-06
1,3M CO,MMM,2019-02-07
2,3M CO,MMM,2018-02-08
3,3M CO,MMM,2017-02-09
4,3M CO,MMM,2016-02-11
5,3M CO,MMM,2015-02-12
6,SMITH A O CORP,AOS,2020-02-24
7,SMITH A O CORP,AOS,2019-02-15
8,SMITH A O CORP,AOS,2018-02-16
9,SMITH A O CORP,AOS,2017-02-17


## Scraping Stock price from Yahoo Finance

In [38]:
# getting the closing price on the day of annual report release
price_on_date = []

for ticker, dates in zip(df['ticker'], df['dates']):
    offset = timedelta(days=1)
    try:
        price_on_date.append(yf.Ticker(ticker).history(start=str(dates.date()), 
                                              end=str((dates.date()+offset)))['Close'][dates.date()])
    except:
        price_on_date.append(np.NaN)
        pass

- HWM: No data found for this date range, symbol may be delisted
- HWM: No data found for this date range, symbol may be delisted
- HWM: No data found for this date range, symbol may be delisted
- HWM: No data found for this date range, symbol may be delisted
- HWM: No data found for this date range, symbol may be delisted
- NLOK: No data found for this date range, symbol may be delisted
- NLOK: No data found for this date range, symbol may be delisted
- TT: No data found for this date range, symbol may be delisted
- TT: No data found for this date range, symbol may be delisted
- TT: No data found for this date range, symbol may be delisted


In [39]:
# getting the closing price on the day of annual report release
sp_on_date = []
for dates in df['dates']:
    offset = timedelta(days=1)
    try:
        sp_on_date.append(yf.Ticker('^GSPC').history(start=str(dates.date()+offset), 
                                              end=str(dates.date()+offset))['Close'][dates])
    except:
        sp_on_date.append(np.NaN)
        pass

- ^GSPC: No data found for this date range, symbol may be delisted
- ^GSPC: No data found for this date range, symbol may be delisted
- ^GSPC: No data found for this date range, symbol may be delisted
- ^GSPC: No data found for this date range, symbol may be delisted


In [40]:
print(len(price_on_date))
print(len(sp_on_date))

2808
2808


## Compiling into a dataframe

In [41]:
#creating new columns
df['price_on_date'] = price_on_date

# creating columns of difference in stock price
# from date of 1 annual report to the next
df['price_change_next_yr'] = df['price_on_date'].diff()*(-1)
df['price_change_next_yr'][::6] = np.NaN

df['sp_on_date'] = sp_on_date

# creating columns of difference in S&P Index price
# from date of 1 annual report to the next
df['sp_price_change_next_year'] = df['sp_on_date'].diff()*(-1)
df['sp_price_change_next_year'][::6] = np.NaN

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [42]:
df['reports'] = reports_10K

In [43]:
#checking the newly appeneded columns
df.isnull().sum()

name                           0
ticker                         0
dates                          0
price_on_date                 14
price_change_next_yr         488
sp_on_date                     4
sp_price_change_next_year    476
reports                        0
dtype: int64

In [44]:
df.describe()

Unnamed: 0,price_on_date,price_change_next_yr,sp_on_date,sp_price_change_next_year
count,2794.0,2320.0,2804.0,2332.0
mean,103.228114,12.712384,2515.896958,218.067719
std,175.135896,55.290577,431.676773,248.793395
min,-436.16,-459.21,1829.08,-691.65
25%,41.3925,-2.8325,2109.66,44.22
50%,67.315,5.905,2545.94,284.58
75%,112.385,18.4,2784.49,415.59
max,3965.95,1365.15,3386.15,804.6


In [45]:
df.head(10)

Unnamed: 0,name,ticker,dates,price_on_date,price_change_next_yr,sp_on_date,sp_price_change_next_year,reports
0,3M CO,MMM,2020-02-06,159.84,,3345.78,,
1,3M CO,MMM,2019-02-07,189.71,-29.87,2706.05,639.73,mmm_Current_Folio_10KlowUNITED STATESSECURITIE...
2,3M CO,MMM,2018-02-08,206.12,-16.41,2581.0,125.05,mmm_Current_Folio_10KTable of ContentslowUNITE...
3,3M CO,MMM,2017-02-09,161.0,45.12,2307.87,273.13,mmm_Current_Folio_10KTable of ContentslowUNITE...
4,3M CO,MMM,2016-02-11,133.54,27.46,1829.08,478.79,mmm_Current_Folio_10KTable of ContentslowUNITE...
5,3M CO,MMM,2015-02-12,143.14,-9.6,2088.48,-259.4,Table of ContentsUNITED STATESSECURITIES AND...
6,SMITH A O CORP,AOS,2020-02-24,42.6,,3225.89,,
7,SMITH A O CORP,AOS,2019-02-15,50.37,-7.77,2775.6,450.29,Form 10-K Table of Contents UNITED STATES SEC...
8,SMITH A O CORP,AOS,2018-02-16,62.93,-12.56,2732.22,43.38,Form 10-K Table of Contents UNITED STATES SEC...
9,SMITH A O CORP,AOS,2017-02-17,47.13,15.8,2351.16,381.06,Form 10-K Table of Contents UNITED STATES SEC...


In [46]:
df.shape

(2808, 8)

In [47]:
df.to_csv('./datasets/df.csv', index=False)