In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import requests
from urllib import parse
from bs4.element import Comment
import re
import string


In [2]:
#url irs.gov webpage with links to text instructions for all forms
url = "https://www.irs.gov/instructions"
res = requests.get(url)

In [3]:
#clean up webpage and only extract table with pdf links to all forms
soup = BeautifulSoup(res.content, 'lxml')
table = soup.find('div', {'class': 'table-responsive'})

In [4]:
df = pd.read_html(str(table))[0]


In [5]:
len(df)

367

In [6]:
#make sure we're only gathering links to description/instructions for each form; a pdf format is unable to be saved as a csv file
links = []

#this method is selecting random links at every 20 elements
for anchor in table.findAll('a'):
    href = anchor['href']
    if href.endswith('pdf'):
        continue
    links.append(href)

selected_links = links[::20]

    

In [36]:
#here we're looking for specific forms to match with, given these are the most common forms for an average citizen
reg = 'W\-7'
matchers = ['941','1041', 'w7', 'w8']
relevant_links = [s for s in links if any(xs in s for xs in matchers)]


In [38]:
len(relevant_links)

23

In [39]:
#confirm that slice is incrementing links every 25
print(len(links))
print(len(relevant_links))

367
23


In [41]:
text = []
for link in relevant_links:
    res = requests.get(link)
    soup = BeautifulSoup(res.content, 'lxml')
    content = soup.find('div', {'class': 'book'})
    article = content.text
    text.append(article)

In [42]:
text

['\n\n\nInstructions for Form 941 (01/2020)\n\nEmployer\'s QUARTERLY Federal Tax Return\n\n\n\nSection references are to the Internal Revenue Code unless otherwise noted.\n\n\n\nRevised: 01/2020\n\n\n\n\n\n\n\nInstructions for Form 941 - Introductory Material\n\n\n\n\n\n\nFuture Developments\nFor the latest information about developments related to Form 941 and its instructions, such as legislation enacted after they were published, go to IRS.gov/Form941.\n\n\n\xa0\nWhat\'s New\n\n2020 withholding tables.\nThe federal income tax withholding tables are now included in Pub. 15-T, Federal Income Tax Withholding Methods.\n\n\nSocial security and Medicare tax for 2020.\nThe social security tax rate is 6.2% each for the employee and employer, unchanged from 2019. The social security wage base limit is $137,700.\nThe Medicare tax rate is 1.45% each for the employee and employer, unchanged from 2019. There is no wage base limit for Medicare tax.\nSocial security and Medicare taxes apply to the

In [43]:
for t in range(len(text)):
    text[t] = text[t].replace("\n", " ")

In [44]:
for t in range(len(text)):
    text[t].replace("\ ", "")

In [45]:
for t in range(len(text)):
    text[t].replace(u'\xa0', u'')

In [46]:
#verify text appears cleaner
text

['   Instructions for Form 941 (01/2020)  Employer\'s QUARTERLY Federal Tax Return    Section references are to the Internal Revenue Code unless otherwise noted.    Revised: 01/2020        Instructions for Form 941 - Introductory Material       Future Developments For the latest information about developments related to Form 941 and its instructions, such as legislation enacted after they were published, go to IRS.gov/Form941.   \xa0 What\'s New  2020 withholding tables. The federal income tax withholding tables are now included in Pub. 15-T, Federal Income Tax Withholding Methods.   Social security and Medicare tax for 2020. The social security tax rate is 6.2% each for the employee and employer, unchanged from 2019. The social security wage base limit is $137,700. The Medicare tax rate is 1.45% each for the employee and employer, unchanged from 2019. There is no wage base limit for Medicare tax. Social security and Medicare taxes apply to the wages of household workers you pay $2,200

In [47]:
n = df.columns[1]

In [48]:
df.drop(n, axis = 1, inplace = True)

In [51]:
df.head()

Unnamed: 0,Title,Download PDF
0,Instructions for Form 56 (12/2019),i56.pdf
1,Instructions for Form 461 (2019),i461.pdf
2,Instructions for Form 706 (08/2019),i706.pdf
3,Instructions for Form 706-A (08/2019),i706a.pdf
4,Instructions for Form 706-D (12/2008),i706d.pdf


In [59]:

# list_of_selected_links = df.Title[::20]
# selected_df = df[df.Title.isin(list_of_selected_links)]
series = df["Download PDF"].str.contains('941|1041|w7|w8')
# df[df['Download PDF'].str.match('941|1041|w7|w8')== True]
relevant_df = df[series]

In [60]:
relevant_df[n] = text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [61]:
relevant_df.drop("Download PDF", axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [63]:
relevant_df.to_csv("specific_irs_forms.csv")