In [1]:
import pandas as pd
import numpy as np

# Create list of columns to use
cols = ["zipcode", "agi_stub", "mars1", "MARS2", "NUMDEP"]

# Create dataframe from csv using only selected columns
data = pd.read_csv("vt_tax_data_2016-Data Engineering.csv", usecols=cols)

# View counts of dependents and tax returns by income level
print(data.groupby("agi_stub").sum())

          zipcode   mars1  MARS2  NUMDEP
agi_stub                                
1         1439444  170320  28480   52490
2         1439444  104000  37690   64660
3         1439444   39160  45390   47330
4         1439444   11670  44410   37760
5         1439444    7820  67750   60730
6         1439444    1210  16340   16300


In [2]:
# Use nrows and skiprows to make a dataframe, vt_data_next500, with the next 500 rows.
# Set the header argument so that pandas knows there is no header row.
# Name the columns in vt_data_next500 by supplying a list of vt_data_first500's columns to the names argument.




In [3]:
# Create dataframe of next 500 rows with labeled columns
vt_data_first500 = pd.read_csv("vt_tax_data_2016-Data Engineering.csv")
vt_data_next500 = pd.read_csv("vt_tax_data_2016-Data Engineering.csv", 
                       nrows=500, skiprows=500,
                        header= None,
                       names=list(vt_data_first500))

# View the Vermont dataframes to confirm they're different
print(vt_data_first500.head())
print(vt_data_next500.head())

   STATEFIPS STATE  zipcode  agi_stub      N1  mars1  MARS2  MARS4   PREP  \
0         50    VT        0         1  111580  85090  14170  10740  45360   
1         50    VT        0         2   82760  51960  18820  11310  35600   
2         50    VT        0         3   46270  19540  22650   3620  24140   
3         50    VT        0         4   30070   5830  22190    960  16060   
4         50    VT        0         5   39530   3900  33800    590  22500   

       N2  ...  N10300  A10300  N85530  A85530  N85300  A85300  N11901  \
0  130630  ...   53660   50699       0       0       0       0   10820   
1  132950  ...   74340  221146       0       0       0       0   12820   
2   91870  ...   44860  266097       0       0       0       0   10810   
3   71610  ...   29580  264678       0       0       0       0    7320   
4  103710  ...   39170  731963      40      24       0       0   12500   

   A11901  N11902  A11902  
0    9734   88260  138337  
1   20029   68760  151729  
2   2449

In [4]:
# Load vt_tax_data_2016.csv with no arguments and view the dataframe's dtypes attribute.
# Note the data types of zipcode and agi_stub.

# Load csv with no additional arguments
data = pd.read_csv("vt_tax_data_2016-Data Engineering.csv")

# Print the data types
print(data.dtypes)

STATEFIPS     int64
STATE        object
zipcode       int64
agi_stub      int64
N1            int64
              ...  
A85300        int64
N11901        int64
A11901        int64
N11902        int64
A11902        int64
Length: 147, dtype: object


In [5]:
# Create dict specifying data types for agi_stub and zipcode
data_types = {'agi_stub':'category',  'zipcode':'str'}

# Load csv using dtype to set correct data types
data = pd.read_csv("vt_tax_data_2016.csv", dtype = data_types)

# Print data types of resulting frame
print(data.dtypes.head())

STATEFIPS       int64
STATE          object
zipcode        object
agi_stub     category
N1              int64
dtype: object


In [6]:
# Create dict specifying that 0s in zipcode are NA values
null_values = {'zipcode':0}

# Load csv using na_values keyword argument
data = pd.read_csv("vt_tax_data_2016-Data Engineering.csv", 
                   na_values = null_values)

# View rows with NA ZIP codes
print(data[data.zipcode.isna()])

   STATEFIPS STATE  zipcode  agi_stub      N1  mars1  MARS2  MARS4   PREP  \
0         50    VT      NaN         1  111580  85090  14170  10740  45360   
1         50    VT      NaN         2   82760  51960  18820  11310  35600   
2         50    VT      NaN         3   46270  19540  22650   3620  24140   
3         50    VT      NaN         4   30070   5830  22190    960  16060   
4         50    VT      NaN         5   39530   3900  33800    590  22500   
5         50    VT      NaN         6    9620    600   8150      0   7040   

       N2  ...  N10300  A10300  N85530  A85530  N85300  A85300  N11901  \
0  130630  ...   53660   50699       0       0       0       0   10820   
1  132950  ...   74340  221146       0       0       0       0   12820   
2   91870  ...   44860  266097       0       0       0       0   10810   
3   71610  ...   29580  264678       0       0       0       0    7320   
4  103710  ...   39170  731963      40      24       0       0   12500   
5   26430  ...  

In [7]:
try:
  # Import the CSV without any keyword arguments
  data = pd.read_csv('vt_tax_data_2016.csv')
  
  # View first 5 records
  print(data.head())
  
except pd.errors.ParserError:
    print("Your data contained rows that could not be parsed.")

   STATEFIPS STATE  zipcode  agi_stub      N1  mars1  MARS2  MARS4   PREP  \
0         50    VT        0         1  111580  85090  14170  10740  45360   
1         50    VT        0         2   82760  51960  18820  11310  35600   
2         50    VT        0         3   46270  19540  22650   3620  24140   
3         50    VT        0         4   30070   5830  22190    960  16060   
4         50    VT        0         5   39530   3900  33800    590  22500   

       N2  ...  N10300  A10300  N85530  A85530  N85300  A85300  N11901  \
0  130630  ...   53660   50699       0       0       0       0   10820   
1  132950  ...   74340  221146       0       0       0       0   12820   
2   91870  ...   44860  266097       0       0       0       0   10810   
3   71610  ...   29580  264678       0       0       0       0    7320   
4  103710  ...   39170  731963      40      24       0       0   12500   

   A11901  N11902  A11902  
0    9734   88260  138337  
1   20029   68760  151729  
2   2449

In [8]:
# try:
#   # Import CSV with error_bad_lines set to skip bad records
#   data = pd.read_csv("vt_tax_data_2016_corrupt.csv", 
#                      error_bad_lines = False)
  
#   # View first 5 records
#   print(data.head())
  
# except pd.errors.ParserError:
#     print("Your data contained rows that could not be parsed.")

In [9]:
try:
  # Set warn_bad_lines to issue warnings about bad records
#   data = pd.read_csv("vt_tax_data_2016_corrupt.csv", 
#                      error_bad_lines=False, 
#                      warn_bad_lines = True)
  
#   # View first 5 records
  print(data.head())
  
# except pd.errors.ParserError:
#     print("Your data contained rows that could not be parsed.")

SyntaxError: unexpected EOF while parsing (Temp/ipykernel_19548/1702412426.py, line 11)

In [None]:
# Load pandas as pd
import pandas as pd


# Read spreadsheet and assign it to survey_responses
survey_responses = pd.read_excel('fcc-new-coder-survey-data Engineering.xlsx')

# View the head of the dataframe
print(survey_responses.head())

In [None]:
# Create string of lettered columns to load
col_string = 'AD,AW:BA'

# Load data with skiprows and usecols set
survey_responses = pd.read_excel("fcc-new-coder-survey-data Engineering.xlsx", 
                        usecols=col_string,skiprows=2)

# View the names of the columns selected
print(survey_responses.columns)
print(survey_responses.head())

In [None]:
survey_responses = pd.read_excel("fcc-new-coder-survey-data Engineering.xlsx")
survey_responses.head()

In [None]:
# Create df from second worksheet by referencing its position
responses_2017 = pd.read_excel("fcc-new-coder-survey-data Engineering.xlsx",
                               sheet_name=1)
responses_2017.head()
# # Graph where people would like to get a developer job
# job_prefs = responses_2017.groupby("JobPref").JobPref.count()
# job_prefs.plot.barh()
# plt.show()

In [None]:
# # Create df from second worksheet by referencing its name
# responses_2017 = pd.read_excel("fcc_survey.xlsx",
#                                sheet_name='2017')

# # Graph where people would like to get a developer job
# job_prefs = responses_2017.groupby("JobPref").JobPref.count()
# job_prefs.plot.barh()
# plt.show()

In [None]:
# # Load both the 2016 and 2017 sheets by name
# all_survey_data = pd.read_excel("fcc-new-coder-survey-data Engineering.xlsx",
#                                 sheet_name=['2016','2017'])

# # View the data type of all_survey_data
# print(type(all_survey_data))
# print(all_survey_data)

In [None]:
# Load all sheets in the Excel file
all_survey_data = pd.read_excel("fcc-new-coder-survey-data Engineering.xlsx",
                                sheet_name=[0,'2017'])

# View the sheet names in all_survey_data
print(all_survey_data.keys())

In [None]:
# Load all sheets in the Excel file
all_survey_data = pd.read_excel("fcc-new-coder-survey-data Engineering.xlsx",
                                sheet_name=None)

# View the sheet names in all_survey_data
print(all_survey_data.keys())

In [None]:
# # Create an empty dataframe
# all_responses = pd.DataFrame()

# # Set up for loop to iterate through values in responses
# for df in responses.values():
#   # Print the number of rows being added
#   print("Adding {} rows".format(df.shape[0]))
#   # Append df to all_responses, assign result
#   all_responses = all_responses.append(df)

# # Graph employment statuses in sample
# counts = all_responses.groupby("EmploymentStatus").EmploymentStatus.count()
# counts.plot.barh()
# plt.show()

In [None]:
survey_data = pd.read_excel("fcc-new-coder-survey-data Engineering.xlsx")

In [None]:
survey_data.head()

In [None]:
# Set dtype to load appropriate column(s) as Boolean data
# survey_data = pd.read_excel('fcc_survey_subset.xlsx')
# survey_data.head()
# survey_data = pd.read_excel("fcc_survey_subset.xlsx",
#                             dtype = {'HasDebt':bool})

# View financial burdens by Boolean group
# print(survey_data.groupby('HasDebt').sum())

In [None]:
# Load file with Yes as a True value and No as a False value
# survey_subset = pd.read_excel("fcc_survey_yn_data.xlsx",
#                               dtype={"HasDebt": bool,
#                               "AttendedBootCampYesNo": bool},
#                               true_values=['Yes'],
#                               false_values=['No'])

# # View the data
# print(survey_subset.head())

In [None]:
# # Load file, with Part1StartTime parsed as datetime data
# survey_data = pd.read_excel("fcc_survey.xlsx",
#                             parse_dates=['Part1StartTime'])

# # Print first few values of Part1StartTime
# print(survey_data.Part1StartTime.head())

In [None]:
# # Create dict of columns to combine into new datetime column
# datetime_cols = {"Part2Start": ['Part2StartDate','Part2StartTime']}


# # Load file, supplying the dict to parse_dates
# survey_data = pd.read_excel("fcc_survey_dts.xlsx",
#                             parse_dates=datetime_cols)

# # View summary statistics about Part2Start
# print(survey_data.Part2Start.describe())

In [None]:
# # Parse datetimes and assign result back to Part2EndTime
# survey_data["Part2EndTime"] = pd.to_datetime(survey_data["Part2EndTime"], 
#                                              format="%m%d%Y %H:%M:%S")

# # Print first few values of Part2EndTime
# print(survey_data.Part2EndTime.head())

In [None]:
from sqlalchemy import create_engine
engine = create_engine('sqlite:///fcc-new-coder-survey-data Engineering.db')

In [None]:
# engine.fcc-new-coder-survey-data Engineering_names

In [None]:
# # Load libraries
# import pandas as pd
# from sqlalchemy import create_engine

# # Create the database engine
# engine = create_engine('sqlite:///data.db')

# # Load hpd311calls without any SQL
# hpd_calls = pd.read_sql('hpd311calls', engine)

# # View the first few rows of data
# print(hpd_calls.head())

In [None]:
# # Create the database engine
# engine = create_engine("sqlite:///data.db")

# # Create a SQL query to load the entire weather table
# query = """
# SELECT *
#   FROM weather;
# """

# # Load weather with the SQL query
# weather = pd.read_sql(query, engine)

# # View the first few rows of data
# print(weather.head())

In [None]:
# # Create database engine for data.db
# engine = create_engine('sqlite:///data.db')

# # Write query to get date, tmax, and tmin from weather
# query = """
# SELECT date, 
#        tmax, 
#        tmin
#   FROM weather;
# """

# # Make a dataframe by passing query and engine to read_sql()
# temperatures = pd.read_sql(query,engine)

# # View the resulting dataframe
# print(temperatures)

In [None]:
# # Create query to get hpd311calls records about safety
# query = """
# SELECT *
# from hpd311calls
# where complaint_type = 'SAFETY';
# """

# # Query the database and assign result to safety_calls
# safety_calls = pd.read_sql(query,engine)

# # Graph the number of safety calls by borough
# call_counts = safety_calls.groupby('borough').unique_key.count()
# call_counts.plot.barh()
# plt.show()

In [None]:
# # Create query for records with max temps <= 32 or snow >= 1
# query = """
# SELECT *
#   FROM weather
#   where tmax <=32
#   or snow >=1;
# """

# # Query database and assign result to wintry_days
# wintry_days = pd.read_sql(query,engine)

# # View summary stats about the temperatures
# print(wintry_days.describe())

In [None]:
# # Create query for unique combinations of borough and complaint_type
# query = """
# SELECT distinct borough, 
#        complaint_type
#   from hpd311calls;
# """

# # Load results of query to a dataframe
# issues_and_boros = pd.read_sql(query,engine)

# # Check assumption about issues and boroughs
# print(issues_and_boros.head())

In [None]:
# # Create query to get call counts by complaint_type
# query = """
# select complaint_type, 
#      count(*)
#   FROM hpd311calls
#   group by complaint_type;
# """

# # Create dataframe of call counts by issue
# calls_by_issue = pd.read_sql(query, engine)

# # Graph the number of calls for each housing issue
# calls_by_issue.plot.barh(x="complaint_type")
# plt.show()

In [None]:
# # Create a query to get month, max tmax, and min tmin by month
# query = """
# SELECT month, 
# 	   MAX(tmax), 
#        MIN(tmin)
#   FROM weather 
#  GROUP BY month;
# """

# # Get dataframe of monthly weather stats
# weather_by_month = pd.read_sql(query, engine)

# # View weather stats by month
# print(weather_by_month)

In [None]:
# # Query to join weather to call records by date columns
# query = """
# SELECT * 
#   FROM hpd311calls
#   JOIN weather 
#   ON hpd311calls.created_date = weather.date;
# """

# # Create dataframe of joined tables
# calls_with_weather = pd.read_sql(query, engine)

# # View the dataframe to make sure all columns were joined
# print(calls_with_weather.head())

In [None]:
# # Query to get hpd311calls and precipitation values
# query = """
# SELECT hpd311calls.*, weather.prcp
#   FROM hpd311calls
#   JOIN weather
#     ON hpd311calls.created_date = weather.date;"""

# # Load query results into the leak_calls dataframe
# leak_calls = pd.read_sql(query, engine)

# # View the dataframe
# print(leak_calls.head())

In [None]:
# # Query to get water leak calls and daily precipitation
# query = """
# SELECT hpd311calls.*, weather.prcp
#   FROM hpd311calls
#   JOIN weather
#     ON hpd311calls.created_date = weather.date
#   where hpd311calls.complaint_type = 'WATER LEAK';"""

# # Load query results into the leak_calls dataframe
# leak_calls = pd.read_sql(query, engine)

# # View the dataframe
# print(leak_calls.head())

In [None]:
# # Query to get heat/hot water call counts by created_date
# query = """
# SELECT hpd311calls.created_date, 
#        COUNT(*)
#   FROM hpd311calls 
#  WHERE hpd311calls.complaint_type = 'HEAT/HOT WATER' 
#  GROUP BY hpd311calls.created_date;
# """

# # Query database and save results as df
# df = pd.read_sql(query, engine)

# # View first 5 records
# print(df.head())

In [None]:
# # Modify query to join tmax and tmin from weather by date
# query = """
# SELECT hpd311calls.created_date, 
# 	   COUNT(*), 
#        weather.tmax,
#        weather.tmin
#   FROM hpd311calls 
#        join weather
#        on hpd311calls.created_date = weather.date
#  WHERE hpd311calls.complaint_type = 'HEAT/HOT WATER' 
#  GROUP BY hpd311calls.created_date;
#  """

# # Query database and save results as df
# df = pd.read_sql(query, engine)

# # View first 5 records
# print(df.head())

In [None]:
# # Load pandas as pd
# import pandas as pd

# # Load the daily report to a dataframe
# pop_in_shelters = pd.read_json('dhs_daily_report.json')

# # View summary stats about pop_in_shelters
# print(pop_in_shelters.describe())

In [None]:
# try:
#     # Load the JSON with orient specified
#     df = pd.read_json("dhs_report_reformatted.json",
#                       orient='split')
    
#     # Plot total population in shelters over time
#     df["date_of_census"] = pd.to_datetime(df["date_of_census"])
#     df.plot(x="date_of_census", 
#             y="total_individuals_in_shelter")
#     plt.show()
    
# except ValueError:
#     print("pandas could not parse the JSON.")

In [None]:
# Process of getting data from API
#send and get data from API
# not tied to a particular API
# requests.get() to getr data from a url-requests.get(url_string)
# Keyword arguement are: Params:takes a dictionary of parameter and values to customize API request
# header: take a dictionary of value in providing user authentication to API
# response.json() will return the json data.


In [None]:
# params = {'terms':'Bookstore','location':'Francesisco'}
# header = {'Authorization','appl_key'}

In [None]:
# response = request.get(api_url, params=params,headers=headers)
# data = response.json()

In [None]:
# import requests

# api_url = "https://api.yelp.com/v3/businesses/search"
# params = {'terms':'Bookstore','location':'Francesisco'}
# headers = {'Authorization':api_url}

# # Get data about NYC cafes from the Yelp API
# response = requests.get(api_url, 
#                 headers=headers, 
#                 params=params)

# # Extract JSON data from the response
# data = response.json()
# data

# # Load data to a dataframe
# cafes = pd.DataFrame(data['businesses'])

#  # View the data's dtypes
# print(cafes.dtypes)

In [None]:
# # Create dictionary to query API for cafes in NYC
# parameters = {"term": "cafe",
#           	  "location": "NYC"}

# # Query the Yelp API with headers and params set
# response = requests.get(api_url, 
#                         headers=headers, 
#                         params=parameters)

# # Extract JSON data from response
# data = response.json()

# # Load "businesses" values to a dataframe and print head
# cafes = pd.DataFrame(data["businesses"])
# print(cafes.head())

In [None]:
# # Create dictionary that passes Authorization and key string
# headers = {'Authorization': "Bearer {}".format(api_key)}

# # Query the Yelp API with headers and params set
# response = requests.get(api_url,params=params,headers=headers)



# # Extract JSON data from response
# data = response.json()

# # Load "businesses" values to a dataframe and print names
# cafes = pd.DataFrame(data['businesses'])
# print(cafes.name)

In [None]:
# from pandas.io.json import json_normalize
# # Load json_normalize()
# from pandas.io.json import json_normalize

# # Isolate the JSON data from the API response
# data = response.json()

# # Flatten business data into a dataframe, replace separator
# cafes = json_normalize(data["businesses"],
#              sep = "_")

# # View data
# print(cafes.head())

# # Specify record path to get categories data
# flat_cafes = json_normalize(data["businesses"],
#                             sep="_",
#                     		record_path='categories')

# # View the data
# print(flat_cafes.head())

In [None]:
# # Load other business attributes and set meta prefix
# flat_cafes = json_normalize(data["businesses"],
#                             sep="_",
#                     		record_path="categories",
#                     		meta=['name', 
#                                   'alias',  
#                                   'rating',
#                           		  ['coordinates', 'latitude'], 
#                           		  ['coordinates', 'longitude']],
#                     		meta_prefix='biz_')





# # View the data
# print(flat_cafes.head())

In [None]:
# # Add an offset parameter to get cafes 51-100
# params = {"term": "cafe", 
#           "location": "NYC",
#           "sort_by": "rating", 
#           "limit": 50,
#           'offset':50}

# result = requests.get(api_url, headers=headers, params=params)
# next_50_cafes = json_normalize(result.json()["businesses"])

# # Append the results, setting ignore_index to renumber rows
# cafes = top_50_cafes.append(next_50_cafes,ignore_index=True)

# # Print shape of cafes
# print(cafes.shape)