In [2]:
# import dependencies
import pandas as pd
import os
import csv
import datetime as dt

In [6]:
# create the file_path to the data
from codecs import utf_8_decode


file_path = 'massachusetts_average_income.xlsx'

# read in the data as a df
df = pd.read_csv(file_path)
df

TypeError: read_csv() got an unexpected keyword argument 'encode'

In [None]:
#view the data columns
df.columns

Index(['status', 'price', 'bed', 'bath', 'acre_lot', 'full_address', 'street',
       'city', 'state', 'zip_code', 'house_size', 'sold_date', 'Unnamed: 12',
       'Unnamed: 13', 'Unnamed: 14'],
      dtype='object')

In [None]:
#drop unnecessary columns
df = df.drop(columns=[
             "bed",
             "bath",
             "acre_lot",
             "full_address",
             "Unnamed: 12", 
             "Unnamed: 13", 
             "Unnamed: 14"])
df.head()

Unnamed: 0,status,price,street,city,state,zip_code,house_size,sold_date
0,for_sale,169900.0,420 Main St Apt 42,Agawam,Massachusetts,1001.0,892.0,4/30/2019
1,for_sale,525000.0,955 River Rd,Agawam,Massachusetts,1001.0,2314.0,6/25/2014
2,for_sale,289900.0,82 Harvey Johnson Dr,Agawam,Massachusetts,1001.0,1276.0,10/12/2012
3,for_sale,239900.0,15 Ash Ln Unit 15,Agawam,Massachusetts,1001.0,1229.0,6/7/2016
4,for_sale,249900.0,181 Brookfield Ln Unit 181,Agawam,Massachusetts,1001.0,860.0,1/17/1997


In [None]:
# drop null values
df = df.dropna()
df.head()

Unnamed: 0,status,price,street,city,state,zip_code,house_size,sold_date
0,for_sale,169900.0,420 Main St Apt 42,Agawam,Massachusetts,1001.0,892.0,4/30/2019
1,for_sale,525000.0,955 River Rd,Agawam,Massachusetts,1001.0,2314.0,6/25/2014
2,for_sale,289900.0,82 Harvey Johnson Dr,Agawam,Massachusetts,1001.0,1276.0,10/12/2012
3,for_sale,239900.0,15 Ash Ln Unit 15,Agawam,Massachusetts,1001.0,1229.0,6/7/2016
4,for_sale,249900.0,181 Brookfield Ln Unit 181,Agawam,Massachusetts,1001.0,860.0,1/17/1997


In [None]:
# view the data types
df.dtypes

status         object
price         float64
street         object
city           object
state          object
zip_code      float64
house_size    float64
sold_date      object
dtype: object

In [None]:
#convert the data types

# convert the Zip code to a string and add a 0
df['zip_code'] = df['zip_code'].astype(str)
df['zip_code'] = '0' + df.zip_code.str.split('.').str[0]

# convert the price column to an integer
df['price'] = df['price'].astype(int)

# reduce the state column to say 'MA'
df['state'] = df['state'].astype(str)
df['state'] = 'MA'

df['sold_date'] = pd.to_datetime(df['sold_date'], format='%m/%d/%Y')

df.head()

Unnamed: 0,status,price,street,city,state,zip_code,house_size,sold_date
0,for_sale,169900,420 Main St Apt 42,Agawam,MA,1001,892.0,2019-04-30
1,for_sale,525000,955 River Rd,Agawam,MA,1001,2314.0,2014-06-25
2,for_sale,289900,82 Harvey Johnson Dr,Agawam,MA,1001,1276.0,2012-10-12
3,for_sale,239900,15 Ash Ln Unit 15,Agawam,MA,1001,1229.0,2016-06-07
4,for_sale,249900,181 Brookfield Ln Unit 181,Agawam,MA,1001,860.0,1997-01-17


In [None]:
# sort the df by sold_date
df = df.sort_values(by='sold_date', ascending=False)
df.head()

Unnamed: 0,status,price,street,city,state,zip_code,house_size,sold_date
61156,for_sale,4299000,11 Peter Williamson Blvd,Oak Bluffs,MA,2557,4200.0,2022-12-01
63272,for_sale,4299000,11 Peter Williamson Blvd,Oak Bluffs,MA,2557,4200.0,2022-12-01
63202,for_sale,1895000,34 Double Ox Rd,Oak Bluffs,MA,2557,2407.0,2022-09-30
61160,for_sale,1895000,34 Double Ox Rd,Oak Bluffs,MA,2557,2407.0,2022-09-30
62655,for_sale,940000,5B Pine Tree Rd Unit 1,Nantucket,MA,2554,798.0,2022-06-01


In [None]:
# filter the df by date to retain only data for 2018-2022
df = df.loc[(df['sold_date'] > '2017/12/31')]
df.head()

Unnamed: 0,status,price,street,city,state,zip_code,house_size,sold_date
61156,for_sale,4299000,11 Peter Williamson Blvd,Oak Bluffs,MA,2557,4200.0,2022-12-01
63272,for_sale,4299000,11 Peter Williamson Blvd,Oak Bluffs,MA,2557,4200.0,2022-12-01
63202,for_sale,1895000,34 Double Ox Rd,Oak Bluffs,MA,2557,2407.0,2022-09-30
61160,for_sale,1895000,34 Double Ox Rd,Oak Bluffs,MA,2557,2407.0,2022-09-30
62655,for_sale,940000,5B Pine Tree Rd Unit 1,Nantucket,MA,2554,798.0,2022-06-01


In [None]:
# drop duplicate data
df = df.drop_duplicates()
df.head()

Unnamed: 0,status,price,street,city,state,zip_code,house_size,sold_date
61156,for_sale,4299000,11 Peter Williamson Blvd,Oak Bluffs,MA,2557,4200.0,2022-12-01
63202,for_sale,1895000,34 Double Ox Rd,Oak Bluffs,MA,2557,2407.0,2022-09-30
62655,for_sale,940000,5B Pine Tree Rd Unit 1,Nantucket,MA,2554,798.0,2022-06-01
60962,for_sale,8900000,76 Menemsha Inn Rd,Chilmark,MA,2535,4033.0,2022-05-31
62633,for_sale,849995,40 1/2 Essex Rd,Nantucket,MA,2554,1030.0,2022-05-30


In [None]:
# drop erroneous data
df = df.loc[(df['sold_date'] < '2022/12/01')]
df.head()

Unnamed: 0,status,price,street,city,state,zip_code,house_size,sold_date
63202,for_sale,1895000,34 Double Ox Rd,Oak Bluffs,MA,2557,2407.0,2022-09-30
62655,for_sale,940000,5B Pine Tree Rd Unit 1,Nantucket,MA,2554,798.0,2022-06-01
60962,for_sale,8900000,76 Menemsha Inn Rd,Chilmark,MA,2535,4033.0,2022-05-31
62633,for_sale,849995,40 1/2 Essex Rd,Nantucket,MA,2554,1030.0,2022-05-30
64774,for_sale,950000,8 Myrtle Ave,Oak Bluffs,MA,2557,1320.0,2022-05-27


In [None]:
# reset the index
df = df.reset_index()
df.tail()

Unnamed: 0,index,status,price,street,city,state,zip_code,house_size,sold_date
436,301,for_sale,299900,26 Cold Hill Dr,Granby,MA,1033,880.0,2018-03-19
437,52975,for_sale,825000,2 Hawthorne Pl Apt 3M,Boston,MA,2114,1197.0,2018-03-14
438,55889,for_sale,1200000,45 Alban St Unit 1,Boston,MA,2124,2501.0,2018-03-05
439,17468,for_sale,324900,123 Sachem Ave,Worcester,MA,1606,908.0,2018-02-13
440,5275,for_sale,139000,137 N East St,Holyoke,MA,1040,1644.0,2018-01-18


In [None]:
df = df["index"].drop_duplicates()
df.head()

NameError: name 'df' is not defined