### Import Dependencies & Load Data

In [1]:
# Import dependencies
import pandas as pd

# # FOR GEOPY
# from geopy.geocoders import Nominatim
# from geopy.extra.rate_limiter import RateLimiter

# # FOR GOOGLE MAPS
# import googlemaps

In [2]:
# Load the raw real estate listings from the Resources folder and preview them
real_estate_df = pd.read_csv(filepath_or_buffer="Resources/Real_Estate_Data.csv")
real_estate_df.head(5)

Unnamed: 0,status,price,bed,bath,acre_lot,full_address,street,city,state,zip_code,house_size,sold_date
0,for_sale,105000.0,3.0,2.0,0.12,"Sector Yahuecas Titulo # V84, Adjuntas, PR, 00601",Sector Yahuecas Titulo # V84,Adjuntas,Puerto Rico,601.0,920.0,
1,for_sale,80000.0,4.0,2.0,0.08,"Km 78 9 Carr # 135, Adjuntas, PR, 00601",Km 78 9 Carr # 135,Adjuntas,Puerto Rico,601.0,1527.0,
2,for_sale,67000.0,2.0,1.0,0.15,"556G 556-G 16 St, Juana Diaz, PR, 00795",556G 556-G 16 St,Juana Diaz,Puerto Rico,795.0,748.0,
3,for_sale,145000.0,4.0,2.0,0.1,"R5 Comunidad El Paraso Calle De Oro R-5 Ponce,...",R5 Comunidad El Paraso Calle De Oro R-5 Ponce,Ponce,Puerto Rico,731.0,1800.0,
4,for_sale,65000.0,6.0,2.0,0.05,"14 Navarro, Mayaguez, PR, 00680",14 Navarro,Mayaguez,Puerto Rico,680.0,,


In [3]:
# Load the Trader Joe's store locations from the Resources folder and preview them
trader_joes_df = pd.read_csv(filepath_or_buffer="Resources/Trader_Joes_Stores.csv")
trader_joes_df.head(5)

Unnamed: 0,name,latitude,longitude,street,city,state,zip,phone,website
0,Trader Joe's Houston - Alabama Theater (426),29.739211,-95.411323,2922 S Shepherd Drive,Houston,TX,77098,713-526-4034,https://locations.traderjoes.com/tx/houston/426/
1,Trader Joe's Houston - West (431),29.735406,-95.582487,11683 Westheimer Rd,Houston,TX,77077,281-496-2488,https://locations.traderjoes.com/tx/houston/431/
2,Trader Joe's Houston - S Voss Rd (427),29.752713,-95.501667,1440 South Voss Road,Houston,TX,77057,713-266-2377,https://locations.traderjoes.com/tx/houston/427/
3,Trader Joe's McKinney (408),33.173697,-96.642874,2851 Craig Dr Ste 100,McKinney,TX,75070,214-491-1893,https://locations.traderjoes.com/tx/mckinney/408/
4,Trader Joe's Dallas Far North (407),32.95087,-96.802823,14856 Preston Rd,Dallas,TX,75254,972-239-3901,https://locations.traderjoes.com/tx/dallas/407/


### Clean Real_Estate_Data.csv

In [4]:
# Number of rows in the raw real estate dataframe
real_estate_df.shape[0]

923159

In [5]:
# Count the missing values per column to decide which columns/rows to drop
real_estate_df.isna().sum()

status               0
price               71
bed             131703
bath            115192
acre_lot        273623
full_address         0
street            2138
city                74
state                0
zip_code           205
house_size      297843
sold_date       466763
dtype: int64

In [6]:
# List the distinct listing statuses present in the data
real_estate_df["status"].unique()

array(['for_sale', 'ready_to_build'], dtype=object)

In [7]:
# Remove the status, street, and sold_date columns, then preview the result
real_estate_df = real_estate_df.drop(columns=["status", "street", "sold_date"])
real_estate_df.head(5)

Unnamed: 0,price,bed,bath,acre_lot,full_address,city,state,zip_code,house_size
0,105000.0,3.0,2.0,0.12,"Sector Yahuecas Titulo # V84, Adjuntas, PR, 00601",Adjuntas,Puerto Rico,601.0,920.0
1,80000.0,4.0,2.0,0.08,"Km 78 9 Carr # 135, Adjuntas, PR, 00601",Adjuntas,Puerto Rico,601.0,1527.0
2,67000.0,2.0,1.0,0.15,"556G 556-G 16 St, Juana Diaz, PR, 00795",Juana Diaz,Puerto Rico,795.0,748.0
3,145000.0,4.0,2.0,0.1,"R5 Comunidad El Paraso Calle De Oro R-5 Ponce,...",Ponce,Puerto Rico,731.0,1800.0
4,65000.0,6.0,2.0,0.05,"14 Navarro, Mayaguez, PR, 00680",Mayaguez,Puerto Rico,680.0,


In [8]:
# Drop rows that are null for price, city, zip code
# real_estate_df = real_estate_df.dropna(subset=['price', 'city', 'zip_code'])

In [9]:
# (Disabled alternative) Find average for bed, bath, acre_lot, and house_size
# NOTE: the frame still holds text columns (full_address, city, state), so the
# mean must be restricted to the numeric columns.
# real_estate_df.mean(numeric_only=True)

In [10]:
# (Disabled alternative) Fill NAs with average for bed, bath, acre_lot, and house_size
# NOTE: fillna returns a new dataframe, so the result has to be assigned back to take effect.
# real_estate_df = real_estate_df.fillna(real_estate_df.mean(numeric_only=True))

In [11]:
# Discard every row that has a missing value in any remaining column
# (chosen instead of the mean-fill approach in the commented-out cells above)
real_estate_df = real_estate_df.dropna(axis=0, how="any")

In [12]:
# Show the first five rows of the cleaned dataframe
real_estate_df.head(5)

Unnamed: 0,price,bed,bath,acre_lot,full_address,city,state,zip_code,house_size
0,105000.0,3.0,2.0,0.12,"Sector Yahuecas Titulo # V84, Adjuntas, PR, 00601",Adjuntas,Puerto Rico,601.0,920.0
1,80000.0,4.0,2.0,0.08,"Km 78 9 Carr # 135, Adjuntas, PR, 00601",Adjuntas,Puerto Rico,601.0,1527.0
2,67000.0,2.0,1.0,0.15,"556G 556-G 16 St, Juana Diaz, PR, 00795",Juana Diaz,Puerto Rico,795.0,748.0
3,145000.0,4.0,2.0,0.1,"R5 Comunidad El Paraso Calle De Oro R-5 Ponce,...",Ponce,Puerto Rico,731.0,1800.0
5,179000.0,4.0,3.0,0.46,"Bo Calabazas San Sebastian, San Sebastian, PR,...",San Sebastian,Puerto Rico,612.0,2520.0


In [13]:
# Row count once rows containing NAs have been removed
real_estate_df.shape[0]

421227

In [14]:
# List the distinct states/territories that still have listings
real_estate_df["state"].unique()

array(['Puerto Rico', 'Virgin Islands', 'Massachusetts', 'Connecticut',
       'New Jersey', 'New York', 'New Hampshire', 'Vermont',
       'Rhode Island', 'Wyoming', 'Maine', 'Pennsylvania',
       'West Virginia', 'Delaware'], dtype=object)

In [15]:
# Remove listings located in states/territories that have no Trader Joe's store.
# A single membership mask replaces the four successive inequality filters.
real_estate_df = real_estate_df[
    ~real_estate_df["state"].isin(
        ["Puerto Rico", "Virgin Islands", "Wyoming", "West Virginia"]
    )
]
real_estate_df.head(5)

Unnamed: 0,price,bed,bath,acre_lot,full_address,city,state,zip_code,house_size
24231,180000.0,2.0,1.0,0.34,"23 Moore St, Agawam, MA, 01001",Agawam,Massachusetts,1001.0,676.0
24236,239900.0,3.0,1.0,0.46,"270 South St, Agawam, MA, 01001",Agawam,Massachusetts,1001.0,1196.0
24237,525000.0,3.0,3.0,0.45,"955 River Rd, Agawam, MA, 01001",Agawam,Massachusetts,1001.0,2314.0
24238,289900.0,3.0,2.0,0.36,"82 Harvey Johnson Dr, Agawam, MA, 01001",Agawam,Massachusetts,1001.0,1276.0
24241,275000.0,4.0,2.0,0.11,"6-8 King Ave, Agawam, MA, 01001",Agawam,Massachusetts,1001.0,1732.0


In [16]:
# Row count once the states without a Trader Joe's have been filtered out
real_estate_df.shape[0]

405489

In [17]:
# Count how many listings remain in each state
real_estate_df.state.value_counts()

Massachusetts    103770
New Jersey        73571
Connecticut       68033
New York          51270
Rhode Island      24620
New Hampshire     24454
Maine             23010
Vermont           22205
Pennsylvania      12849
Delaware           1707
Name: state, dtype: int64

### Clean Trader_Joes_Stores.csv

In [18]:
# Remove the phone and website columns, then preview the result
trader_joes_df = trader_joes_df.drop(columns=["phone", "website"])
trader_joes_df.head(5)

Unnamed: 0,name,latitude,longitude,street,city,state,zip
0,Trader Joe's Houston - Alabama Theater (426),29.739211,-95.411323,2922 S Shepherd Drive,Houston,TX,77098
1,Trader Joe's Houston - West (431),29.735406,-95.582487,11683 Westheimer Rd,Houston,TX,77077
2,Trader Joe's Houston - S Voss Rd (427),29.752713,-95.501667,1440 South Voss Road,Houston,TX,77057
3,Trader Joe's McKinney (408),33.173697,-96.642874,2851 Craig Dr Ste 100,McKinney,TX,75070
4,Trader Joe's Dallas Far North (407),32.95087,-96.802823,14856 Preston Rd,Dallas,TX,75254


In [19]:
# List the distinct state abbreviations that have a Trader Joe's store
trader_joes_df["state"].unique()

array(['TX', 'AR', 'VT', 'CO', 'MN', 'ME', 'GA', 'AZ', 'RI', 'CA', 'TN',
       'NE', 'NH', 'WI', 'CT', 'PA', 'LA', 'DE', 'FL', 'UT', 'NJ', 'NM',
       'NC', 'KS', 'MA', 'SC', 'MI', 'NV', 'ID', 'KY', 'MD', 'OK', 'IA',
       'NY', 'OH', 'WA', 'IL', 'OR', 'DC', 'MO', 'VA', 'AL', 'IN'],
      dtype=object)

In [20]:
# Dropping states with TJs that do not have real estate sales
# TODO: SHOULD WE DO THIS? (undecided, so the filter below stays commented out)

# Abbreviations of the states that remain in real_estate_df after cleaning
# trader_joes_states = ['VT', 'ME', 'RI', 'NH', 'CT', 'PA', 'DE', 'NJ', 'MA', 'NY']

# NOTE: comparing the column to a list with `!=` is not a membership test, and the
# goal is to KEEP stores whose state is in the list, so use isin() for the mask.
# trader_joes_df = trader_joes_df[trader_joes_df.state.isin(trader_joes_states)]
# trader_joes_df.head()

### Adding Latitude & Longitude Values to Real_Estate_Data.csv

In [21]:
# # FOR GEOPY (disabled; requires the geopy imports in the first cell)

# # Finding lat and long for each address
# # Documentation: https://geopy.readthedocs.io/en/stable/index.html?highlight=add%20geocode%20to%20dataframe#usage-with-pandas

# # Creating an instance of Nominatim Class
# # NOTE(review): user_agent is empty here; Nominatim expects an application-specific
# # identifier -- confirm and fill in before enabling.
# geolocator = Nominatim(user_agent = "")

# # Applying the rate limiter wrapper
# # NOTE: at one request per second, the ~405k remaining rows would take several days.
# geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# # Applying the method to df: geocode each address first, then pull the point out
# # of the returned location (None when the address could not be geocoded)
# real_estate_df['location'] = real_estate_df['full_address'].apply(geocode)
# real_estate_df['coordinates'] = real_estate_df['location'].apply(lambda loc: tuple(loc.point) if loc else None)

# real_estate_df.head()

In [22]:
# # FOR GOOGLE MAPS (disabled; requires the googlemaps import in the first cell)

# # Create the API client
# # NOTE(review): g_API is not defined anywhere in this notebook; load the key from
# # an environment variable or an untracked config file rather than hardcoding it.
# gmaps_key = googlemaps.Client(key=g_API)

# # Create columns to store longitude and latitude
# real_estate_df["longitude"] = None
# real_estate_df["latitude"] = None

# # Geocode each address; rows with no usable result keep None in both columns
# for i in real_estate_df.index:
#     try:
#         geocode_obj = gmaps_key.geocode(real_estate_df.loc[i, "full_address"])
#         lat = geocode_obj[0]['geometry']['location']['lat']
#         lon = geocode_obj[0]['geometry']['location']['lng']
#         real_estate_df.loc[i,'latitude'] = lat
#         real_estate_df.loc[i,'longitude'] = lon
#     except (IndexError, KeyError):
#         # Empty or malformed geocoding result for this address
#         lat = None
#         lon = None