# Project 2: Exact, Transform, Load
## Housing market fluctations in four major cities of Texas - Rental Prices

### Group 2 Team Members:
* Sunny Bhatt - ETL Houston Housing Price information
* Waynette Burke - ETL San Antonio and Houston Rental Information
* Ariana Garcia - ETL San Antonio Housing Price Information

In [1]:
#Import Dependency
import pandas as pd
import pymongo

### Establish a connection

In [2]:
# Initialize PyMongo to work with MongoDBs
client = pymongo.MongoClient('mongodb://localhost:27017')

In [3]:
# Define database
db = client.rentalHouse_db

#Creating collection("table") rentalInfo
collection = db.rentalInfo

In [4]:
#Dropping Database and Collection if exist
collection.drop()

### Reading the CSV File for extraction

In [5]:
#CSV File to be extracted
houseRent = "Resources/Metro_zori_sm_month.csv"

#Read in CSV file
rental = pd.read_csv(houseRent)

In [6]:
#View of the imported CSV that was converted to a dataframe
rental.head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,2015-03-31,2015-04-30,2015-05-31,2015-06-30,2015-07-31,...,2021-11-30,2021-12-31,2022-01-31,2022-02-28,2022-03-31,2022-04-30,2022-05-31,2022-06-30,2022-07-31,2022-08-31
0,102001,0,United States,country,,1379.35935,1391.073873,1401.530422,1412.396507,1418.136828,...,1926.501816,1940.496249,1948.801048,1966.495998,1982.244729,2008.780414,2030.844688,2055.759958,2076.446338,2089.775788
1,394913,1,"New York, NY",msa,NY,2519.21964,2546.687664,2566.288864,2583.939076,2586.989228,...,2943.9728,2967.725193,2995.408435,3043.726843,3096.093502,3155.667377,3214.734293,3272.366446,3322.750066,3341.555894
2,753899,2,"Los Angeles, CA",msa,CA,1984.656244,1997.410467,2011.539491,2024.112854,2038.44069,...,2800.991074,2818.584204,2835.348908,2859.616662,2891.533834,2929.35269,2963.407518,2989.180066,3009.899282,3023.599107
3,394463,3,"Chicago, IL",msa,IL,1508.976397,1522.121857,1534.084115,1561.01873,1568.264918,...,1826.037645,1833.685056,1846.498235,1862.123404,1881.300441,1904.071425,1929.775478,1951.121693,1970.803363,1979.473043
4,394514,4,"Dallas, TX",msa,TX,1183.20993,1193.705541,1205.09337,1212.123457,1218.724732,...,1720.058793,1730.702831,1741.993929,1752.658761,1762.695569,1784.917301,1811.899478,1846.581556,1870.973903,1881.637168


In [7]:
#Snapshot of the column names to allow for selecting what is required for this analysis
rental.columns

Index(['RegionID', 'SizeRank', 'RegionName', 'RegionType', 'StateName',
       '2015-03-31', '2015-04-30', '2015-05-31', '2015-06-30', '2015-07-31',
       '2015-08-31', '2015-09-30', '2015-10-31', '2015-11-30', '2015-12-31',
       '2016-01-31', '2016-02-29', '2016-03-31', '2016-04-30', '2016-05-31',
       '2016-06-30', '2016-07-31', '2016-08-31', '2016-09-30', '2016-10-31',
       '2016-11-30', '2016-12-31', '2017-01-31', '2017-02-28', '2017-03-31',
       '2017-04-30', '2017-05-31', '2017-06-30', '2017-07-31', '2017-08-31',
       '2017-09-30', '2017-10-31', '2017-11-30', '2017-12-31', '2018-01-31',
       '2018-02-28', '2018-03-31', '2018-04-30', '2018-05-31', '2018-06-30',
       '2018-07-31', '2018-08-31', '2018-09-30', '2018-10-31', '2018-11-30',
       '2018-12-31', '2019-01-31', '2019-02-28', '2019-03-31', '2019-04-30',
       '2019-05-31', '2019-06-30', '2019-07-31', '2019-08-31', '2019-09-30',
       '2019-10-31', '2019-11-30', '2019-12-31', '2020-01-31', '2020-02-29',
    

In [8]:
#Collecting data for the four Texas Cities: Dallas, Austin, San Antonio and Houston
texas_rental = rental.loc[((rental["RegionName"] == "Houston, TX") |
                                 (rental["RegionName"] == "San Antonio, TX")),:]
texas_rental.head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,2015-03-31,2015-04-30,2015-05-31,2015-06-30,2015-07-31,...,2021-11-30,2021-12-31,2022-01-31,2022-02-28,2022-03-31,2022-04-30,2022-05-31,2022-06-30,2022-07-31,2022-08-31
5,394692,5,"Houston, TX",msa,TX,1266.485274,1276.757481,1285.512422,1294.301128,1298.504212,...,1545.519707,1556.718879,1556.678364,1564.092063,1569.778502,1584.517983,1598.195718,1612.944456,1625.251711,1632.930432
24,395055,24,"San Antonio, TX",msa,TX,1036.96269,1045.293207,1054.488948,1061.74597,1065.038288,...,1432.520254,1438.136386,1442.200762,1452.625453,1456.879987,1472.082994,1487.993268,1505.683417,1521.742595,1527.103719


In [9]:
#Filtering of the columns: Needing only data for the year 2021
reduced_rental = texas_rental[['RegionName','2021-01-31', '2021-02-28', '2021-03-31', '2021-04-30', '2021-05-31',
       '2021-06-30', '2021-07-31', '2021-08-31', '2021-09-30', '2021-10-31',
       '2021-11-30', '2021-12-31']]

reduced_rental.head()

Unnamed: 0,RegionName,2021-01-31,2021-02-28,2021-03-31,2021-04-30,2021-05-31,2021-06-30,2021-07-31,2021-08-31,2021-09-30,2021-10-31,2021-11-30,2021-12-31
5,"Houston, TX",1385.370824,1381.6966,1387.381925,1400.49202,1427.686387,1459.167326,1491.47515,1518.166502,1530.272085,1540.927833,1545.519707,1556.718879
24,"San Antonio, TX",1244.323792,1246.63651,1256.555388,1272.033387,1293.879012,1319.928451,1350.828315,1387.11605,1415.238489,1430.159529,1432.520254,1438.136386


In [10]:
#Transpose the table data
transposed_house = reduced_rental.transpose()

#Reset the index so that the "Date" field is no longer considered as an Index
transposed_house = transposed_house.reset_index()

#renamed the columns
transposed_house = transposed_house.set_axis(["Date", "Houston,TX", "San Antonio, TX"], axis = "columns") 

# remove the first row which was the old header
transposed_house = transposed_house.drop([0])

#View of the modified dataframe
transposed_house.head()

Unnamed: 0,Date,"Houston,TX","San Antonio, TX"
1,2021-01-31,1385.370824,1244.323792
2,2021-02-28,1381.6966,1246.63651
3,2021-03-31,1387.381925,1256.555388
4,2021-04-30,1400.49202,1272.033387
5,2021-05-31,1427.686387,1293.879012


In [11]:
#Cleaning up the formatting of the data in the dataframe
transposed_house["Date"] =transposed_house["Date"].astype("datetime64")
transposed_house["Houston,TX"] = transposed_house["Houston,TX"].astype(float).map("${:,.2f}".format)
transposed_house["San Antonio, TX"] = transposed_house["San Antonio, TX"].astype(float).map("${:,.2f}".format)

transposed_house.head()

Unnamed: 0,Date,"Houston,TX","San Antonio, TX"
1,2021-01-31,"$1,385.37","$1,244.32"
2,2021-02-28,"$1,381.70","$1,246.64"
3,2021-03-31,"$1,387.38","$1,256.56"
4,2021-04-30,"$1,400.49","$1,272.03"
5,2021-05-31,"$1,427.69","$1,293.88"


### Adding dataframe to MongoDB

In [12]:
#Updating the MondoDB Collection with the data in the dataframe

rental_dict ={}

for row in transposed_house.index:
    #Creating the dictionary for the items in the dataframe
    rental_dict= {"date":transposed_house["Date"][row],
                  "Houton":transposed_house["Houston,TX"][row],
                  "San Antonio":transposed_house["San Antonio, TX"][row]}
    
    #Adding the dictionary to the database collection
    collection.insert_one(rental_dict)


In [13]:
# Display items in MongoDB collection
rentalTable = db.rentalInfo.find()

for entry in rentalTable:
    print(entry)

{'_id': ObjectId('632392508f4fbddbab2f0381'), 'date': datetime.datetime(2021, 1, 31, 0, 0), 'Houton': '$1,385.37', 'San Antonio': '$1,244.32'}
{'_id': ObjectId('632392518f4fbddbab2f0382'), 'date': datetime.datetime(2021, 2, 28, 0, 0), 'Houton': '$1,381.70', 'San Antonio': '$1,246.64'}
{'_id': ObjectId('632392518f4fbddbab2f0383'), 'date': datetime.datetime(2021, 3, 31, 0, 0), 'Houton': '$1,387.38', 'San Antonio': '$1,256.56'}
{'_id': ObjectId('632392518f4fbddbab2f0384'), 'date': datetime.datetime(2021, 4, 30, 0, 0), 'Houton': '$1,400.49', 'San Antonio': '$1,272.03'}
{'_id': ObjectId('632392518f4fbddbab2f0385'), 'date': datetime.datetime(2021, 5, 31, 0, 0), 'Houton': '$1,427.69', 'San Antonio': '$1,293.88'}
{'_id': ObjectId('632392518f4fbddbab2f0386'), 'date': datetime.datetime(2021, 6, 30, 0, 0), 'Houton': '$1,459.17', 'San Antonio': '$1,319.93'}
{'_id': ObjectId('632392518f4fbddbab2f0387'), 'date': datetime.datetime(2021, 7, 31, 0, 0), 'Houton': '$1,491.48', 'San Antonio': '$1,350.83'}