In [1]:
# Standard library
import os
from datetime import datetime
from urllib.parse import quote_plus

# Third-party
import numpy as np
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
from sqlalchemy import inspect

# Local configuration (keeps the database password out of the notebook)
from config import db_password

In [2]:
# Read the raw temperature CSV, skipping the three lines that precede the header row,
# then preview the first 20 rows.
raw_csv_path = 'Resources/35-tavg.csv'
Temperature_df = pd.read_csv(raw_csv_path, skiprows=3)
Temperature_df.head(20)

Unnamed: 0,Location ID,Location,Date,Value,Rank,Anomaly (1901-2000 base period),1901-2000 Mean
0,OR-001,Baker County,189501,22.4,36,-1.9,24.3
1,OR-001,Baker County,189502,25.9,27,-3.3,29.2
2,OR-001,Baker County,189503,31.9,14,-3.5,35.4
3,OR-001,Baker County,189504,44.3,93,1.6,42.7
4,OR-001,Baker County,189505,48.9,39,-1.3,50.2
5,OR-001,Baker County,189506,55.5,32,-1.9,57.4
6,OR-001,Baker County,189507,62.8,9,-3.2,66.0
7,OR-001,Baker County,189508,63.9,40,-1.1,65.0
8,OR-001,Baker County,189509,50.2,1,-6.5,56.7
9,OR-001,Baker County,189510,47.3,94,1.3,46.0


In [3]:
# Keep only the rows from January 1992 through December 2015.
# Date is still a YYYYMM integer at this point, so an inclusive numeric range works.
# The explicit .copy() makes the subset an independent DataFrame; without it, adding
# or overwriting columns on the filtered frame can raise SettingWithCopyWarning.
in_range = Temperature_df['Date'].between(199201, 201512)
Temperature_df = Temperature_df.loc[in_range].copy()
Temperature_df.head()

Unnamed: 0,Location ID,Location,Date,Value,Rank,Anomaly (1901-2000 base period),1901-2000 Mean
1164,OR-001,Baker County,199201,28.8,98,4.5,24.3
1165,OR-001,Baker County,199202,37.6,125,8.4,29.2
1166,OR-001,Baker County,199203,43.4,128,8.0,35.4
1167,OR-001,Baker County,199204,46.5,118,3.8,42.7
1168,OR-001,Baker County,199205,56.9,127,6.7,50.2


In [4]:
# Inspect the dtype of every column (Date is still an integer YYYYMM value here).
Temperature_df.dtypes

Location ID                         object
Location                            object
Date                                 int64
Value                              float64
Rank                                 int64
Anomaly (1901-2000 base period)    float64
1901-2000 Mean                     float64
dtype: object

In [5]:
# Derive county_code from Location ID by dropping its first three characters
# (the "OR-" prefix in values such as "OR-001").
location_ids = Temperature_df["Location ID"]
Temperature_df["county_code"] = location_ids.str[3:]
Temperature_df.head()

Unnamed: 0,Location ID,Location,Date,Value,Rank,Anomaly (1901-2000 base period),1901-2000 Mean,county_code
1164,OR-001,Baker County,199201,28.8,98,4.5,24.3,1
1165,OR-001,Baker County,199202,37.6,125,8.4,29.2,1
1166,OR-001,Baker County,199203,43.4,128,8.0,35.4,1
1167,OR-001,Baker County,199204,46.5,118,3.8,42.7,1
1168,OR-001,Baker County,199205,56.9,127,6.7,50.2,1


In [6]:
# Parse the integer YYYYMM values in Date into datetimes
# (each value becomes the first day of its month).
yyyymm_values = Temperature_df["Date"]
Temperature_df["Date"] = pd.to_datetime(yyyymm_values, format='%Y%m')
Temperature_df.head()

Unnamed: 0,Location ID,Location,Date,Value,Rank,Anomaly (1901-2000 base period),1901-2000 Mean,county_code
1164,OR-001,Baker County,1992-01-01,28.8,98,4.5,24.3,1
1165,OR-001,Baker County,1992-02-01,37.6,125,8.4,29.2,1
1166,OR-001,Baker County,1992-03-01,43.4,128,8.0,35.4,1
1167,OR-001,Baker County,1992-04-01,46.5,118,3.8,42.7,1
1168,OR-001,Baker County,1992-05-01,56.9,127,6.7,50.2,1


In [7]:
# Remove the anomaly, long-term mean, and rank columns; they are not used downstream.
unneeded_columns = ["Anomaly (1901-2000 base period)", "1901-2000 Mean", "Rank"]
Temperature_df.drop(columns=unneeded_columns, inplace=True)

In [8]:
# Split the datetime Date column into separate integer year and month columns.
date_parts = Temperature_df["Date"].dt
Temperature_df["year"] = date_parts.year
Temperature_df["month"] = date_parts.month
Temperature_df.tail()

Unnamed: 0,Location ID,Location,Date,Value,county_code,year,month
54927,OR-071,Yamhill County,2015-08-01,68.3,71,2015,8
54928,OR-071,Yamhill County,2015-09-01,60.6,71,2015,9
54929,OR-071,Yamhill County,2015-10-01,57.6,71,2015,10
54930,OR-071,Yamhill County,2015-11-01,43.4,71,2015,11
54931,OR-071,Yamhill County,2015-12-01,41.7,71,2015,12


In [9]:
# Date, Location ID, and Location are now represented by year, month, and county_code,
# so drop the original columns.
superseded_columns = ["Date", "Location ID", "Location"]
Temperature_df.drop(columns=superseded_columns, inplace=True)

In [10]:
# Give the generic "Value" column a descriptive name: avg_temp.
column_renames = {"Value": "avg_temp"}
Temperature_df.rename(column_renames, axis="columns", inplace=True)
Temperature_df.head()

Unnamed: 0,avg_temp,county_code,year,month
1164,28.8,1,1992,1
1165,37.6,1,1992,2
1166,43.4,1,1992,3
1167,46.5,1,1992,4
1168,56.9,1,1992,5


In [11]:
# Check for nulls: count() reports the number of non-null values in each column,
# so any column whose count is below len(Temperature_df) contains missing values.
Temperature_df.count()

avg_temp       10368
county_code    10368
year           10368
month          10368
dtype: int64

In [12]:
# Write the cleaned DataFrame to a CSV file in the Resources folder
# (creating the folder first if it does not exist); the index is not written.
output_dir = 'Resources'
os.makedirs(output_dir, exist_ok=True)
Temperature_df.to_csv(f'{output_dir}/oregon_temps.csv', index=False)

In [13]:
# Build the connection URL for the RDS-hosted Postgres database and create the
# SQLAlchemy engine used to write the DataFrame.
# The password is percent-encoded before being placed in the URL: characters such as
# "@", ":", "/" or "%" would otherwise be read as URL delimiters and break parsing.
# NOTE(review): assumes db_password in config.py is the raw, un-encoded password — confirm.
encoded_password = quote_plus(db_password)
db_string = f"postgresql://postgres:{encoded_password}@fires.crlyg1rjxxj2.us-west-2.rds.amazonaws.com:5432/postgres"
engine = create_engine(db_string)

In [14]:
# Write the temperature data to the "temperatures" Postgres table,
# replacing the table if it already exists; the DataFrame index is not stored.
Temperature_df.to_sql("temperatures", con=engine, if_exists="replace", index=False)