# Import the libraries we're going to use

In [None]:
import collections
import math
import urllib
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Render the graphs inline in the notebook rather than in a separate window
%matplotlib inline

# Weather stations

In [None]:
# Download the GHCN-Daily station inventory and save it locally as
# 'stations.txt'.
# NOTE: `urllib.request` has to be imported explicitly; a bare `import urllib`
# does not load the submodule, so this call only works by accident when some
# other library has already imported it.
urllib.request.urlretrieve(
    'https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt',
    'stations.txt',
)
                             
  
# FORMAT OF "ghcnd-stations.txt"
# ------------------------------
# Variable   Columns   Type
# ------------------------------
# ID            1-11   Character
# LATITUDE     13-20   Real
# LONGITUDE    22-30   Real
# ELEVATION    32-37   Real
# STATE        39-40   Character
# NAME         42-71   Character
# GSN FLAG     73-75   Character
# HCN/CRN FLAG 77-79   Character
# WMO ID       81-85   Character
# ------------------------------

In [None]:
# Parse the fixed-width stations.txt file into a structured array.
# Each width folds the single separator column into the field that follows it
# (e.g. LATITUDE occupies columns 13-20, so its slice is 9 wide starting at
# column 12); autostrip then removes the padding.
allstations = np.genfromtxt(
    'stations.txt',
    delimiter=[11, 9, 10, 7, 3, 31, 4, 4, 6],
    usecols=[0, 1, 2, 3, 4, 5, 6, 7, 8],
    names=['id', 'latitude', 'longitude', 'elevation', 'state', 'name', 'gsn', 'hcn', 'wmo'],
    dtype=['U11', 'd', 'd', 'd', 'U3', 'U31', 'U4', 'U4', 'U6'],
    autostrip=True,
    # genfromtxt treats '#' as the start of a comment by default, which would
    # silently cut off any record whose station name contains a '#'.
    comments=None,
)
allstations

In [None]:
# Number of station records parsed from stations.txt
len(allstations)

In [None]:
# Scatter every station on a wide figure: longitude on x, latitude on y.
fig = plt.figure(figsize=(15, 8))
fig.gca().scatter(allstations['longitude'], allstations['latitude'], s=0.8)
plt.show()

In [None]:
# Keep only the stations inside a rough bounding box
# (115 < longitude < 155 and -50 < latitude < -7), then show how many remain.
stn_lon = allstations['longitude']
stn_lat = allstations['latitude']
in_bounding_box = (stn_lon > 115) & (stn_lon < 155) & (stn_lat < -7) & (stn_lat > -50)
aust = allstations[in_bounding_box]
print(len(aust))
aust

In [None]:
# Scatter only the stations selected by the bounding box.
fig = plt.figure(figsize=(9, 8))
fig.gca().scatter(aust['longitude'], aust['latitude'], s=0.5)
plt.show()

# Plotting Data

In [None]:
# Three samples of 100 draws each from zero-mean normal distributions with
# standard deviations 1, 2 and 3.
data = [np.random.normal(loc=0, scale=sigma, size=100) for sigma in (1, 2, 3)]
data

In [None]:
# Box plot of the samples, one filled (patch_artist) box per sample.
# `vert=True` was dropped: vertical boxes are already the default, and recent
# Matplotlib releases deprecate the `vert` keyword in favour of `orientation`.
plt.boxplot(data, patch_artist=True)
plt.show()

# Pandas "DataFrames"

In [None]:
# Toy sales records stored column-wise: position i of every list describes
# the same sales person.
data = {
    'Company': ['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB', 'FB', 'GOOG'],
    'Person': ['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah', 'Bill', 'Cherry'],
    'Sales': [200, 210, 340, 124, 205, 250, 45, 30],
    'Costs': [190, 110, 300, 64, 105, 450, 20, 25],
}
data

In [None]:
# Build a DataFrame from the column-wise dict; each key becomes a column.
df = pd.DataFrame(data)
df

In [None]:
# Column names, dtypes and non-null counts
df.info()

In [None]:
# Summary statistics of the numeric columns
df.describe()

In [None]:
# Same summary, transposed so that each original column becomes a row
df.describe().transpose()

In [None]:
# Summary statistics computed separately for each company
df.groupby('Company').describe()

In [None]:
# Total sales and total costs per company, each as a Series indexed by company.
per_company = df.groupby('Company')
sales = per_company['Sales'].sum()
costs = per_company['Costs'].sum()
(costs, sales)

In [None]:
def _draw_company_totals(axis, totals, axis_label, colour):
    """Plot one per-company series on `axis`, with its y-axis styled in `colour`."""
    axis.set_ylim([250, 600])
    axis.plot(totals.index, totals, lw=2, color=colour)
    axis.set_ylabel(axis_label, fontsize=16, color=colour)
    for tick_label in axis.get_yticklabels():
        tick_label.set_color(colour)


# Sales on the left-hand y-axis, in blue.
fig, ax1 = plt.subplots(figsize=(10, 5))
_draw_company_totals(ax1, sales, "Sales", 'blue')

# Costs on a twin y-axis that shares the same x-axis, in red.
ax2 = ax1.twinx()
_draw_company_totals(ax2, costs, "Costs", 'red')

# Label each line directly on the plot, at the 'GOOG' x position.
ax1.text('GOOG', 470, "Sales", color='blue', fontsize=18)
ax2.text('GOOG', 340, "Costs", color='red', fontsize=18)


In [None]:
# This was not in the original demonstration: save the figure to disk
# as both a PDF and a PNG.
for filename in ('Cost vs Sales.pdf', 'Cost vs Sales.png'):
    fig.savefig(filename)

# Seaborn: a wrapper around Matplotlib

In [None]:
import seaborn as sns

In [None]:
# Box plot of the Sales values for each Company, using the 'coolwarm' palette.
sns.boxplot(data=df, x='Company', y='Sales', palette='coolwarm')

# Back to the Weather

In [None]:
# Stations whose name begins with "CANBERRA" (substring found at position 0).
name_starts_with_canberra = np.char.find(allstations['name'], "CANBERRA") == 0
allstations[name_starts_with_canberra]

In [None]:
# Download the daily records file for station ASN00070014 and save it locally
# as 'AIRPORT.dly'.
# NOTE: `urllib.request` has to be imported explicitly; a bare `import urllib`
# does not load the submodule.
urllib.request.urlretrieve(
    'https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/all/ASN00070014.dly',
    'AIRPORT.dly',
)

# FORMAT OF "*.dly" files
# ------------------------------
# Variable   Columns   Type
# ------------------------------
# ID            1-11   Character
# YEAR         12-15   Integer
# MONTH        16-17   Integer
# ELEMENT      18-21   Character
# VALUE1       22-26   Integer
# MFLAG1       27-27   Character
# QFLAG1       28-28   Character
# SFLAG1       29-29   Character
# VALUE2       30-34   Integer
# MFLAG2       35-35   Character
# QFLAG2       36-36   Character
# SFLAG2       37-37   Character
#   .           .          .
#   .           .          .
#   .           .          .
# VALUE31    262-266   Integer
# MFLAG31    267-267   Character
# QFLAG31    268-268   Character
# SFLAG31    269-269   Character
# ------------------------------


In [None]:
# Parse the fixed-width .dly file: four header fields followed by 31 daily
# groups of (value, mflag, qflag, sflag).
header_widths = [11, 4, 2, 4]
daily_widths = [5, 1, 1, 1] * 31
# The daily flags are not used, so after the header only every fourth column
# is kept: 4, 8, 12, 16, 20, 24, ...
value_columns = list(range(4, 4 * 32, 4))
# The daily observations are named day1, day2, day3, ...
day_names = [f'day{i}' for i in range(1, 32)]

w = np.genfromtxt(
    'AIRPORT.dly',
    delimiter=header_widths + daily_widths,
    usecols=[0, 1, 2, 3] + value_columns,
    names=['id', 'year', 'month', 'element'] + day_names,
    dtype=['U11', 'i', 'i', 'U4'] + ['d'] * 31,
    autostrip=True,
)
w

In [None]:
# Wrap the structured array in a DataFrame; each named field becomes a column.
df = pd.DataFrame(data=w)
df

In [None]:
    # "melt" the daily observations into one record per daily observation,
    # storing the column name in 'day'
    # NOTE(review): this cell is indented one level; IPython appears to tolerate
    # that, but a plain Python script would not parse it — confirm and dedent.
    id_columns = ['id', 'year', 'month', 'element']
    df = pd.melt(df, id_vars=id_columns, var_name='day', value_name='value')

    # Discard the rows whose value is the -9999 null marker.
    has_observation = df['value'] != -9999
    df = df[has_observation]
    df

In [None]:
# Keep only min/max temperatures, precipitation, and snow.
# The explicit .copy() makes the result an independent DataFrame, so the
# column assignments in the following cells (df['day'] = ..., df['date'] = ...)
# do not trigger pandas' SettingWithCopyWarning.
wanted_elements = ['TMAX', 'TMIN', 'PRCP', 'SNOW']
df = df[df['element'].isin(wanted_elements)].copy()
df

In [None]:
def _day_number(column_label):
    """Return the integer N from a 'dayN' column label."""
    return int(column_label[3:])


# Replace the 'dayN' labels with the integer day of the month.
df['day'] = df['day'].apply(_day_number)
df

In [None]:
# Check the column dtypes after converting 'day' to an integer
df.info()

In [None]:
# Build a datetime column from the year/month/day parts.
# errors='ignore' is deprecated in pandas and, on failure, hands back the
# unparsed input instead of dates; raise instead so a bad date is reported at
# this step. (Presumably the impossible dates, e.g. day 31 of a 30-day month,
# were already removed together with the -9999 rows — confirm.)
df['date'] = pd.to_datetime(df[['year', 'month', 'day']], errors='raise')
df

In [None]:
# Keep only the date, element, and value columns.
kept_columns = ['date', 'element', 'value']
df = df[kept_columns]
df

In [None]:
# Go from long back to wide (basically the opposite of melt): one row per
# date and one column per element, holding that element's value.
df = df.pivot(index='date', columns='element', values='value')
# Remove the leftover 'element' label from the column index.
df.columns.name = None
df

In [None]:
# Check the columns and dtypes of the pivoted table
df.info()

In [None]:
# Convert temperatures to degrees by dividing by 10
# (presumably the raw readings are in tenths of a degree — confirm against
# the dataset documentation).
for temperature_column in ('TMIN', 'TMAX'):
    df[temperature_column] = df[temperature_column] / 10.0
df

In [None]:
# Add a 'year' column holding the calendar year of each row's date index.
date_index = pd.DatetimeIndex(df.index)
df['year'] = date_index.year
df

In [None]:
# Select the rows for 2001 and only the TMAX, TMIN and PRCP columns.
df2001 = df.loc[df['year'] == 2001, ['TMAX', 'TMIN', 'PRCP']]
df2001

In [None]:
# Line plot of the 2001 columns, drawn on a wide figure created for it.
fig, axes = plt.subplots(figsize=(14, 6))
ax = sns.lineplot(data=df2001, ax=axes)

In [None]:
# One 50-bin histogram per column of df, with small axis tick labels
df.hist(figsize=(16, 8), bins=50, xlabelsize=8, ylabelsize=8)
plt.show()

In [None]:
# Grid of pairwise plots between the columns of df
sns.pairplot(df)
plt.show()

# That's all folks!