In [4]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


In [6]:
# Load the UFO sightings CSV straight from GitHub and list its column names
ufo = pd.read_csv(
    'https://raw.githubusercontent.com/austinlasseter/pandas_visualization/master/data/ufo.csv'
)
ufo.columns

Index(['City', 'Colors Reported', 'Shape Reported', 'State', 'Time'], dtype='object')

In [None]:
# Summary statistics for the raw Time column
ufo.Time.describe()

In [None]:
# Which dtype did pandas assign to the Time column?
ufo.Time.dtype

In [None]:
# Parse the Time strings into datetime values, then re-check the dtype
ufo['Time'] = ufo['Time'].pipe(pd.to_datetime)
ufo['Time'].dtype  # expected to be a datetime dtype after the conversion

In [None]:
# Keep just the calendar date (no time of day) in its own column
ufo = ufo.assign(Date=ufo['Time'].dt.date)
ufo.head()

In [None]:
# Break the timestamp into separate calendar components, one column each
ufo = ufo.assign(
    Year=lambda frame: frame['Time'].dt.year,
    Month=lambda frame: frame['Time'].dt.month,
    Day=lambda frame: frame['Time'].dt.day,
    Hour=lambda frame: frame['Time'].dt.hour,
    Weekday=lambda frame: frame['Time'].dt.weekday,
)
ufo.head()

In [None]:
# Datetime "math": subtracting two timestamps gives a Timedelta,
# and .days is its whole-day component
(ufo['Time'].max() - ufo['Time'].min()).days

In [None]:
# Create a labelled weekday variable.
# pandas' .dt.weekday numbers the days Monday=0 ... Sunday=6, so the labels
# must start at Monday (they previously started at Sunday, which shifted every
# label by one day). The numeric prefix keeps the labels in calendar order
# whenever they are sorted as strings.
ufo['weekday_name'] = ufo['Weekday'].map({0: '0_Monday',
                                          1: '1_Tuesday',
                                          2: '2_Wednesday',
                                          3: '3_Thursday',
                                          4: '4_Friday',
                                          5: '5_Saturday',
                                          6: '6_Sunday'})
ufo.head()

In [None]:
# Set the Time variable as the dataframe index (This will make plotting timelines easier)
# ufo=ufo.set_index('Time')

In [None]:
# The five dates with the most sightings
# (value_counts already sorts from most to least frequent by default)
ufo['Date'].value_counts().head(5)

In [None]:
# Count the sightings on each date: the group sizes form a Series indexed by
# Date and named 'sightings' (no temporary helper column on ufo is needed)
dates = ufo.groupby('Date').size().rename('sightings')
dates.sort_values(ascending=False).head()

In [None]:
# Turn the per-date Series into a two-column dataframe: Date, sightings
dates_df = dates.to_frame().reset_index()
dates_df.sort_values(by='sightings', ascending=False).head()

In [None]:
# Attach each date's sighting count to every row of the original dataframe
ufo2 = ufo.merge(dates_df, how='left', on='Date')
ufo2.head()

In [None]:
# Trend over years (also sets a wide default figure size via seaborn's rc settings)
sns.set(rc={'figure.figsize': (18, 6)})
sns.lineplot(data=ufo2, x='Year', y='sightings');

In [None]:
# Trend over weekday (ci=None turns off the confidence band)
sns.lineplot(data=ufo2, x='weekday_name', y='sightings', ci=None);

In [None]:
# Trend over hour of the day
sns.lineplot(data=ufo2, x='Hour', y='sightings');

In [None]:
# Trend over month of the year
sns.lineplot(data=ufo2, x='Month', y='sightings');

In [None]:
# Shorten the color categories: keep three named colors and lump every other
# value (including missing ones) into 'other'
print(ufo['Colors Reported'].value_counts().head(5))
ufo['color'] = (
    ufo['Colors Reported']
    .map({'ORANGE': 'orange', 'RED': 'red', 'GREEN': 'green'})
    .fillna('other')
)
ufo.head()

In [None]:
# Alternative: create a 0/1 indicator column for each specific color.
# (numpy is imported as np together with the other imports at the top of the
# notebook rather than in the middle of the analysis.)
print(ufo['Colors Reported'].value_counts().head(5))
ufo['orange'] = np.where(ufo['Colors Reported'] == 'ORANGE', 1, 0)
ufo['red'] = np.where(ufo['Colors Reported'] == 'RED', 1, 0)
ufo['green'] = np.where(ufo['Colors Reported'] == 'GREEN', 1, 0)
# A row counts as "other" when none of the three indicators above is set
ufo['other_color'] = np.where(ufo['orange'] + ufo['red'] + ufo['green'] == 0, 1, 0)
ufo.head()

In [None]:
# Collapse the dataframe to one row per (Year, color) holding the number of sightings
ufo['sighting'] = 1  # constant counter column; summing it counts rows
dates = ufo.groupby(['Year', 'color'])['sighting'].sum()
dates_df = dates.reset_index()
# Spot-check a single year. Only a cell's last expression is displayed, so the
# former mid-cell dates_df.head() produced no output and has been removed.
dates_df[dates_df['Year'] == 2009]

In [None]:
# Keep only years strictly between 2000 and 2015 and drop the catch-all
# 'other' color, which makes the chart easier to read
dates_df2 = dates_df.query("2000 < Year < 2015 and color != 'other'")

In [None]:
# One line per color: yearly sightings from the restricted dataframe
sns.set(rc={'figure.figsize': (18, 6)})
sns.lineplot(data=dates_df2, x="Year", y="sighting", hue="color");

### All over again, with shapes

In [None]:
# Collapse the dataframe to one row per (Year, Shape Reported) holding the number of sightings
ufo['sighting'] = 1  # constant counter column; summing it counts rows
dates = ufo.groupby(['Year', 'Shape Reported'])['sighting'].sum()
dates_df = dates.reset_index()
# Spot-check a single year. Only a cell's last expression is displayed, so the
# former mid-cell dates_df.head() produced no output and has been removed.
dates_df[dates_df['Year'] == 1995].head()

In [None]:
# What are the most frequent shapes?
# dates_df holds one row per (Year, Shape Reported), so counting its rows would
# only tell us in how many years a shape appeared. Sum the per-year sighting
# counts instead to get the total number of sightings for each shape.
dates_df.groupby('Shape Reported')['sighting'].sum().sort_values(ascending=False).head()

In [None]:
# Read data/ufo.csv from the parent of the current working directory
test = pd.read_csv(Path.cwd().parent / 'data' / 'ufo.csv')
test.columns

In [None]:
test['Shape Reported'].value_counts().sort_values(ascending=False).head()

In [None]:
# Shorten the list of shapes.
def map_shapes(row):
    """Return `row` if it is one of the five kept shapes, otherwise 'OTHER'.

    Despite the parameter name, `row` is a single value (this function is
    applied element-wise to a column). The comparison is exact and
    case-sensitive; anything not in the list, including missing values,
    becomes 'OTHER'.
    """
    kept_shapes = ('LIGHT', 'CIRCLE', 'FIREBALL', 'TRIANGLE', 'SPHERE')
    return row if row in kept_shapes else 'OTHER'

# Bucket every reported shape into the shortened list, then count each bucket
test['Shape'] = test['Shape Reported'].map(map_shapes)
test['Shape'].value_counts()

In [None]:
# Constant counter column; summing it later counts sightings
test['sighting'] = 1
# Convert test's own Time strings to the datetime format. This previously read
# ufo['Time'], which tied the cell to whatever state another dataframe was in
# and relied on the two frames sharing the same row index.
test['Time'] = pd.to_datetime(test['Time'])
test['Year'] = test['Time'].dt.year
test.head()

In [None]:
# Keep only sightings from years after 2000, then show the (rows, columns) left
test = test.loc[test['Year'].gt(2000)]
test.shape
# test.to_csv('ufo_sightings.csv', index=False)

In [None]:
test.head()

In [None]:
# Total sightings for every (Year, Shape) pair; both keys end up in the index
dates = test.groupby(by=['Year', 'Shape'])['sighting'].sum()
dates_df = dates.to_frame()
dates_df.head()

In [None]:
# Move Year and Shape out of the index and back into ordinary columns.
# Rebuilding from `dates` (rather than resetting dates_df in place) keeps this
# cell safe to re-run: calling reset_index() a second time on the already-reset
# frame would add a stray 'index' column.
dates_df = dates.reset_index()

dates_df.head()

In [None]:
# Yearly sightings for two shapes on one chart. Each line gets a label and a
# legend entry; without them the two lines cannot be told apart.
dates_df[dates_df['Shape']=='CIRCLE'].groupby(['Year'])['sighting'].mean().plot(label='CIRCLE', legend=True)
dates_df[dates_df['Shape']=='FIREBALL'].groupby(['Year'])['sighting'].mean().plot(label='FIREBALL', legend=True)

In [None]:
# Same chart for every shape bucket; label each line so the legend identifies it
for shape in ['LIGHT', 'CIRCLE', 'FIREBALL', 'TRIANGLE', 'SPHERE', 'OTHER']:
    dates_df[dates_df['Shape']==shape].groupby(['Year'])['sighting'].mean().plot(label=shape, legend=True)

In [None]:
# The same comparison with seaborn: one line per shape
sns.set(rc={'figure.figsize': (18, 6)})
sns.lineplot(data=dates_df, x="Year", y="sighting", hue="Shape");

## Summary

In [None]:
# Recap of the whole workflow in a single cell.
# Jupyter only displays a cell's last expression, so the bare expressions
# below are reminders of the earlier steps rather than visible output.
ufo = pd.read_csv(Path.joinpath(Path.cwd().parent, 'data', 'ufo.csv'))
# convert a string to the datetime format
ufo['Time'] = pd.to_datetime(ufo['Time'])
ufo['Time'].dtype
# create new time variables
ufo['Date'] = ufo['Time'].dt.date
ufo['Year'] = ufo['Time'].dt.year
ufo['Month'] = ufo['Time'].dt.month
ufo['Day'] = ufo['Time'].dt.day
ufo['Hour'] = ufo['Time'].dt.hour
ufo['Weekday'] = ufo['Time'].dt.weekday
# Label the weekdays (pandas numbers them Monday=0 ... Sunday=6). The weekday
# plot below needs this column, and it no longer exists once the CSV has been
# re-read above.
ufo['weekday_name'] = ufo['Weekday'].map({0: '0_Monday',
                                          1: '1_Tuesday',
                                          2: '2_Wednesday',
                                          3: '3_Thursday',
                                          4: '4_Friday',
                                          5: '5_Saturday',
                                          6: '6_Sunday'})
# also allows you to do datetime "math"
(ufo.Time.max() - ufo.Time.min()).days
# which dates were the most frequent ufo sightings?
ufo['Date'].value_counts(ascending=False).head(5)
# Create a variable to count the number of sightings by date
ufo['sightings'] = 1
dates = ufo.groupby('Date')['sightings'].sum()
ufo = ufo.drop('sightings', axis=1)
# Merge the original and grouped datasets
dates_df = pd.DataFrame(dates)
dates_df = dates_df.reset_index()
ufo2 = pd.merge(ufo, dates_df, on='Date', how='left')
# Line charts: start a new figure for each one, otherwise every lineplot in
# this cell is drawn onto the same axes
sns.set(rc={'figure.figsize': (18, 6)})
plt.figure()
sns.lineplot(x='Year', y='sightings', data=ufo2);
plt.figure()
sns.lineplot(x='weekday_name', y='sightings', data=ufo2, ci=None);
plt.figure()
sns.lineplot(x='Hour', y='sightings', data=ufo2);
plt.figure()
sns.lineplot(x='Month', y='sightings', data=ufo2);
# Shorten the color categories
print(ufo['Colors Reported'].value_counts().head(5))
ufo['color'] = 'other'
ufo.loc[ufo['Colors Reported'] == 'ORANGE', 'color'] = 'orange'
ufo.loc[ufo['Colors Reported'] == 'RED', 'color'] = 'red'
ufo.loc[ufo['Colors Reported'] == 'GREEN', 'color'] = 'green'
# Collapse the dataframe on color and year
ufo['sighting'] = 1
dates = ufo.groupby(['Year', 'color'])['sighting'].sum()
dates_df = pd.DataFrame(dates)
dates_df = dates_df.reset_index()
dates_df.head()
dates_df[dates_df['Year'] == 2009]
# Restrict the dataset (makes the chart easier to read). Recomputed here so the
# plot uses the frame built in this cell instead of a dates_df2 left over from
# an earlier cell.
dates_df2 = dates_df.loc[(dates_df['Year'] > 2000) & (dates_df['Year'] < 2015) & (dates_df['color'] != 'other')]
# Plot the lines
sns.set(rc={'figure.figsize': (18, 6)})
plt.figure()
sns.lineplot(x="Year", y="sighting", hue="color", data=dates_df2);