# Introduction: It's about time 
Thank you for checking out the code for: 

> Hogan, Bernie (2022, forthcoming) _From Social Science to Data Science_. Sage Publications. 

This notebook contains the code from the book, along with the headers and additional author notes that are not in the book as a way to help navigate the code. You can run this notebook in a browser by clicking the buttons below. 
    
The version that is uploaded to GitHub should have all the results pasted, but the best way to follow along is to clear all outputs and then start afresh. To do this in Jupyter go the menu and select "Kernel -> Restart Kernel and Clear all Outputs...". To do this on Google Colab go to the menu and select "Edit -> Clear all outputs".
    
The most up-to-date version of this code can be found at https://www.github.com/berniehogan/fsstds 

Additional resources and teaching materials can be found on Sage's forthcoming website for this book. 

All code for the book and derivative code on the book's repository is released open source under the  MIT license. 
    

[![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/berniehogan/fsstds/main?filepath=chapters%2FCh.12.TimeSeries.ipynb)[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berniehogan/fsstds/blob/main/chapters/Ch.12.TimeSeries.ipynb)

# Dates and the datetime module 

In [1]:
from datetime import datetime
import calendar

In [2]:
now = datetime.now()

print(now)
print(now.timetuple())
print(calendar.timegm(now.timetuple()))

2022-05-10 07:58:44.518171
time.struct_time(tm_year=2022, tm_mon=5, tm_mday=10, tm_hour=7, tm_min=58, tm_sec=44, tm_wday=1, tm_yday=130, tm_isdst=-1)
1652169524


## Parsing Time

In [3]:
# Taken from https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object.html

tweet = {
 "created_at":"Thu Apr 06 15:24:15 +0000 2017",
 "id": 850006245121695744,
 "id_str": "850006245121695744",
 "text": "1/ Today we’re sharing our vision for the future of the Twitter API platform!nhttps://t.co/XweGngmxlP",
 "user": {},
 "entities": {}
}

print(type(tweet['created_at']),tweet['created_at'])

<class 'str'> Thu Apr 06 15:24:15 +0000 2017


In [4]:
import pandas as pd

In [5]:
pd.to_datetime(tweet["created_at"])

Timestamp('2017-04-06 15:24:15+0000', tz='UTC')

In [6]:
tweet_date = "Thu Apr 06 15:24:15 +0000 2017"
print(f"The original date was formatted as: {tweet_date}")

tweet_stamp = datetime.strptime(tweet_date, "%a %b %d %H:%M:%S %z %Y")
print("We can format it differenly, such as:", 
      datetime.strftime(tweet_stamp, '%Y--%d--%b--%a %H and %M and %S %z'))

The original date was formatted as: Thu Apr 06 15:24:15 +0000 2017
We can format it differenly, such as: 2017--06--Apr--Thu 15 and 24 and 15 +0000


In [7]:
print(tweet_stamp.year,tweet_stamp.month,tweet_stamp.day,sep="--")

2017--4--6


## Timezones 

In [8]:
from datetime import timezone 

In [9]:
try:
    print(datetime.now() - tweet_stamp)
except:
    print("Cannot subtract time zone aware from non aware data")
    print(datetime.now(timezone.utc) - tweet_stamp)

Cannot subtract time zone aware from non aware data
1859 days, 15:35:26.902178


## Localisation and time 

In [10]:
import locale 
from calendar import day_abbr 

In [64]:
locale.setlocale(locale.LC_ALL, 'fr_FR.UTF-8') # French
print([day_abbr[i] for i in range(7)])

locale.setlocale(locale.LC_ALL, '') # Local (English)
print([day_abbr[i] for i in range(7)])

['Lun', 'Mar', 'Mer', 'Jeu', 'Ven', 'Sam', 'Dim']
['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']


# Revisiting the Movie Stack Exchange data

In [12]:
import pickle
from pathlib import Path

In [13]:
data_dir = Path().cwd().parent / "data"
pickle_file = data_dir / "movies_stack_df.pkl"

if pickle_file.exists():
    stack_df = pickle.load(open(pickle_file ,'rb'))
    print(len(stack_df))
else:
    print("Please download and clean the Stack_df data as done in ",
          "Chapter 10. See https://archive.org/download/stackexchange")

61184


In [14]:
print(stack_df["CreationDate"][0], type(stack_df["CreationDate"][0]))

2011-11-30 19:15:54.070000 <class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [15]:
import matplotlib.pyplot as plt 
import seaborn as sns 
%config InlineBackend.figure_format = 'svg'

In [None]:
sns.histplot(stack_df["CreationDate"])

plt.show()

# Pandas Datetime feature extraction

In [17]:
result = stack_df["CreationDate"].dt.year
display(result.sample(5,random_state=12345))

Id
74465     2017
105929    2019
89621     2018
55084     2016
23203     2014
Name: CreationDate, dtype: int64

In [None]:
stack_df["CreationDate"].dt.year.hist();

plt.show()

In [None]:
month_vals = stack_df["CreationDate"].dt.month.value_counts().sort_index()
month_vals.index = [calendar.month_abbr[x] for x in month_vals.index]

month_vals.plot(kind="bar");

plt.show()

In [None]:
hour_vals = stack_df["CreationDate"].dt.hour.value_counts().sort_index()
hour_vals.index = [ f"{x}h" for x in hour_vals.index]
hour_vals.plot(kind="bar")

plt.show()

In [21]:
stack_df["CreationDate"].dt.strftime("%d %b").sample(5,random_state=12345)

Id
74465     08 Jun
105929    27 Dec
89621     11 Jun
55084     06 Jun
23203     19 Jul
Name: CreationDate, dtype: object

# Resampling as a way to group by time period

In [None]:
stack_df.resample('M', on="CreationDate").sum().head(5).

In [None]:
display(stack_df[["CreationDate","Score"]]
        .resample('Y',on="CreationDate")
        .max().head())

In [None]:
fig, (ax1,ax2,ax3) = plt.subplots(3,1)

sns.histplot(stack_df["CreationDate"],
             ax=ax1).set_xlabel(None)#.set_ylabel(None)
ax1.set_ylabel(None) 

stack_df[["CreationDate"]].resample('Y', 
    on="CreationDate").count().plot(ax=ax2,legend=False).set_xlabel(None)
ax2.set_ylabel("Count") 

stack_df[["CreationDate"]].resample('M',
    on="CreationDate").count().plot(ax=ax3,legend=False)

plt.tight_layout()

plt.show()

# Slicing and the Datetimeindex in Pandas

In [None]:
time_mask = (stack_df["CreationDate"]>= '2015-03-14') & \
            (stack_df["CreationDate"]<  '2015-03-15')

stack_df.iloc[:,:5][time_mask].head()

In [45]:
# Counting posts within a two-year range
mask2 = (stack_df["CreationDate"] >= '2015') & \
        (stack_df["CreationDate"] <  '2017')

print(len(stack_df[mask2]))

16719


In [None]:
time_df = stack_df.copy()
time_df['Id'] = stack_df.index
time_df.set_index('CreationDate',inplace=True)
time_df[["Id","CleanBody","Score"]].sample(5, random_state=1984)

In [53]:
print(len(time_df.loc["2015-06"]),
      len(time_df.loc["2015-07"]),
      len(time_df.loc["2018"]),
      len(time_df.sort_index().loc["2018-03-14":"2018-03-15"]),sep="\n")

607
600
6496
30


# Moving window in data 

In [None]:
time_df["Score7d"] = time_df["Score"].rolling(7, center=True).mean()
time_df[["Score","Score7d"]].head(8).style.format({"Score7d":"{:.2f}"})

In [None]:
time_df_mnth = time_df[["Score","CommentCount"]].resample('M').mean()
display(time_df_mnth.head(5))

## Missing data in a rolling window 

In [57]:
daily_df = time_df[["Score"]].resample('D').mean()

daily_df["Score7d"] = daily_df["Score"].rolling(7, center=True).mean()
daily_df["Score30d"] = daily_df["Score"].rolling(30, center=True).mean()
daily_df["Score60d"] = daily_df["Score"].rolling(60, center=True).mean()

In [None]:
fig, (ax1,ax2,ax3) = plt.subplots(3,1)

daily_df["Score7d"].plot(ax=ax1, legend=True, sharex=ax3)
daily_df["Score30d"].plot(ax=ax2, legend=True, sharex=ax3)
daily_df["Score60d"].plot(ax=ax3, legend=True)

ax3.set_xlabel("Average score of posts over time")
plt.tight_layout()

plt.show()

In [None]:
(daily_df["Score"]
 .rolling(60, center=True, min_periods=55)
 .mean()
 .plot(legend=True,ylabel="Average post score", xlabel="year"));

plt.show()

# Summary 

# Further explorations 

# Extensions and reflections 