# Time Series Exploratory Notebook

In [None]:
# load dependencies
import matplotlib.pyplot as plt
import statsmodels
import pandas as pd

## Historic Data

### Import data

In [None]:
# historic dataset: folder and file name of the raw export, read as a
# semicolon-separated CSV
path = "../data/raw/"
file = "100014.csv"
df = pd.read_csv(f"{path}{file}", sep=";")

### Dataset info

In [None]:
# Print the column names, non-null counts and dtypes of the raw frame
df.info()

**Description:** There are 11 columns with information in the dataset. Most of them are of dtype object. Two columns, `Anzahl frei` and `Total Plätze`, are of type int64, which is a good sign since we only expect numeric values there.

### Columns with na values

In [None]:
# Per column: True if at least one value in that column is missing
df.isna().any()

**Description:** There are no missing values in any columns of the historic dataset. All entries seem to be complete.

### Nr of garages in the dataset

In [None]:
# One row per distinct (id, id2, title) combination, renumbered from 0
garage_keys = ["id", "id2", "title"]
df[garage_keys].drop_duplicates().reset_index(drop=True)

**Description:** There are a total of 16 distinct garages in the dataset. 

### Nr of entries per garage

In [None]:
# Number of non-null `published` values recorded for each garage title
df.groupby("title")[["published"]].count()

**Description:** Only two garages, "Parkhaus Kunstmuseum" and "Parkhaus City", contain fewer entries than the rest.

### Nr unique of `Total Parkplätze`

In [None]:
# Number of distinct `total` values observed for each garage title
df.groupby("title")[["total"]].nunique()

**Description:** Most garages have a constant number of parking spots for the duration of the dataset. However, there are also two garages with 2 different totals: "Parkhaus Bad. Bahnhof" and "Parkhaus Claramatte". In a next step, we want to investigate the reason for this.

In [None]:
def show_total_spots_timeline(df, name, last_n_entries=None):
    """
    Create a line plot of the total number of parking spots for a single garage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``title``, ``published`` and ``total``.
        The frame passed in is not modified.
    name : str
        Value of the ``title`` column that selects the garage to plot.
    last_n_entries : int, optional
        If given (and non-zero), only the most recent ``last_n_entries`` rows
        of that garage are plotted; otherwise the full history is shown.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Only the selected rows/columns are copied, not the whole input frame.
    timeline = df.loc[df["title"] == name, ["published", "total"]].copy()

    # Parse the timestamps *before* sorting: ordering the raw strings is
    # lexicographic, which is only chronological as long as every timestamp
    # is written in the same format and UTC offset.
    timeline["published"] = pd.to_datetime(timeline["published"])
    timeline = timeline.sort_values(by="published").reset_index(drop=True)

    if last_n_entries:
        # The frame is sorted ascending, so the tail holds the latest entries.
        timeline = timeline.tail(last_n_entries)

    plt.figure(figsize=(16, 5))
    plt.title(f"Total nr of parking spots for: {name}")
    plt.plot(timeline["published"], timeline["total"])
    plt.xlabel("Timeline")
    plt.ylabel("Total spots")
    plt.show()

### Timeline for "Parkhaus Bad. Bahnhof"

In [None]:
# First the complete history (no limit), then only the last 7*24 entries
for n_entries in (None, 7 * 24):
    show_total_spots_timeline(df, name="Parkhaus Bad. Bahnhof", last_n_entries=n_entries)

### Timeline for "Parkhaus Claramatte"

In [None]:
# First the complete history (no limit), then only the last 7*24 entries
for n_entries in (None, 7 * 24):
    show_total_spots_timeline(df, name="Parkhaus Claramatte", last_n_entries=n_entries)

The parking garages can sometimes change the number of public parking spaces. The exact reason is unknown. The following link has more information about it.

https://data.bs.ch/explore/dataset/100014/comments/?disjunctive.title&sort=published&dataChart=eyJxdWVyaWVzIjpbeyJjaGFydHMiOlt7InR5cGUiOiJsaW5lIiwiZnVuYyI6Ik1JTiIsInlBeGlzIjoiZnJlZSIsInNjaWVudGlmaWNEaXNwbGF5Ijp0cnVlLCJjb2xvciI6InJhbmdlLUFjY2VudCJ9XSwieEF4aXMiOiJwdWJsaXNoZWQiLCJtYXhwb2ludHMiOiIiLCJ0aW1lc2NhbGUiOiJob3VyIiwic29ydCI6IiIsInNlcmllc0JyZWFrZG93biI6InRpdGxlIiwiY29uZmlnIjp7ImRhdGFzZXQiOiIxMDAwMTQiLCJvcHRpb25zIjp7ImRpc2p1bmN0aXZlLnRpdGxlIjp0cnVlLCJzb3J0IjoicHVibGlzaGVkIn19fV0sImRpc3BsYXlMZWdlbmQiOnRydWUsImFsaWduTW9udGgiOnRydWV9 



### Delete unused columns

In [None]:
# Display the raw frame to see which columns are available
df

In [None]:
# Work on an independent copy that holds only the columns used below
used_columns = ['published', 'free', 'id2', 'total', 'geo_point_2d']
df_modified = df[used_columns].copy()

### Datatypes

In [None]:
# Parse the publication timestamps. Bracket assignment is used so pandas
# always treats the target as a column, never as a plain instance attribute.
df_modified['published'] = pd.to_datetime(df_modified['published'])
print('date check')
# Store the garage identifier as a string
df_modified['id2'] = df_modified['id2'].astype(str)
print('string check')
# Split the "a,b" coordinate string once and reuse both parts:
# first part -> latitude, second part -> longitude.
# NOTE: a value without a comma now yields NaN for the longitude instead of
# raising an IndexError; the null counts of df_modified.info() will show it.
geo_parts = df_modified['geo_point_2d'].astype(str).str.strip().str.split(',')
df_modified['latitude'] = geo_parts.str[0].apply(float)

df_modified['longitude'] = geo_parts.str[1].apply(float)

In [None]:
# Print columns, non-null counts and dtypes of the reduced frame, then display it
df_modified.info()
df_modified

### repeated lines

In [None]:
# Count the rows that are exact duplicates (all columns equal) of an earlier row
df_modified.duplicated().sum()

There are no duplicated rows

### checking data with logical conditions 

**All dates should be consecutive**

In [None]:
# Number of rows recorded for each distinct `published` timestamp
date_number = df_modified.groupby('published').size()
date_number

In [None]:
# Display the per-timestamp row counts again
date_number

In [None]:
# Number of distinct timestamps. Deliberately not named `all`: that name would
# shadow the Python builtin all() for the rest of the notebook session.
n_timestamps = date_number.count()
# Timestamps that have exactly 16 resp. exactly 15 rows
a16 = (date_number == 16).sum()
a15 = (date_number == 15).sum()
# True only if every timestamp has either 15 or 16 rows
print((a16 + a15) == n_timestamps)

For every date we have 15 or 16 entries. This means that there are enough entries for every date.

In [None]:
# Plot every distinct timestamp (x) against its running position (y).
# A gap in the timestamps would show up as a horizontal jump in the point cloud.
leng = len(date_number.index)
# Pass the index itself (not wrapped in a list) so x and y are both 1-D
# sequences of the same length.
plt.scatter(x=date_number.index, y=range(leng))

The dates are consecutive

**Number of free parking spots should always be less or equal to the total number**

In [None]:
# Row count for every (id2, total) combination that occurs in the data
df_modified.groupby(['id2', 'total']).size()

Two garages don't have data for the whole period (City and Kunstmuseum). The others have data for the whole period. The total number of places stays the same, except for the two garages we analysed above.

In [None]:
# Collect every row that reports more free spots than total spots.
# One vectorised comparison replaces the per-row .loc lookups in a Python
# loop, which become very slow on a large frame.
surplus_rows = df_modified.loc[
    df_modified['free'] > df_modified['total'], ['free', 'total']
]
# Same structure as before: {row index: [free, total]}
a = {
    idx: [free, total]
    for idx, free, total in surplus_rows.itertuples(index=True, name=None)
}
print(len(a))

In [None]:
# For each flagged row in `a`: by how many spots `free` exceeds `total`
too_much_free_spots = [free - total for free, total in a.values()]
plt.boxplot(too_much_free_spots)
plt.title('surplus of free parking spaces')
plt.ylabel('how much more parking spots than total available')
plt.show()

There are rows that have more free parking spots than total parking spots.

**Every garage should have the same coordinates in all of its rows**

In [None]:
# Row count for every (id2, geo_point_2d) combination; an id2 appearing in more
# than one row of this result would mean that garage has several coordinates
df_modified.groupby(['id2', 'geo_point_2d']).size()

There is only one coordinate for every garage.

### Timeseries for every garage

In [None]:
# n_weeks is approximately the number of weeks to be shown
n_weeks = 1
garages = df_modified.id2.unique()
for i in garages:
    # Filter the cleaned frame, not the raw `df`: the ids in `garages` come
    # from df_modified (converted to str), and its `published` column holds
    # parsed datetimes, so the x axis is a real time axis instead of one
    # categorical tick per timestamp string.
    g1 = df_modified[df_modified['id2'] == i].sort_values(by='published')
    # Rows are sorted ascending, so the last 188 * n_weeks rows are the most
    # recent ones.
    g1 = g1.iloc[-188 * n_weeks:]
    plt.plot(g1['published'], g1['free'])
    print('check')
    plt.title('Timeseries for ' + i)
    plt.xlabel('date')
    plt.ylabel('Free parking spots')
    plt.show()