In [4]:
from __future__ import division
import itertools
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import pandas as pd
import math
from sklearn import metrics
from random import randint
from matplotlib import style
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

%matplotlib inline

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [None]:
def evaluate(actual, predictions, output=True):
    """Compute the MSE and RMSE of ``predictions`` against ``actual``.

    Parameters
    ----------
    actual : array-like
        Observed values.
    predictions : array-like
        Predicted values; passed together with ``actual`` to
        ``metrics.mean_squared_error``.
    output : bool, default True
        When True, both metrics are also printed.

    Returns
    -------
    tuple
        ``(mse, rmse)``. The tuple is returned in both modes so callers never
        have to special-case a ``None`` result.
    """
    mse = metrics.mean_squared_error(actual, predictions)
    rmse = math.sqrt(mse)

    if output:
        print('MSE:  {}'.format(mse))
        print('RMSE: {}'.format(rmse))

    return mse, rmse

def plot_and_eval(predictions, actual, metric_fmt='{:.2f}', linewidth=4):
    """Plot the train/test series with one or more forecasts and their errors.

    Reads the notebook-level ``train`` and ``test`` series and calls the
    notebook-level ``evaluate`` helper, so those must exist before calling.

    Parameters
    ----------
    predictions : Series or list/tuple of Series
        Forecast(s) to draw. Each needs a ``.name``, which is used in the label.
    actual : array-like
        Observed values each forecast is scored against.
    metric_fmt : str, default '{:.2f}'
        Format spec applied separately to the MSE and to the RMSE.
    linewidth : int, default 4
        Line width used for the forecast lines.

    With a single forecast the metrics go in the figure title; with several,
    each legend entry carries its own metrics.
    """
    if not isinstance(predictions, (list, tuple)):
        predictions = [predictions]

    def _with_metrics(name, mse, rmse):
        # Format the numbers on their own and only then join them to the name,
        # so a series name containing '{' or '}' cannot break str.format.
        return '{} -- MSE: {} RMSE: {}'.format(
            name, metric_fmt.format(mse), metric_fmt.format(rmse)
        )

    plt.figure(figsize=(16, 8))
    plt.plot(train, label='Train')
    plt.plot(test, label='Test')

    single = len(predictions) == 1
    for yhat in predictions:
        mse, rmse = evaluate(actual, yhat, output=False)
        name = f'{yhat.name}'
        annotated = _with_metrics(name, mse, rmse)
        plt.plot(yhat, label=name if single else annotated, linewidth=linewidth)
        if single:
            plt.title(annotated)

    plt.legend(loc='best')
    plt.show()

## Discrete data + probability


In [3]:
# Column names for the seven fields kept from each access-log line.
colnames = ['ip', 'timestamp', 'request_method', 'status', 'size',
            'destination', 'request_agent']

# The separator regex splits on whitespace that lies outside double quotes and
# outside square brackets, so the quoted request/agent strings and the
# bracketed timestamp each stay in one field. Fields 1 and 2 are skipped via
# usecols, and a quoted dash is read as missing.
df_orig = pd.read_csv('http://python.zach.lol/access.log',
                      engine='python',
                      header=None,
                      index_col=False,
                      names=colnames,
                      sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
                      na_values='"-"',
                      usecols=[0, 3, 4, 5, 6, 7, 8]
)

# Four hand-built records added on top of the downloaded log.
new = pd.DataFrame([["95.31.18.119", "[21/Apr/2019:10:02:41+0000]", 
                     "GET /api/v1/items/HTTP/1.1", 200, 1153005, np.nan, 
                     "python-requests/2.21.0"],
                    ["95.31.16.121", "[17/Apr/2019:19:36:41+0000]", 
                     "GET /api/v1/sales?page=79/HTTP/1.1", 301, 1005, np.nan, 
                     "python-requests/2.21.0"],
                    ["97.105.15.120", "[18/Apr/2019:19:42:41+0000]", 
                     "GET /api/v1/sales?page=79/HTTP/1.1", 301, 2560, np.nan, 
                     "python-requests/2.21.0"],
                    ["97.105.19.58", "[19/Apr/2019:19:42:41+0000]", 
                     "GET /api/v1/sales?page=79/HTTP/1.1", 200, 2056327, np.nan, 
                     "python-requests/2.21.0"]], columns=colnames)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append, keeps each frame's original row labels.
df = pd.concat([df_orig, new])

df.head()

Unnamed: 0,ip,timestamp,request_method,status,size,destination,request_agent
0,97.105.19.58,[16/Apr/2019:19:34:42 +0000],"""GET /api/v1/sales?page=81 HTTP/1.1""",200,512495,,"""python-requests/2.21.0"""
1,97.105.19.58,[16/Apr/2019:19:34:42 +0000],"""GET /api/v1/items HTTP/1.1""",200,3561,,"""python-requests/2.21.0"""
2,97.105.19.58,[16/Apr/2019:19:34:44 +0000],"""GET /api/v1/sales?page=82 HTTP/1.1""",200,510103,,"""python-requests/2.21.0"""
3,97.105.19.58,[16/Apr/2019:19:34:46 +0000],"""GET /api/v1/sales?page=83 HTTP/1.1""",200,510003,,"""python-requests/2.21.0"""
4,97.105.19.58,[16/Apr/2019:19:34:48 +0000],"""GET /api/v1/sales?page=84 HTTP/1.1""",200,511963,,"""python-requests/2.21.0"""


In [None]:
# Normalise the raw timestamp text before parsing:
#   1. drop the surrounding square brackets,
#   2. turn the first ':' (between the date and the hour) into a space,
#   3. force exactly one space before the UTC offset -- the hand-built rows
#      omit it ("10:02:41+0000"), and giving every row the same layout keeps
#      pd.to_datetime's format inference consistent across the column.
timestamp_text = (
    df['timestamp']
    .str.replace(r'(\[|\])', '', regex=True)
    .str.replace(':', ' ', n=1, regex=False)
    .str.replace(r'\s*([+-]\d{4})$', r' \1', regex=True)
)
df['timestamp'] = pd.to_datetime(timestamp_text)
df = df.set_index('timestamp')

df.head()

In [None]:
# Remove the literal double quotes left around the quoted log fields.
for quoted_field in ('request_method', 'request_agent', 'destination'):
    df[quoted_field] = df[quoted_field].str.replace('"', '')

# Drop the "?page=<n>" query so every page of an endpoint counts as one request.
df['request_method'] = df['request_method'].str.replace(
    r'\?page=[0-9]+', '', regex=True
)

df.head()

In [None]:
# Occurrences of every distinct request string (missing values included).
request_counts = df.request_method.value_counts(dropna=False)

# rename_axis + reset_index(name=...) names both columns explicitly instead of
# renaming whatever reset_index happens to produce: pandas < 2.0 yields
# ('index', 'request_method') while pandas >= 2.0 yields
# ('request_method', 'count'), so a column rename only works on one of them.
rm_df = request_counts.rename_axis('request_method').reset_index(name='rm_count')

# Share of each request, relative to the number of non-null requests.
rm_df2 = (
    request_counts.div(df.request_method.count())
    .rename_axis('request_method')
    .reset_index(name='rm_proba')
)

rm_df = rm_df.merge(rm_df2, on='request_method')

# Requests that make up less than 1% of the traffic.
rm_df[rm_df.rm_proba < .01]


In [None]:
print(len(rm_df))

print(rm_df.tail(10))

# Bar chart of request counts, with each bar annotated by its height.
plt.figure(figsize=(12, 4))
# NOTE(review): seaborn >= 0.12 deprecates `ci` in favour of `errorbar=None`;
# left as-is so the cell still runs on older seaborn.
splot = sns.barplot(data=rm_df, x = 'request_method', y = 'rm_count', ci = None)
for p in splot.patches:
    splot.annotate(format(p.get_height(), '.0f'), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', xytext = (0, 15), 
                   textcoords = 'offset points'
                   )

# Tick rotation is a per-axes setting, so apply it once rather than per bar.
plt.xticks(rotation='vertical')


In [None]:
# Training window: rows whose timestamp falls between the two bounds
# (inclusive), restricted to the modelling columns.
train = df.loc[
    '2019-04-16 19:34:42':'2019-04-17 12:55:14',
    ['ip', 'request_method', 'status', 'size', 'destination', 'request_agent'],
]


In [None]:
# Share of each request string within the training window, relative to the
# number of non-null requests. Both output columns are named explicitly so the
# result does not depend on how the installed pandas labels value_counts()
# output (the labels changed in pandas 2.0).
rm_df = (
    train.request_method.value_counts(dropna=False)
    .div(train.request_method.count())
    .rename_axis('request_method')
    .reset_index(name='rm_proba')
)


In [None]:
rm_df

In [None]:
# Attach the training-window probability of each row's request. The index is
# reset around the merge because merge() would otherwise discard the timestamps.
# Only the new column is zero-filled: a request never seen in training gets
# probability 0, while missing values in unrelated columns (e.g. destination)
# stay missing instead of being turned into the number 0.
df = (
    df.reset_index()
    .merge(rm_df, on=['request_method'], how='left')
    .fillna({'rm_proba': 0})
    .set_index('timestamp')
)
df.rm_proba.value_counts()


In [None]:
# Marginal probability of each request within the training window. The
# denominator is len(train) so it matches the counts in the numerator.
rm_probs = train.groupby('request_method').size().div(len(train))

# P(request | ip) = count(request, ip) / count(ip).
# The divisor must be indexed by ip for level='ip' to line up; dividing by the
# request-indexed rm_probs on that level matches nothing and yields NaN.
pair_counts = train.groupby(['request_method', 'ip']).size()
ip_counts = train.groupby('ip').size()

request_given_ip = (
    pair_counts.div(ip_counts, level='ip')
    .rename('proba_request_given_ip')
    .reset_index()
)

In [None]:
# Non-null 'status' entries per (request_method, ip) pair in the training data.
# count() keeps the source column's name, so the resulting 'status' column
# holds row counts, not HTTP status codes.
rm_status_count = pd.DataFrame(
    train.groupby(['request_method', 'ip'])['status'].count().reset_index()
)

# No keys are given, so pandas joins on the columns the two frames share.
rm_status = request_given_ip.merge(rm_status_count)


In [None]:
# rm_status is keyed by (request_method, ip); its 'status' column holds
# per-pair row counts rather than HTTP codes. Rename it out of the way and join
# on the real keys -- joining on 'status' would compare counts with status
# codes and match almost nothing.
pair_stats = rm_status.rename(columns={'status': 'rm_ip_count'})
added_columns = [c for c in pair_stats.columns if c not in ('request_method', 'ip')]

# Pairs never seen in training get 0 in the added columns only; missing values
# in the original columns are left untouched.
df = (
    df.reset_index()
    .merge(pair_stats, on=['request_method', 'ip'], how='left')
    .fillna({column: 0 for column in added_columns})
    .set_index('timestamp')
)


In [None]:
df.head()

In [None]:
# Conditional request probability (x) against overall request probability (y).
plt.scatter(x=df.proba_request_given_ip, y=df.rm_proba)


## Time series + EMA


In [None]:
# Load the space-delimited access file. header=None means the columns get
# integer labels (0, 1, 2, ...). This rebinds `df`, replacing the frame used in
# the previous section.
df = pd.read_csv('anonymized-curriculum-access.txt', header=None, sep=' ')

In [None]:
df.head()

In [None]:
# Join columns 0 and 1 with a space into a single string column
# (presumably the date and the time of day -- confirm against the file).
df['datetime'] = df[0] + ' ' + df[1]

In [None]:
# The two source columns are redundant once combined. In-place: re-running
# this cell on the already-modified frame raises because they are gone.
df.drop(columns=[0,1], inplace=True)

In [None]:
# Parse the combined string, use it as the index, then drop the helper column.
df.index = pd.to_datetime(df.datetime)
df.drop(columns='datetime', inplace=True)

In [None]:
# Give the remaining integer-labelled columns descriptive names.
df.rename(columns={2: 'address', 3: 'id', 4: 'cohort_id', 5:'ip_address'}, inplace=True)

In [None]:
# Replace every missing value in the frame with 0, in place.
df.fillna(0, inplace=True)

In [None]:
df.head()

In [None]:
# Keep only the rows of cohort 24, then chart how many rows fall in each week.
df24 = df.loc[df['cohort_id'] == 24]

df24['id'].resample('W').count().plot()

In [None]:
df.dtypes

In [None]:
# Aggregate to weekly counts first, then split on the weekly labels.
# Splitting the raw rows at 2018-09-29/30 and resampling each half separately
# puts a partial week labelled 2018-09-30 in both series: six days in train
# and one day in test ('W' bins end on Sunday, and 2018-09-30 is a Sunday).
weekly_counts = df24['id'].resample('W').agg('count')
train = weekly_counts[:'2018-09-29']
test = weekly_counts['2018-09-30':]
plt.plot(train)
plt.plot(test)


In [None]:
# Simple moving averages of the training series: a short window of 4 periods
# and a long window of 8 periods.
short_rolling, long_rolling = (
    train.rolling(window=periods).mean() for periods in (4, 8)
)

In [None]:
fig, ax = plt.subplots(figsize=(12,4))

# Raw training counts together with both simple moving averages.
for series, legend_text in (
    (train, 'id cnt'),
    (short_rolling, '4 SMA'),
    (long_rolling, '8 SMA'),
):
    ax.plot(series.index, series, label=legend_text)

ax.legend(loc='best')
ax.set_ylabel('id total')
# TODO: rotate the x tick labels / apply a custom datetime formatter.

In [None]:
# Exponential moving averages of the training series.
# adjust=False selects the recursive calculation mode.
# The spans mirror the 4- and 8-period SMA windows and match the
# 'Span 4 EMA' / 'Span 8 EMA' legend labels used when these are plotted;
# both were previously computed with span=12, making the two lines identical.
ema_short = train.ewm(span=4, adjust=False).mean()
ema_long = train.ewm(span=8, adjust=False).mean()

# Show the first three values of both at once; a bare expression in the
# middle of a cell is never rendered.
pd.DataFrame({'ema_short': ema_short, 'ema_long': ema_long}).head(3)

In [None]:
fig, ax = plt.subplots(figsize=(12,4))

# Raw counts plus each moving average. Every line takes its x-values from its
# own index rather than borrowing long_rolling's.
ax.plot(train.index, train, label='id cnt')
ax.plot(short_rolling.index, short_rolling, label='4 SMA')
ax.plot(ema_short.index, ema_short, label='Span 4 EMA')
ax.plot(long_rolling.index, long_rolling, label='8 SMA')
ax.plot(ema_long.index, ema_long, label='Span 8 EMA')

ax.legend(loc='best')
ax.set_ylabel('id total')

# Start the forecast frame with the observed test values.
yhat = pd.DataFrame(dict(actual=test))

In [None]:
span = 7
ema_long = train.ewm(span=span, adjust=False).mean()

# Last EMA value, used as the flat forecast. .iloc makes the positional lookup
# explicit: integer keys through [] on a datetime-indexed Series are
# deprecated as positional access in recent pandas.
midband = ema_long.iloc[-1]

# Band half-width: three standard deviations of EMA positions -24 .. -2.
# NOTE(review): the slice excludes the final point, as the original did --
# confirm whether the last 24 values ([-24:]) were intended.
band_width = ema_long.iloc[-24:-1].std() * 3
ub = midband + band_width
lb = midband - band_width

yhat['moving_avg_forecast'] = midband

In [None]:
# Absolute difference between each observed value and the flat forecast.
yhat['error'] = (yhat.actual - yhat.moving_avg_forecast).abs()

# (yhat.error.median() gives a one-number summary of that error.)

# %b: position of each observed value between the lower band (0) and the
# upper band (1); values above 1 lie beyond the upper band.
yhat['pct_b'] = (yhat.actual - lb) / (ub - lb)

In [None]:
yhat[yhat.pct_b > 1]


In [None]:
# Forecast against train/test, followed by the %b series on its own figure.
plot_and_eval(predictions=yhat.moving_avg_forecast, actual=test)
plt.figure(figsize=(12, 4))
plt.plot(yhat['pct_b'])


## Clustering - DBSCAN


In [None]:
# Load the customers file from the working directory. This rebinds `df`,
# replacing the frame used in the previous section.
df = pd.read_csv('customers.csv')

In [None]:
df.head()

In [None]:
# Work on an independent copy of the two features: a 'labels' column is
# assigned to this frame further down, and writing to a selection of `df`
# triggers pandas' SettingWithCopyWarning.
ff = df[['Fresh', 'Frozen']].copy()

In [None]:
# Feature matrix as float32. copy=False lets NumPy skip the copy when the data
# already has that dtype.
np_array = ff.values.astype("float32", copy = False)
np_array[0:10]

In [None]:
# Standardise the columns with scikit-learn's StandardScaler.
# np_array is overwritten, so re-running this cell scales the scaled data again.
stscaler = StandardScaler().fit(np_array)
np_array = stscaler.transform(np_array)
np_array[0:10]


In [None]:
# Density-based clustering on the scaled features.
dbsc = DBSCAN(eps = .75, min_samples = 15).fit(np_array)


In [None]:
# One cluster label per row; scikit-learn's DBSCAN marks noise points with -1.
labels = dbsc.labels_
labels[0:10]


In [None]:
# Attach the labels to the feature frame and count the members of each label.
ff['labels'] = labels
ff.labels.value_counts()


In [None]:
# Rows labelled -1, i.e. the points DBSCAN treated as noise.
ff[ff.labels==-1].head()


In [None]:
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally (only `data` may be positional).
sns.scatterplot(x=ff.Fresh, y=ff.Frozen, hue=ff.labels, palette='Paired')
plt.show()

In [None]:
# Load the cleaned Zillow file from the working directory.
zil_df = pd.read_csv('zillow_data_cleaned.csv')

In [None]:
zil_df.head()

In [None]:
# Independent copy of the two features: rows are dropped from this frame and
# a 'labels' column is added to it further down, and modifying a selection of
# `zil_df` triggers pandas' SettingWithCopyWarning.
ff = zil_df[['bedroomcnt', 'calculatedfinishedsquarefeet']].copy()

In [None]:
# Drop rows with a missing feature. Rebinding to the result avoids mutating a
# frame derived from zil_df in place and leaves `ff` as a standalone frame.
ff = ff.dropna()

In [None]:
# Feature matrix as float32. copy=False lets NumPy skip the copy when the data
# already has that dtype.
np_array = ff.values.astype("float32", copy = False)

# Standardise the columns with scikit-learn's StandardScaler; np_array is
# overwritten with the scaled values.
stscaler = StandardScaler().fit(np_array)
np_array = stscaler.transform(np_array)



In [None]:
# Density-based clustering with a large min_samples threshold.
dbsc = DBSCAN(eps = .75, min_samples = 1000).fit(np_array)


In [None]:
# One cluster label per row; scikit-learn's DBSCAN marks noise points with -1.
labels = dbsc.labels_
labels[0:10]

In [None]:



# Re-fit on the same scaled features with a much smaller min_samples.
dbsc = DBSCAN(eps = .75, min_samples = 15).fit(np_array)

# Intermediate results are printed: only the last bare expression of a cell
# is rendered, so the earlier ones were silently discarded.
labels = dbsc.labels_
print(labels[0:10])

ff['labels'] = labels
print(ff.labels.value_counts())

# Rows labelled -1, which scikit-learn's DBSCAN uses for noise points.
print(ff[ff.labels==-1].head())

# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally (only `data` may be positional).
sns.scatterplot(x=ff.bedroomcnt, y=ff.calculatedfinishedsquarefeet, hue=ff.labels, palette='Paired')
plt.show()