In [42]:
from datetime import datetime
import time
import itertools
from hmmlearn import hmm
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [43]:
# Model / prediction parameters.
HIDDEN_STATES = 4  # number of hidden states given to the HMM
latency = 10       # number of preceding feature rows used for each prediction

# Plot settings: draw every figure with the ggplot style.
plt.style.use('ggplot')

# Discrete candidate values for the two features (40 evenly spaced points each).
frac_price_range = np.linspace(-5e-7, 5e-7, 40)
frac_volume_range = np.linspace(-40, 40, 40)

# Every (price, volume) combination as one row; the price value varies
# slowest and the volume value fastest, giving a (1600, 2) array.
possible_outcomes = np.column_stack((
    np.repeat(frac_price_range, frac_volume_range.size),
    np.tile(frac_volume_range, frac_price_range.size),
))

In [57]:
def prepare_data(csv_path='avocado.csv', region='TotalUS',
                 avocado_type='conventional', test_size=0.25):
    """Load the avocado CSV and split one region/type series into train and test.

    Args:
        csv_path: Path of the CSV file to read.
        region: Value of the ``region`` column to keep.
        avocado_type: Value of the ``type`` column to keep.
        test_size: Passed to ``train_test_split`` as ``test_size``.

    Returns:
        A ``(train_data, test_data)`` tuple of DataFrames holding the
        ``Date``, ``AveragePrice`` and ``Total Volume`` columns. ``Date`` is
        converted with ``pd.to_datetime`` and the rows are sorted by it before
        the (unshuffled) split.
    """
    df = pd.read_csv(csv_path)
    selected = (df['region'] == region) & (df['type'] == avocado_type)
    # Take an explicit copy: assigning a column on a filtered slice of `df`
    # is chained assignment and triggers pandas' SettingWithCopyWarning.
    data = df.loc[selected, ['Date', 'AveragePrice', 'Total Volume']].copy()
    data['Date'] = pd.to_datetime(data['Date'])
    data = data.sort_values(by=['Date'])
    train_data, test_data = train_test_split(
        data, test_size=test_size, shuffle=False)

    return (train_data, test_data)

def date_to_timestamp(datestr):
    """Convert a date to seconds since the epoch, interpreted in local time.

    Args:
        datestr: Either a ``'%Y-%m-%d'`` string, or a datetime-like value
            accepted by ``pd.Timestamp`` (e.g. ``np.datetime64``,
            ``datetime.datetime``, ``pd.Timestamp``).

    Returns:
        The value of ``time.mktime`` for that moment, as a float.

    Raises:
        ValueError: If a string does not match ``'%Y-%m-%d'``.
    """
    if isinstance(datestr, str):
        moment = datetime.strptime(datestr, '%Y-%m-%d')
    else:
        # Date columns converted with pd.to_datetime yield np.datetime64
        # values, which datetime.strptime rejects with a TypeError.
        moment = pd.Timestamp(datestr).to_pydatetime()
    return time.mktime(moment.timetuple())

def extract_features(data, verbose=True):
    """Build per-second rate-of-change features from consecutive rows.

    For every pair of consecutive rows the change in ``AveragePrice`` and in
    ``Total Volume`` is divided by the number of seconds between the two
    ``Date`` values.

    Args:
        data: DataFrame with ``Date``, ``AveragePrice`` and ``Total Volume``
            columns. ``Date`` may hold date strings or datetime values; it is
            passed through ``pd.to_datetime``.
        verbose: When true (the default), print the max and min of each
            feature, in the order price max, price min, volume max,
            volume min.

    Returns:
        Array of shape ``(len(data) - 1, 2)`` with columns
        ``(frac_price, frac_volume)``; shape ``(0, 2)`` when ``data`` has
        fewer than two rows.
    """
    average_price = np.asarray(data['AveragePrice'], dtype=float)
    volume = np.asarray(data['Total Volume'], dtype=float)

    # Seconds between consecutive dates, computed on the datetime values
    # themselves so the result does not depend on the machine's time zone or
    # DST rules. The first diff entry has no predecessor (NaN) and is dropped.
    elapsed = (pd.to_datetime(data['Date'])
               .diff()
               .dt.total_seconds()
               .to_numpy()[1:])

    frac_price = np.diff(average_price) / elapsed
    frac_volume = np.diff(volume) / elapsed

    # max()/min() are undefined for empty input, so skip the report then.
    if verbose and frac_price.size:
        print(frac_price.max())
        print(frac_price.min())
        print(frac_volume.max())
        print(frac_volume.min())
    return np.column_stack((frac_price, frac_volume))

def predict_frac_price(model, prev_features, outcomes=None):
    """Pick the candidate outcome that the model scores highest.

    Each candidate row is appended to ``prev_features`` and the resulting
    sequence is scored with ``model.score``; the first column of the
    best-scoring candidate is returned.

    Args:
        model: Object exposing ``score(X)`` that returns a comparable number.
        prev_features: 2-D array of preceding feature rows.
        outcomes: Optional array of candidate ``(price, volume)`` rows.
            Defaults to the module-level ``possible_outcomes`` grid.

    Returns:
        The price component of the best-scoring candidate.
    """
    if outcomes is None:
        outcomes = possible_outcomes

    # np.vstack replaces np.row_stack, an alias deprecated in NumPy 2.0.
    outcome_score = [
        model.score(np.vstack((prev_features, candidate)))
        for candidate in outcomes
    ]

    predicted_frac_price, _ = outcomes[np.argmax(outcome_score)]
    return predicted_frac_price

def predict_avocado(prediction_len=20, random_state=None):
    """Fit a Gaussian HMM on the training split and predict test-set prices.

    The model is fitted on the training features. For each prediction the
    ``latency`` feature rows preceding the target row are given to
    ``predict_frac_price``; the returned price rate is multiplied by the
    seconds elapsed since the previous row and added to that row's price.
    The dates, predictions and real prices are written to ``prediction.csv``
    and plotted.

    Args:
        prediction_len: Maximum number of predictions to make. ``None`` means
            as many as the test set allows. The value is clamped so no index
            runs past the end of the test set.
        random_state: Forwarded to ``hmm.GaussianHMM`` so runs can be made
            repeatable; ``None`` keeps the library default.
    """
    (train_data, test_data) = prepare_data()
    train_features = extract_features(train_data)
    test_features = extract_features(test_data)
    test_prices = np.array(test_data['AveragePrice'])
    test_dates = np.array(test_data['Date'])

    model = hmm.GaussianHMM(n_components=HIDDEN_STATES,
                            random_state=random_state)

    # fit
    model.fit(train_features)

    # Prediction i reads test row i + latency + 1, so at most
    # len(test_features) - latency predictions fit inside the test set.
    max_predictions = max(len(test_features) - latency, 0)
    if prediction_len is None:
        prediction_len = max_predictions
    else:
        prediction_len = min(prediction_len, max_predictions)

    # start price prediction
    predicted_prices = []
    for i in range(prediction_len):
        prev_features = test_features[i:i+latency]
        prev_price = test_prices[i+latency]

        # Seconds between the last known row and the row being predicted.
        # prepare_data converts Date with pd.to_datetime, so these are
        # datetime64 values and can be subtracted directly.
        prev_date = test_dates[i+latency]
        curr_date = test_dates[i+latency+1]
        time_elapsed = (curr_date - prev_date) / np.timedelta64(1, 's')

        print('[Prediction] {} of {}'.format(i, prediction_len))
        predicted_frac_price = predict_frac_price(model, prev_features)
        predicted_price = prev_price + predicted_frac_price * time_elapsed
        predicted_prices.append(predicted_price)

    # Prediction i is for row i + latency + 1, so the real prices and dates
    # it is compared with start one row after `latency`.
    target_rows = slice(latency + 1, latency + 1 + prediction_len)
    real_prices = test_prices[target_rows]
    dates = test_dates[target_rows]

    # save prediction result (dates and real prices alongside predictions)
    prediction_result = {
            'date': dates,
            'predicted price': predicted_prices,
            'real price': real_prices}
    pd.DataFrame(prediction_result).to_csv('prediction.csv')

    # plot
    fig = plt.figure()
    axes = fig.add_subplot(111)
    axes.set_title('Avocado price prediction')
    axes.set_xlabel('Date')
    axes.set_ylabel('Average price')
    axes.plot(dates, real_prices, "bo-", label="real")
    axes.plot(dates, predicted_prices, "go-", label="predicted")

    axes.legend()
    plt.show()

In [64]:
# Extract the conventional TotalUS rows, sorted by date, and save them to
# US_total.csv.
df = pd.read_csv('avocado.csv')
# Select rows and columns in one .loc call and copy the result, so the Date
# assignment below is not a chained assignment on a slice of `df`.
data = df.loc[(df.region == 'TotalUS') & (df.type == 'conventional'),
              ['Date', 'AveragePrice', 'Total Volume']].copy()
data['Date'] = pd.to_datetime(data['Date'])
data = data.sort_values(by=['Date'])
data.to_csv('US_total.csv')

In [49]:
# Convert the Date column with pd.to_datetime, then display the frame.
data['Date'] = data['Date'].pipe(pd.to_datetime)
data

Unnamed: 0.1,Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.00,conventional,2015,Albany
1,1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.00,conventional,2015,Albany
2,2,2015-12-13,0.93,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.00,conventional,2015,Albany
3,3,2015-12-06,1.08,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.00,conventional,2015,Albany
4,4,2015-11-29,1.28,51039.60,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.00,conventional,2015,Albany
5,5,2015-11-22,1.26,55979.78,1184.27,48067.99,43.61,6683.91,6556.47,127.44,0.00,conventional,2015,Albany
6,6,2015-11-15,0.99,83453.76,1368.92,73672.72,93.26,8318.86,8196.81,122.05,0.00,conventional,2015,Albany
7,7,2015-11-08,0.98,109428.33,703.75,101815.36,80.00,6829.22,6266.85,562.37,0.00,conventional,2015,Albany
8,8,2015-11-01,1.02,99811.42,1022.15,87315.57,85.34,11388.36,11104.53,283.83,0.00,conventional,2015,Albany
9,9,2015-10-25,1.07,74338.76,842.40,64757.44,113.00,8625.92,8061.47,564.45,0.00,conventional,2015,Albany


Unnamed: 0,Date,AveragePrice,Total Volume,type
11778,2015-12-27,1.52,549787.59,organic
11779,2015-12-20,1.53,531478.24,organic
11780,2015-12-13,1.43,624300.31,organic
11781,2015-12-06,1.52,514112.96,organic
11782,2015-11-29,1.50,507830.81,organic
11783,2015-11-22,1.49,584276.79,organic
11784,2015-11-15,1.60,511347.26,organic
11785,2015-11-08,1.54,605197.12,organic
11786,2015-11-01,1.47,647789.17,organic
11787,2015-10-25,1.62,560830.54,organic


In [22]:
# Display the AveragePrice column.
data['AveragePrice']

0        1.33
1        1.35
2        0.93
3        1.08
4        1.28
5        1.26
6        0.99
7        0.98
8        1.02
9        1.07
10       1.12
11       1.28
12       1.31
13       0.99
14       1.33
15       1.28
16       1.11
17       1.07
18       1.34
19       1.33
20       1.12
21       1.45
22       1.11
23       1.26
24       1.05
25       1.35
26       1.37
27       1.27
28       1.32
29       1.07
         ... 
18219    1.56
18220    1.53
18221    1.61
18222    1.63
18223    1.59
18224    1.51
18225    1.60
18226    1.73
18227    1.63
18228    1.46
18229    1.49
18230    1.64
18231    1.47
18232    1.41
18233    1.80
18234    1.83
18235    1.82
18236    1.48
18237    1.62
18238    1.56
18239    1.56
18240    1.54
18241    1.57
18242    1.56
18243    1.57
18244    1.63
18245    1.71
18246    1.87
18247    1.93
18248    1.62
Name: AveragePrice, Length: 18249, dtype: float64

In [41]:
import numpy as np
import itertools

# Candidate values for the two quantities: 20 evenly spaced points each.
frac_change_range = np.linspace(-0.1, 0.1, 20)
frac_high_range = np.linspace(0, 0.1, 20)

# Every (change, high) pair; `change` varies slowest, `high` fastest.
[*itertools.product(frac_change_range, frac_high_range)]

[(-0.1, 0.0),
 (-0.1, 0.005263157894736842),
 (-0.1, 0.010526315789473684),
 (-0.1, 0.015789473684210527),
 (-0.1, 0.021052631578947368),
 (-0.1, 0.02631578947368421),
 (-0.1, 0.031578947368421054),
 (-0.1, 0.03684210526315789),
 (-0.1, 0.042105263157894736),
 (-0.1, 0.04736842105263158),
 (-0.1, 0.05263157894736842),
 (-0.1, 0.05789473684210526),
 (-0.1, 0.06315789473684211),
 (-0.1, 0.06842105263157895),
 (-0.1, 0.07368421052631578),
 (-0.1, 0.07894736842105263),
 (-0.1, 0.08421052631578947),
 (-0.1, 0.08947368421052632),
 (-0.1, 0.09473684210526316),
 (-0.1, 0.1),
 (-0.08947368421052632, 0.0),
 (-0.08947368421052632, 0.005263157894736842),
 (-0.08947368421052632, 0.010526315789473684),
 (-0.08947368421052632, 0.015789473684210527),
 (-0.08947368421052632, 0.021052631578947368),
 (-0.08947368421052632, 0.02631578947368421),
 (-0.08947368421052632, 0.031578947368421054),
 (-0.08947368421052632, 0.03684210526315789),
 (-0.08947368421052632, 0.042105263157894736),
 (-0.08947368421052632