# Beer Consumption in College

In [381]:
import pandas as pd
import pandas_profiling
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matrixprofile import *

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import (
                                     MinMaxScaler, 
                                     StandardScaler, 
                                     OneHotEncoder, 
                                     LabelEncoder
                                  )

In [382]:
# Make cubehelix the default seaborn colour palette for this notebook.
sns.set_palette(palette='cubehelix')

In [383]:
# Column names used in place of the CSV's own header row, which is
# skipped with ``skiprows=1``.
new_cols = [
    'date',
    'avg_temp',
    'min_temp',
    'max_temp',
    'precipitation_mm',
    'weekend',
    'total_consumption',
]

# The date column is parsed as datetimes and becomes the index.
beer_df = pd.read_csv(
    'consumo_cerveja.csv',
    skiprows=1,
    names=new_cols,
    index_col=['date'],
    parse_dates=['date'],
)

In [385]:
# Profile the frame as loaded (before any cleaning) and save the report
# as HTML in the current working directory.
report_title = 'Beer Profiling Report'
profile = beer_df.profile_report(title=report_title)
profile.to_file(output_file="init_beer_profile_report.html")

FileNotFoundError: [Errno 2] No such file or directory: 'reports\\init_beer_profile_report.html'

In [None]:
# Derive calendar features from the date index: each new column (key) is
# filled from the index attribute named by the value.
calendar_features = {
    'day_of_year': 'dayofyear',
    'day': 'day',
    'week_day': 'weekday',
    'month': 'month',
    'year': 'year',
}
for column, index_attr in calendar_features.items():
    beer_df[column] = getattr(beer_df.index, index_attr)

In [None]:
# Peek at the first five rows (the default for ``head``).
beer_df.head(n=5)

In [None]:
# Peek at the last five rows (the default for ``tail``).
beer_df.tail(n=5)

In [None]:
# Summarise the frame (columns, dtypes, non-null counts) before cleaning.
beer_df.info()

In [None]:
# Drop rows in which every column is missing.
beer_df = beer_df.dropna(how='all')

# These columns use a decimal comma, so swap it for a dot before casting
# to float.  Done column-wise with the vectorised string accessor because
# ``DataFrame.applymap`` is deprecated in recent pandas releases.
cols_to_float = ['avg_temp', 'min_temp', 'max_temp', 'precipitation_mm']
beer_df[cols_to_float] = beer_df[cols_to_float].apply(
    lambda col: col.astype(str).str.replace(',', '.', regex=False).astype(float)
)

# Summary statistics of the cleaned frame (displayed as the cell output).
beer_df.describe()

In [None]:
# Summarise the frame again to check the dtypes after the float conversion.
beer_df.info()

In [None]:
import os

# Profile the cleaned frame and save the report under ``reports/``.
profile = beer_df.profile_report(title='Beer Profiling Report')

# Create the output directory first: writing into a missing directory
# raises FileNotFoundError.
os.makedirs('reports', exist_ok=True)
profile.to_file(output_file="reports/cleaned_beer_profile_report.html")

In [None]:
# Histogram of the average temperature.
beer_df.hist(column=['avg_temp'])

In [None]:
# Histogram of the minimum temperature.
beer_df.hist(column=['min_temp'])

In [None]:
# Histogram of the maximum temperature.
beer_df.hist(column=['max_temp'])

In [None]:
# Histogram of the precipitation (mm).
beer_df.hist(column=['precipitation_mm'])

In [None]:
# Mean and total consumption per value of the ``weekend`` flag.
avg_consumption_df = beer_df.groupby('weekend')['total_consumption'].mean().reset_index(name='avg_consumption')
total_consumption_df = beer_df.groupby('weekend')['total_consumption'].sum().reset_index(name='sum_consumption')

fig, axes = plt.subplots(1, 2, figsize=(20,10))

# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
sns.barplot(x='weekend', y='avg_consumption', data=avg_consumption_df, ax=axes[0])
sns.barplot(x='weekend', y='sum_consumption', data=total_consumption_df, ax=axes[1])

In [None]:
# The 92 rows with the highest total consumption, largest first.
# NOTE(review): the reason for 92 is not visible here — confirm.
by_consumption_desc = beer_df.sort_values(by='total_consumption', ascending=False)
total_cons_df = by_consumption_desc.head(92)


In [None]:
# Total consumption per month: overall (top) and split by the weekend
# flag (bottom).
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(20, 10))
monthly_kwargs = dict(x='month', y='total_consumption', data=beer_df, estimator=sum)
sns.barplot(ax=ax1, **monthly_kwargs)
sns.barplot(hue='weekend', ax=ax2, **monthly_kwargs)


In [None]:
# Consumption per week day: summed (top) and averaged (bottom).
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(20, 10))
weekday_kwargs = dict(x="week_day", y="total_consumption", data=beer_df)
sns.barplot(estimator=sum, ax=ax1, **weekday_kwargs)
sns.barplot(estimator=np.mean, ax=ax2, **weekday_kwargs)

In [None]:
# Ratio of total consumption to total precipitation.  ``Series.sum`` is
# vectorised and skips NaN, unlike the builtin ``sum``, which loops
# element by element and lets a single NaN poison the result.
liters_per_mm = beer_df['total_consumption'].sum() / beer_df['precipitation_mm'].sum()
print(f"{liters_per_mm:.2f}L for every mm of rain during the year.")


In [None]:
# Day-over-day change in consumption for the first 100 rows, joined with
# the weekend flag of the same rows.
weekends = beer_df[['weekend']].iloc[:100]
data = beer_df[['total_consumption']].iloc[:100].diff().join(weekends)

In [None]:
# First 100 rows of the frame.
sample = beer_df.iloc[:100]

In [None]:
# Top: consumption for the sampled rows; bottom: its day-over-day change.
# Bars are positioned by row number and coloured by the weekend flag.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(20, 10))
shared_kwargs = dict(y='total_consumption', hue='weekend')
sns.barplot(x=list(range(len(sample))), data=sample, ax=ax1, **shared_kwargs)
sns.barplot(x=list(range(len(data))), data=data, ax=ax2, **shared_kwargs)

In [None]:
# Weather columns plus the weekend flag (kept last).
weather_cols = [
    'avg_temp',
    'min_temp',
    'max_temp',
    'precipitation_mm',
    'weekend',
]
data = beer_df[weather_cols]

In [None]:
# Pairwise plots of every column except the last one, coloured by weekend.
plot_vars = data.columns[:-1]
sns.pairplot(data=data, vars=plot_vars, hue='weekend')

In [None]:
# Regression of total consumption on four predictors, one per panel.
# x / y / data are passed by keyword: seaborn >= 0.12 no longer accepts
# them positionally.
fig, axes = plt.subplots(2, 2, figsize=(20,10))
sns.regplot(x='precipitation_mm', y='total_consumption', data=beer_df, ax=axes[1,0])
sns.regplot(x='min_temp', y='total_consumption', data=beer_df, ax=axes[0,0])
sns.regplot(x='max_temp', y='total_consumption', data=beer_df, ax=axes[0,1])
sns.regplot(x='weekend', y='total_consumption', data=beer_df, ax=axes[1,1])


In [None]:
# Column sums of the frame re-binned on the date index:
# per week, per month and per three months.
weekly_consumption = beer_df.resample(rule='W').sum()
monthly_consumption = beer_df.resample(rule='M').sum()
trimester_consumption = beer_df.resample(rule='3M').sum()

In [None]:
# Total consumption at four time resolutions, stacked on a shared x axis.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1, figsize=(20, 10), sharex=True)

panels = (
    (ax1, beer_df, 'Total daily consumption'),
    (ax2, weekly_consumption, 'Total weekly consumption'),
    (ax3, monthly_consumption, 'Total monthly consumption'),
    (ax4, trimester_consumption, 'Total trimester consumption'),
)
for panel_ax, frame, label in panels:
    panel_ax.set_ylabel(label)
    panel_ax.plot(frame['total_consumption'])

In [None]:
# Rolling means of total consumption over 7, 14, 30 and 90 rows,
# one panel per window size on a shared x axis.
fig, (ax0, ax1, ax2, ax3) = plt.subplots(nrows=4, ncols=1, figsize=(25, 10), sharex=True)

fig.suptitle('Moving averages total consumption', fontsize=16)

ylabels = ['Weekly', 'BiWeekly', 'Monthly', 'Trimester']
window_sizes = (7, 14, 30, 90)
for panel_ax, window, ylabel in zip((ax0, ax1, ax2, ax3), window_sizes, ylabels):
    panel_ax.plot(beer_df.total_consumption.rolling(window=window).mean())
    panel_ax.set_ylabel(ylabel)

In [None]:
from matrixprofile import matrixProfile
# ``m`` is handed to ``stomp`` as its second argument
# (presumably the subsequence length — confirm against matrixprofile docs).
m = 32
consumption_values = beer_df['total_consumption'].to_numpy()
mp = matrixProfile.stomp(consumption_values, m)

# Append m-1 NaNs to the first element of the stomp result.
nan_padding = np.full(m - 1, np.nan)
mp_adj = np.append(mp[0], nan_padding)

In [None]:
# Top: the consumption series; bottom: its NaN-padded matrix profile.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20,10))

# The series plotted here is the un-resampled daily one, so label it
# "daily" (the previous "monthly" label was wrong).
ax1.set_ylabel('Total daily consumption', size=22)
ax1.set_xlabel('Date', size=22)
ax1.plot(beer_df.total_consumption)

# The matrix profile is plotted against the sample position, not the date.
ax2.plot(np.arange(len(mp_adj)),mp_adj, label="Matrix Profile", color='red')
ax2.set_ylabel('Matrix Profile', size=22)
ax2.set_xlabel('Sample', size=22)

In [None]:
def high_temp(val: float, threshold: float = 17.9) -> str:
    """Bucket a temperature as ``'high'`` or ``'low'``.

    Args:
        val: Temperature to classify.
        threshold: Cut-off; values strictly greater than it are ``'high'``.
            Defaults to 17.9, the value that was previously hard-coded.

    Returns:
        ``'high'`` if ``val > threshold``, otherwise ``'low'``. NaN compares
        false against any number, so it is bucketed as ``'low'``.
    """
    return 'high' if val > threshold else 'low'

In [None]:
# One-hot encode the high/low bucket of the minimum temperature and
# attach the indicator columns to the frame.
min_temp_bucket = beer_df['min_temp'].apply(high_temp)
temps = pd.get_dummies(min_temp_bucket, prefix='min_temp_bucket')
beer_df = beer_df.join(temps)

In [None]:
# One-hot encode the weekend flag and attach the indicator columns.
one_hot = pd.get_dummies(beer_df['weekend'], prefix='weekend')
beer_df = beer_df.join(other=one_hot)

In [None]:
# Two independent deep copies of the frame, one per scaling strategy.
lin_reg_df_min_max, lin_reg_df_std = (beer_df.copy(deep=True) for _ in range(2))

In [None]:
# Columns that get rescaled before the regression.
cols_to_normalize = [
    'avg_temp',
    'min_temp',
    'max_temp',
    'precipitation_mm',
]

In [None]:
# Rescale the selected columns in place: min-max scaling on one copy,
# standardisation on the other.
# NOTE(review): both scalers are fitted on the full frame, i.e. before the
# train/test split done later in this file — confirm that is intended.
min_max_scaler = MinMaxScaler()
lin_reg_df_min_max[cols_to_normalize] = min_max_scaler.fit_transform(
    lin_reg_df_min_max[cols_to_normalize]
)

standard_scaler = StandardScaler()
lin_reg_df_std[cols_to_normalize] = standard_scaler.fit_transform(
    lin_reg_df_std[cols_to_normalize]
)

In [None]:
# First five rows of the standardised copy.
lin_reg_df_std.head(n=5)

In [None]:
# First five rows of the min-max scaled copy.
lin_reg_df_min_max.head(n=5)

In [None]:
# Features and target are taken from the standardised copy.  To model on
# the min-max scaled data instead, select the same columns from
# ``lin_reg_df_min_max``.
feature_cols = [
    'avg_temp',
    'max_temp',
    'precipitation_mm',
    'weekend_0.0',
    'weekend_1.0',
    'min_temp_bucket_high',
    'min_temp_bucket_low',
]
X = lin_reg_df_std[feature_cols]
# The target is kept as a one-column DataFrame, not a Series.
y = lin_reg_df_std[['total_consumption']]

In [None]:
# Hold out 30% of the rows (fixed seed), fit an ordinary linear regression
# on the rest and report its score on the held-out part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
reg = LinearRegression().fit(X_train, y_train)
reg.score(X_test, y_test)


In [None]:
from sklearn.metrics import mean_squared_error
import math

# Root-mean-squared error on the held-out rows.
y_predict = reg.predict(X_test)

# Pass the arguments in the documented (y_true, y_pred) order.  MSE is
# symmetric, so the value is unchanged, but the call no longer misleads.
reg_mse = mean_squared_error(y_true=y_test, y_pred=y_predict)

math.sqrt(reg_mse)

In [None]:
# ``reg.coef_`` is indexed as 2-D here (the target was a one-column
# DataFrame); row 0 holds one coefficient per feature.
coefs = reg.coef_[0].tolist()

# Table pairing each feature name with its fitted coefficient.
pd.DataFrame({'features': list(X.columns), 'coefs': coefs})

In [None]:
# List the columns now present on the frame (including the joined dummies).
beer_df.columns

In [None]:
# Unscaled columns to correlate: weather, target and the dummy indicators.
corr_cols = [
    'avg_temp',
    'max_temp',
    'precipitation_mm',
    'total_consumption',
    'min_temp_bucket_high',
    'min_temp_bucket_low',
    'weekend_0.0',
    'weekend_1.0',
]
data = beer_df[corr_cols]

In [None]:
# Pairwise Pearson correlation (the ``corr`` default) between the columns.
corr = data.corr(method='pearson')

In [None]:
# Heat map of the correlation matrix with a diverging colour map.
fig, ax = plt.subplots(figsize=(20, 10))
cmap = sns.diverging_palette(h_neg=220, h_pos=10, as_cmap=True)
sns.heatmap(data=corr, cmap=cmap, square=True, linewidths=0.5, ax=ax)

In [None]:
# Regression of total consumption on the min-max scaled maximum temperature.
fig, (ax1) = plt.subplots(figsize=(20,10))

# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
sns.regplot(
    x=lin_reg_df_min_max.max_temp,
    y=lin_reg_df_min_max.total_consumption,
    ax=ax1,
)

In [None]:
from sklearn.cluster import KMeans

# Two-column array for clustering: column 0 is the total consumption,
# column 1 the maximum temperature.
data = np.column_stack((beer_df.total_consumption, beer_df.max_temp))


In [None]:
# Elbow plot: within-cluster sum of squares (KMeans inertia) for 1..10
# clusters.
fig, axes = plt.subplots(figsize=(20, 10))

cluster_counts = range(1, 11)
wcss = []
for n_clusters in cluster_counts:
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42).fit(data)
    wcss.append(kmeans.inertia_)

axes.plot(cluster_counts, wcss)
axes.set_title('The elbow method')
axes.set_xlabel('The number of clusters')
axes.set_ylabel('WCSS')

In [None]:
# Cluster the points into three groups.  ``fit_predict`` already fits the
# model, so the separate ``fit`` call that preceded it was redundant work;
# with a fixed ``random_state`` the assignments are unchanged.
kmeans = KMeans(n_clusters=3, random_state=100)
y_km = kmeans.fit_predict(data)

In [None]:
# Scatter of consumption (x) against temperature (y), one series per
# cluster.
fig, axes = plt.subplots(figsize=(20, 10))

# NOTE(review): the season names are tied to cluster ids 0/1/2 by hand;
# KMeans ids carry no inherent order — verify against the cluster centres.
cluster_styles = (
    (0, 'red', 'o', 'summer'),
    (1, 'green', 'v', 'fall, spring?'),
    (2, 'blue', '+', 'winter'),
)
for cluster_id, color, marker, label in cluster_styles:
    members = data[y_km == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=50, c=color, marker=marker, label=label)

plt.title('Beer consumption by Season')
plt.xlabel('Total Consumption (L)')
plt.ylabel('Temp')
plt.legend()

In [None]:
# Total consumption (column 0) within clusters 0, 1 and 2 respectively.
s, f, w = (sum(data[y_km == cluster_id, 0]) for cluster_id in range(3))

print(f"{s:.2f}, {f:.2f}, {w:.2f}")

In [None]:
# Sum of the temperature column (column 1) over the members of cluster 1.
cluster_1_temps = data[:, 1][y_km == 1]
sum(cluster_1_temps)