# Beer Consumption Prediction

In [65]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score

In [66]:
# Load the raw beer-consumption dataset (path is relative to the working directory).
df = pd.read_csv("consumo_cerveja.csv")
# Bare last expression -> rich (truncated) display of the frame.
df

Unnamed: 0,Data,Temperatura Media (C),Temperatura Minima (C),Temperatura Maxima (C),Precipitacao (mm),Final de Semana,Consumo de cerveja (litros)
0,2015-01-01,273,239,325,0,0.0,25.461
1,2015-01-02,2702,245,335,0,0.0,28.972
2,2015-01-03,2482,224,299,0,1.0,30.814
3,2015-01-04,2398,215,286,12,1.0,29.799
4,2015-01-05,2382,21,283,0,0.0,28.900
...,...,...,...,...,...,...,...
936,,,,,,,
937,,,,,,,
938,,,,,,,
939,,,,,,,


In [67]:
# Summarize the frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 941 entries, 0 to 940
Data columns (total 7 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Data                         365 non-null    object 
 1   Temperatura Media (C)        365 non-null    object 
 2   Temperatura Minima (C)       365 non-null    object 
 3   Temperatura Maxima (C)       365 non-null    object 
 4   Precipitacao (mm)            365 non-null    object 
 5   Final de Semana              365 non-null    float64
 6   Consumo de cerveja (litros)  365 non-null    float64
dtypes: float64(2), object(5)
memory usage: 51.6+ KB


In [68]:
# Descriptive statistics; by default only the numeric columns are summarized.
df.describe()

Unnamed: 0,Final de Semana,Consumo de cerveja (litros)
count,365.0,365.0
mean,0.284932,25.401367
std,0.452001,4.399143
min,0.0,14.343
25%,0.0,22.008
50%,0.0,24.867
75%,1.0,28.631
max,1.0,37.937


In [69]:
# Number of missing values in each column (isna is the canonical alias of isnull).
print(df.isna().sum())

Data                           576
Temperatura Media (C)          576
Temperatura Minima (C)         576
Temperatura Maxima (C)         576
Precipitacao (mm)              576
Final de Semana                576
Consumo de cerveja (litros)    576
dtype: int64


In [70]:
# Total number of missing cells across the whole frame.
print(df.isna().sum().sum())

4032


In [71]:
def preprocessing(df):
    """Return an independent copy of ``df`` as the starting point for preprocessing.

    Working on a copy keeps the caller's frame untouched by any later edits.

    Parameters
    ----------
    df : object exposing ``.copy()`` (a pandas DataFrame in this notebook)
        The frame to copy; it is not modified.

    Returns
    -------
    The copy of ``df``.
    """
    df = df.copy()
    # The copy used to be discarded, so the function implicitly returned None.
    return df

In [72]:
# Discard every row that contains at least one missing value, then renumber
# the index from 0 so it is contiguous again.
df = df.dropna(axis="index", how="any").reset_index(drop=True)
df

Unnamed: 0,Data,Temperatura Media (C),Temperatura Minima (C),Temperatura Maxima (C),Precipitacao (mm),Final de Semana,Consumo de cerveja (litros)
0,2015-01-01,273,239,325,0,0.0,25.461
1,2015-01-02,2702,245,335,0,0.0,28.972
2,2015-01-03,2482,224,299,0,1.0,30.814
3,2015-01-04,2398,215,286,12,1.0,29.799
4,2015-01-05,2382,21,283,0,0.0,28.900
...,...,...,...,...,...,...,...
360,2015-12-27,24,211,282,136,1.0,32.307
361,2015-12-28,2264,211,267,0,0.0,26.095
362,2015-12-29,2168,203,241,103,0.0,22.309
363,2015-12-30,2138,193,224,63,0.0,20.467


In [73]:
import re  # no longer needed by this cell; kept in case other cells rely on it

# These columns were read as strings because the CSV writes decimals with a
# comma ("27,3"). Swap the comma for a dot and cast to float.
# np.float was removed in NumPy 1.24, so the builtin float dtype is used instead.
for column in ['Temperatura Media (C)','Temperatura Minima (C)','Temperatura Maxima (C)','Precipitacao (mm)']:
    # astype(str) makes the cell safe to re-run once the column is already numeric.
    df[column] = df[column].astype(str).str.replace(',', '.', regex=False).astype(float)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  This is separate from the ipykernel package so we can avoid doing imports until


In [74]:
# Plain-text dump of the frame to check the converted numeric columns.
print(df)

           Data  Temperatura Media (C)  Temperatura Minima (C)  \
0    2015-01-01                  27.30                    23.9   
1    2015-01-02                  27.02                    24.5   
2    2015-01-03                  24.82                    22.4   
3    2015-01-04                  23.98                    21.5   
4    2015-01-05                  23.82                    21.0   
..          ...                    ...                     ...   
360  2015-12-27                  24.00                    21.1   
361  2015-12-28                  22.64                    21.1   
362  2015-12-29                  21.68                    20.3   
363  2015-12-30                  21.38                    19.3   
364  2015-12-31                  24.76                    20.2   

     Temperatura Maxima (C)  Precipitacao (mm)  Final de Semana  \
0                      32.5                0.0              0.0   
1                      33.5                0.0              0.0   
2     

In [75]:
# Parse the date strings, derive numeric Year / Month / Day features, and drop
# the raw date column, which the models cannot consume directly.
df["Data"] = pd.to_datetime(df["Data"])
# The vectorized .dt accessor replaces three row-by-row .apply(lambda ...) passes.
df = df.assign(
    Year=df["Data"].dt.year,
    Month=df["Data"].dt.month,
    Day=df["Data"].dt.day,
).drop(columns="Data")

In [76]:
# Features: every column except the consumption target.
X = df.drop(columns="Consumo de cerveja (litros)").copy()
# Target: beer consumption, copied so later edits to df cannot affect it.
y = df["Consumo de cerveja (litros)"].copy()
X

Unnamed: 0,Temperatura Media (C),Temperatura Minima (C),Temperatura Maxima (C),Precipitacao (mm),Final de Semana,Year,Month,Day
0,27.30,23.9,32.5,0.0,0.0,2015,1,1
1,27.02,24.5,33.5,0.0,0.0,2015,1,2
2,24.82,22.4,29.9,0.0,1.0,2015,1,3
3,23.98,21.5,28.6,1.2,1.0,2015,1,4
4,23.82,21.0,28.3,0.0,0.0,2015,1,5
...,...,...,...,...,...,...,...,...
360,24.00,21.1,28.2,13.6,1.0,2015,12,27
361,22.64,21.1,26.7,0.0,0.0,2015,12,28
362,21.68,20.3,24.1,10.3,0.0,2015,12,29
363,21.38,19.3,22.4,6.3,0.0,2015,12,30


In [77]:
# Standardize every feature column (zero mean, unit variance) and rewrap the
# resulting array in a DataFrame so the column names are preserved.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics leak into the scaling - consider fitting on the
# training rows only.
scaler = StandardScaler().fit(X)
X = pd.DataFrame(scaler.transform(X), columns=X.columns)

In [79]:
# Inspect the standardized feature matrix.
X

Unnamed: 0,Temperatura Media (C),Temperatura Minima (C),Temperatura Maxima (C),Precipitacao (mm),Final de Semana,Year,Month,Day
0,1.912508,2.281333,1.365781,-0.419062,-0.631243,0.0,-1.602745,-1.673503
1,1.824340,2.493924,1.597722,-0.419062,-0.631243,0.0,-1.602745,-1.559818
2,1.131590,1.749853,0.762735,-0.419062,1.584177,0.0,-1.602745,-1.446134
3,0.867085,1.430966,0.461212,-0.322294,1.584177,0.0,-1.602745,-1.332449
4,0.816703,1.253806,0.391630,-0.419062,-0.631243,0.0,-1.602745,-1.218764
...,...,...,...,...,...,...,...,...
360,0.873383,1.289238,0.368436,0.677640,1.584177,0.0,1.587648,1.282303
361,0.445137,1.289238,0.020525,-0.419062,-0.631243,0.0,1.587648,1.395988
362,0.142846,1.005782,-0.582521,0.411528,-0.631243,0.0,1.587648,1.509672
363,0.048380,0.651463,-0.976820,0.088969,-0.631243,0.0,1.587648,1.623357


In [80]:
# Inspect the target series (left unscaled).
y

0      25.461
1      28.972
2      30.814
3      29.799
4      28.900
        ...  
360    32.307
361    26.095
362    22.309
363    20.467
364    22.446
Name: Consumo de cerveja (litros), Length: 365, dtype: float64

In [81]:
# Shuffled 70/30 train/test split; the fixed random_state makes it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    shuffle=True,
    random_state=123,
)

In [82]:
# Baseline: ordinary least-squares linear regression.
model_1 = LinearRegression()
model_1.fit(X_train, y_train)
# R^2 on the held-out split, shown as a percentage. The earlier duplicate
# score() call was removed because its result was discarded.
model_1.score(X_test, y_test) * 100

68.95609397811931

In [83]:
# Ridge (L2-regularized) regression with default hyperparameters.
model_2 = Ridge()
model_2.fit(X_train, y_train)
# R^2 on the held-out split, shown as a percentage. The earlier duplicate
# score() call was removed because its result was discarded.
model_2.score(X_test, y_test) * 100

69.08518028719098

In [84]:
# Lasso (L1-regularized) regression with default hyperparameters.
model_3 = Lasso()
model_3.fit(X_train, y_train)
# R^2 on the held-out split, shown as a percentage. The earlier duplicate
# score() call was removed because its result was discarded.
model_3.score(X_test, y_test) * 100

57.62968151157226

In [85]:
# Elastic net (combined L1 + L2 penalty) regression with default hyperparameters.
model_4 = ElasticNet()
model_4.fit(X_train, y_train)
# R^2 on the held-out split, shown as a percentage. The earlier duplicate
# score() call was removed because its result was discarded.
model_4.score(X_test, y_test) * 100

56.46054607362464