In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [2]:
# Load the dataset from 'day.csv' (path relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='day.csv')

In [3]:
# Preview the first five rows to check that the file was parsed as expected.
df.head(n=5)

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,cnt
0,1,0,1,0,6,0,985
1,1,0,1,0,0,0,801
2,1,0,1,0,1,1,1349
3,1,0,1,0,2,1,1562
4,1,0,1,0,3,1,1600


- season : estações (1:primavera, 2:verão, 3:outono, 4:inverno)
- yr : ano (0: 2011, 1:2012)
- mnth : mês (1 a 12)
- hr : hora (0 a 23) — coluna ausente neste conjunto de dados diário (`day.csv`)
- holiday : feriado
- weekday : dia da semana
- workingday : dia útil
- cnt : variável resposta (alvo da regressão)

In [4]:
# Summarise column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 7 columns):
season        731 non-null int64
yr            731 non-null int64
mnth          731 non-null int64
holiday       731 non-null int64
weekday       731 non-null int64
workingday    731 non-null int64
cnt           731 non-null int64
dtypes: int64(7)
memory usage: 40.1 KB


In [5]:
# Split into features (X: every column except 'cnt') and response (y: 'cnt'),
# holding out 30% of the rows for testing. train_test_split returns two feature
# DataFrames and two response Series.
# random_state pins the shuffle so the split, and every metric computed from it,
# is identical on each Restart & Run All.
X_treino, X_teste, y_treino, y_teste = train_test_split(
    df.drop(columns='cnt'),
    df['cnt'],
    test_size=0.3,
    random_state=42,
)

In [6]:
# Create the linear regression and fit it on the training split in one step
# (fit returns the estimator itself).
lr = LinearRegression().fit(X_treino, y_treino)
# Bare last expression so the fitted estimator's repr is still displayed.
lr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [7]:
# Predict the response for the held-out test features.
predito = lr.predict(X=X_teste)

In [8]:
# Peek at the first five predictions for the test split.
predito[:5]

array([3393.12112875, 4003.19484598, 2350.49830258, 5629.3835197 ,
       4621.81910295])

In [9]:
# Inspect the first rows of the training features; the row labels are the
# original row indices kept by the split.
X_treino.head()

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday
490,2,1,5,0,6,0
119,2,0,4,0,6,0
607,3,1,8,0,4,1
513,2,1,5,1,1,0
208,3,0,7,0,4,1


In [10]:
# Coefficient of determination of the fitted model on the test split.
print('R²: ', lr.score(X=X_teste, y=y_teste))

R²:  0.4344559645820627


In [11]:
# Mean squared error between the true test responses and the predictions.
print('MSE: ', mean_squared_error(y_true=y_teste, y_pred=predito))

MSE:  2256480.014992976


In [12]:
# Mean absolute error between the true test responses and the predictions.
print('MAE: ', mean_absolute_error(y_true=y_teste, y_pred=predito))

MAE:  1200.2765698301357


In [13]:
lr.coef_ # fitted coefficient of each independent variable

array([1034.07228643, 2236.26239095, -126.34451679, -830.6214775 ,
         44.96501884,  130.10773756])

In [14]:
# Pair each coefficient with the name of the feature it was fitted on.
# Names come from the training frame (not df.columns by position), so the
# labels stay correct even if the target column is not the last one in df.
for nome, coef in zip(X_treino.columns, lr.coef_):
    print(nome, ' : ', coef)

season  :  1034.0722864272173
yr  :  2236.262390950053
mnth  :  -126.34451678601499
holiday  :  -830.6214775028453
weekday  :  44.96501884433093
workingday  :  130.10773755993293


In [15]:
# Intercept (constant term) of the fitted linear model.
lr.intercept_

1520.3868101026405