In [332]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

pumpkins = pd.read_csv('US-pumpkins.csv')
pumpkins.head()
pumpkins = pumpkins[pumpkins['Package'].str.contains('bushel', case=True, regex=True)]

new_columns = ['Package', 'Variety', 'City Name', 'Month', 'Low Price', 'High Price', 'Date']
pumpkins = pumpkins.drop([c for c in pumpkins.columns if c not in new_columns], axis=1)

price = (pumpkins['Low Price'] + pumpkins['High Price']) / 2

month = pd.DatetimeIndex(pumpkins['Date']).month
day_of_year = pd.to_datetime(pumpkins['Date']).apply(lambda dt: (dt-datetime(dt.year,1,1)).days)
new_pumpkins = pumpkins.assign(DayOfYear=day_of_year, Month=month, Price=price)

X = pd.get_dummies(new_pumpkins['Variety']) \
        .join(new_pumpkins['Month']) \
        .join(pd.get_dummies(new_pumpkins['City Name'])) \
        .join(pd.get_dummies(new_pumpkins['Package']))
y = new_pumpkins['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

pipeline = make_pipeline(PolynomialFeatures(2), LinearRegression())
pipeline.fit(X_train,y_train)

pred = pipeline.predict(X_test)
print(X_test.keys)
mse = np.sqrt(mean_squared_error(y_test,pred))
print(f'Mean error: {mse:3.3} ({mse/np.mean(pred)*100:3.3}%)')

score = pipeline.score(X_train,y_train)
print('Model determination: ', score)


<bound method NDFrame.keys of       FAIRYTALE  MINIATURE  MIXED HEIRLOOM VARIETIES  PIE TYPE  Month  \
1571      False       True                     False     False     10   
719       False      False                     False      True     12   
979       False      False                     False      True     10   
1517      False      False                     False      True     11   
1235      False       True                     False     False     10   
...         ...        ...                       ...       ...    ...   
1237      False       True                     False     False      9   
544       False       True                     False     False     10   
77        False      False                     False      True     10   
92        False      False                     False      True      9   
380       False      False                     False      True     10   

      ATLANTA  BALTIMORE  BOSTON  CHICAGO  COLUMBIA  DETROIT  NEW YORK  \
1571    False      

  day_of_year = pd.to_datetime(pumpkins['Date']).apply(lambda dt: (dt-datetime(dt.year,1,1)).days)
