## Pumpkin Varieties and Color

Load the required libraries and the dataset, then convert the data to a dataframe containing a subset of the data.

Let's look at the relationship between color and variety.

In [None]:
import pandas as pd
import numpy as np

# Read the raw pumpkin CSV (path is relative to the notebook's location).
full_pumpkins = pd.read_csv(filepath_or_buffer='../data/US-pumpkins.csv')

# Preview the first five rows.
full_pumpkins.head(n=5)


In [28]:
# Columns kept for the analysis: four categorical features, the ordinal
# 'Item Size' feature, and the 'Color' label.
columns_to_select = [
    'City Name',
    'Package',
    'Variety',
    'Origin',
    'Item Size',
    'Color',
]

# Select the columns and discard every row that has a missing value in any of them.
pumpkins = full_pumpkins.loc[:, columns_to_select].dropna()

In [None]:
import seaborn as sns

# Map each 'Color' value to the color used to draw it.
palette = dict(ORANGE='orange', WHITE='wheat')

# Count plot: one horizontal bar per variety, split by pumpkin color.
sns.catplot(
    data=pumpkins,
    kind='count',
    y='Variety',
    hue='Color',
    palette=palette,
)

In [30]:
from sklearn.preprocessing import OrdinalEncoder

# Column encoded as an ordinal feature.
ordinal_features = ['Item Size']

# Explicit category order for 'Item Size'; the encoder assigns each value its
# position in this list (presumably smallest to largest -- confirm against the data).
item_size_categories = [
    ['sml', 'med', 'med-lge', 'lge', 'xlge', 'jbo', 'exjbo'],
]

ordinal_encoder = OrdinalEncoder(categories=item_size_categories)

In [31]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encoder configured to return a dense array rather than a sparse matrix.
categorical_encoder = OneHotEncoder(sparse_output=False)

# Columns that are one-hot encoded.
categorical_features = [
    'City Name',
    'Package',
    'Variety',
    'Origin',
]

In [32]:
from sklearn.compose import ColumnTransformer

# Route each group of columns to its encoder; columns not listed are dropped.
# `set_output` returns the transformer itself, so it can be chained here to
# make `fit_transform` return a pandas DataFrame.
ct = ColumnTransformer(
    transformers=[
        ('ord', ordinal_encoder, ordinal_features),
        ('cat', categorical_encoder, categorical_features),
    ],
    remainder="drop",
).set_output(transform='pandas')

# Fit the encoders on the pumpkin data and produce the encoded feature frame.
encoded_features = ct.fit_transform(pumpkins)

In [33]:
from sklearn.preprocessing import LabelEncoder

# Learn the integer encoding of the 'Color' label, then apply it to the same column.
color_labels = pumpkins['Color']
label_encoder = LabelEncoder().fit(color_labels)
encoded_label = label_encoder.transform(color_labels)

In [34]:
# Build the modelling frame: a copy of the encoded features plus the encoded label
# as a 'Color' column (the original `encoded_features` frame is left untouched).
encoded_pumpkins = encoded_features.copy()
encoded_pumpkins['Color'] = encoded_label

In [None]:
# Color used to draw each 'Color' value. The name must be `palette`, because that
# is what the catplot call below reads; binding the dict to a misspelled name
# would make this cell depend on a `palette` left over from an earlier cell.
palette = {
    'ORANGE': 'orange',
    'WHITE': 'wheat'
}

# Replace the string item sizes with their ordinal codes so they can be drawn on
# a numeric axis.
# NOTE(review): this overwrites `pumpkins['Item Size']` in place, so cells that
# expect the original string sizes (e.g. re-fitting the column transformer)
# must not be re-run after this one without reloading `pumpkins`.
pumpkins['Item Size'] = encoded_pumpkins['ord__Item Size']

# One horizontal box plot per variety (one facet row each), comparing the
# encoded item size across pumpkin colors.
g = sns.catplot(
    data=pumpkins,
    x='Item Size', y='Color', row='Variety',
    kind='box', orient='h',
    sharex=False, margin_titles=True,
    height=1.8, aspect=4, palette=palette
)
# Use the same x-range in every row; 0-6 covers the seven ordinal size codes.
g.set(xlabel='Item Size', ylabel='').set(xlim=(0,6))
g.set_titles(row_template="{row_name}")

In [36]:
from sklearn.model_selection import train_test_split

# Features are every column except the label. `Index.difference` returns the
# remaining column names in sorted order, which fixes the feature order.
feature_columns = encoded_pumpkins.columns.difference(['Color'])
X = encoded_pumpkins[feature_columns]
y = encoded_pumpkins['Color']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Number of feature columns after encoding.
len(X.columns)

In [None]:
from sklearn.metrics import f1_score, classification_report
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings (`fit` returns the estimator),
# then predict the held-out labels.
model = LogisticRegression().fit(X_train, y_train)
predictions = model.predict(X_test)

# Report per-class metrics, the raw predictions, and the F1 score.
report = classification_report(y_test, predictions)
print(report)
print('Predicted labels: ', predictions)
print('F1_score: ', f1_score(y_test, predictions))

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true labels against predicted labels on the test set.
confusion_matrix(y_true=y_test, y_pred=predictions)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# Class probabilities for the test set; column 1 holds the score for the
# class encoded as 1.
y_scores = model.predict_proba(X_test)

# False/true positive rates at each decision threshold. `thresholds` is not
# used by the plot below but is kept for inspection.
fpr, tpr, thresholds = roc_curve(y_test, y_scores[:,1])

fig = plt.figure(figsize=(6,6))
# Dashed diagonal: the reference line where TPR equals FPR.
plt.plot([0,1], [0,1], 'k--')
# ROC curve of the fitted model.
plt.plot(fpr, tpr)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.show()

In [None]:
# Area under the ROC curve, computed from the scores in column 1 of `y_scores`.
auc = roc_auc_score(y_true=y_test, y_score=y_scores[:, 1])
print(auc)

In [42]:
import pickle

# Persist the fitted model to disk. The file is opened in a `with` block so the
# handle is flushed and closed even if pickling raises.
model_filename = 'pumpkins-model.pkl'
with open(model_filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [48]:
# Persist the fitted column transformer and label encoder, one pickle file each.
for artifact_path, artifact in (
    ("pumpkins-preprocessor.pkl", ct),
    ("pumpkins-label-encoder.pkl", label_encoder),
):
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)