In [167]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score

# Load the pumpkins CSV file into a pandas DataFrame
# !!import the right pumkin csv
pumpkins = pd.read_csv('2-Regression/data/US-pumpkins.csv')
print(pumpkins.head())


   City Name Type       Package      Variety Sub Variety  Grade     Date  \
0  BALTIMORE  NaN  24 inch bins          NaN         NaN    NaN  4/29/17   
1  BALTIMORE  NaN  24 inch bins          NaN         NaN    NaN   5/6/17   
2  BALTIMORE  NaN  24 inch bins  HOWDEN TYPE         NaN    NaN  9/24/16   
3  BALTIMORE  NaN  24 inch bins  HOWDEN TYPE         NaN    NaN  9/24/16   
4  BALTIMORE  NaN  24 inch bins  HOWDEN TYPE         NaN    NaN  11/5/16   

   Low Price  High Price  Mostly Low  ...  Unit of Sale Quality Condition  \
0      270.0       280.0       270.0  ...           NaN     NaN       NaN   
1      270.0       280.0       270.0  ...           NaN     NaN       NaN   
2      160.0       160.0       160.0  ...           NaN     NaN       NaN   
3      160.0       160.0       160.0  ...           NaN     NaN       NaN   
4       90.0       100.0        90.0  ...           NaN     NaN       NaN   

  Appearance Storage  Crop Repack  Trans Mode  Unnamed: 24  Unnamed: 25  
0     

In [168]:
print(pumpkins.isnull().sum())

City Name             0
Type               1712
Package               0
Variety               5
Sub Variety        1461
Grade              1757
Date                  0
Low Price             0
High Price            0
Mostly Low          103
Mostly High         103
Origin                3
Origin District    1626
Item Size           279
Color               616
Environment        1757
Unit of Sale       1595
Quality            1757
Condition          1757
Appearance         1757
Storage            1757
Crop               1757
Repack                0
Trans Mode         1757
Unnamed: 24        1757
Unnamed: 25        1654
dtype: int64


In [169]:
# pumpkins['Year'] = pumpkins['Date'].dt.year
# pumpkins['Date'] = pd.to_datetime(pumpkins['Date'])
# pumpkins['Day'] = pumpkins['Date'].dt.day
# pumpkins['Month'] = pumpkins['Date'].dt.month
print(pumpkins.head())

   City Name Type       Package      Variety Sub Variety  Grade     Date  \
0  BALTIMORE  NaN  24 inch bins          NaN         NaN    NaN  4/29/17   
1  BALTIMORE  NaN  24 inch bins          NaN         NaN    NaN   5/6/17   
2  BALTIMORE  NaN  24 inch bins  HOWDEN TYPE         NaN    NaN  9/24/16   
3  BALTIMORE  NaN  24 inch bins  HOWDEN TYPE         NaN    NaN  9/24/16   
4  BALTIMORE  NaN  24 inch bins  HOWDEN TYPE         NaN    NaN  11/5/16   

   Low Price  High Price  Mostly Low  ...  Unit of Sale Quality Condition  \
0      270.0       280.0       270.0  ...           NaN     NaN       NaN   
1      270.0       280.0       270.0  ...           NaN     NaN       NaN   
2      160.0       160.0       160.0  ...           NaN     NaN       NaN   
3      160.0       160.0       160.0  ...           NaN     NaN       NaN   
4       90.0       100.0        90.0  ...           NaN     NaN       NaN   

  Appearance Storage  Crop Repack  Trans Mode  Unnamed: 24  Unnamed: 25  
0     

In [170]:
pumpkins.drop(['Type','Origin','Environment', 'Origin District', 'Item Size', 'Color','City Name', 'Sub Variety', 'Grade', 'Date', 'Unit of Sale', 'Quality', 'Condition', 'Appearance', 'Storage', 'Crop', 'Repack', 'Trans Mode', 'Unnamed: 24', 'Unnamed: 25'], axis=1, inplace=True)
print(pumpkins.head())

        Package      Variety  Low Price  High Price  Mostly Low  Mostly High
0  24 inch bins          NaN      270.0       280.0       270.0        280.0
1  24 inch bins          NaN      270.0       280.0       270.0        280.0
2  24 inch bins  HOWDEN TYPE      160.0       160.0       160.0        160.0
3  24 inch bins  HOWDEN TYPE      160.0       160.0       160.0        160.0
4  24 inch bins  HOWDEN TYPE       90.0       100.0        90.0        100.0


In [171]:
# good for time series data
pumpkins.interpolate(inplace=True)


   Package  Variety  Low Price  High Price  Mostly Low  Mostly High
0        5       -1      270.0       280.0       270.0        280.0
1        5       -1      270.0       280.0       270.0        280.0
2        5        4      160.0       160.0       160.0        160.0
3        5        4      160.0       160.0       160.0        160.0
4        5        4       90.0       100.0        90.0        100.0


In [None]:
# clean and standardize the data
# pumpkins.dropna(inplace=True)
pumpkins['Variety'] = pd.Categorical(pumpkins['Variety'])
pumpkins['Variety'] = pumpkins['Variety'].cat.codes
pumpkins['Package'] = pd.Categorical(pumpkins['Package'])
pumpkins['Package'] = pumpkins['Package'].cat.codes

print(pumpkins.head())


In [172]:
print(pumpkins.dtypes)
print(pumpkins.columns)


Package           int8
Variety           int8
Low Price      float64
High Price     float64
Mostly Low     float64
Mostly High    float64
dtype: object
Index(['Package', 'Variety', 'Low Price', 'High Price', 'Mostly Low',
       'Mostly High'],
      dtype='object')


In [173]:
# pumpkins.dropna(inplace=True)
# pumpkins['Package'] = pd.to_numeric(pumpkins['Package'], errors='coerce')
# print(np.isfinite(pumpkins).sum())
print(np.isnan(pumpkins).sum())

pumpkins.replace([np.inf, -np.inf], np.nan, inplace=True)
pumpkins.fillna(pumpkins.mean(), inplace=True)
print(pumpkins.head())


Package        0
Variety        0
Low Price      0
High Price     0
Mostly Low     0
Mostly High    0
dtype: int64
   Package  Variety  Low Price  High Price  Mostly Low  Mostly High
0        5       -1      270.0       280.0       270.0        280.0
1        5       -1      270.0       280.0       270.0        280.0
2        5        4      160.0       160.0       160.0        160.0
3        5        4      160.0       160.0       160.0        160.0
4        5        4       90.0       100.0        90.0        100.0


In [174]:

print(pumpkins.shape)
# rows, columns (1757, 6)


(1757, 6)


In [175]:

scaler = StandardScaler()
pumpkins = pd.DataFrame(scaler.fit_transform(pumpkins), columns=pumpkins.columns)


In [176]:

# Split the data into training and testing sets
X = pumpkins.drop('Variety', axis=1)
y = pumpkins['Variety']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.dtypes)

Package        float64
Low Price      float64
High Price     float64
Mostly Low     float64
Mostly High    float64
dtype: object


In [181]:
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
# fill missing values with mean column values
X_train.fillna(X_train.mean(), inplace=True)
print(np.isfinite(X_train).sum())
print(np.isnan(X_train).sum())
print(X_train.shape)
# check balance of classes
print(X_train.value_counts())

Package        1405
Low Price      1405
High Price     1405
Mostly Low     1405
Mostly High    1405
dtype: int64
Package        0
Low Price      0
High Price     0
Mostly Low     0
Mostly High    0
dtype: int64
(1405, 5)
Package    Low Price  High Price  Mostly Low  Mostly High
-0.154668   0.899537   0.748932    0.891396    0.825557      58
 0.579797   0.303254   0.190270    0.323366    0.269402      40
-0.154668   1.615078   1.698659    1.857047    1.771020      35
            0.303254   0.190270    0.323366    0.269402      31
 0.579797   0.422511   0.302002    0.436972    0.380633      31
                                                             ..
            0.541767   0.972397    0.891396    0.825557       1
 0.212564  -1.109938  -1.128174   -1.022865   -1.043124       1
           -1.103975  -1.128174   -1.017185   -1.043124       1
 0.579797   0.541767   0.525467    0.550578    0.603095       1
           -0.293030  -0.144928   -0.244664   -0.064291       1
Name: count, Leng

In [180]:
print(np.isfinite(y_train).sum())
print(np.isnan(y_train).sum())
print(y_train.shape)
print(y_train.value_counts())

1405
0
(1405,)
Variety
-0.618782    431
 1.246054    370
 0.500120    249
-0.991749    105
-1.364716     62
-2.110651     61
 0.873087     52
-0.245814     40
 0.127153     17
-1.737683     13
-2.483618      5
Name: count, dtype: int64


In [182]:

# Create a Logistic Regression model
# C parameter controls the amount of regularization strength of model, 
# can use instead different models, decision tree, random forest, etc.
log_reg = LogisticRegression(C=0.1, solver='lbfgs', max_iter=1000)

# Train the model on the training set
log_reg.fit(X_train, y_train)
# ValueError 5 log_reg.fit(X_train, y_train)


ValueError: Unknown label type: 'continuous'

In [None]:

# make predictions on the testing set
y_pred = log_reg.predict(X_test)

# evaluate the performance of the model using a confusion matrix and ROC curve
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

y_pred_prob = log_reg.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

auc_score = roc_auc_score(y_test, y_pred_prob)
print('AUC Score:', auc_score)
