In [3]:
# Dependencies
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical
from sklearn import preprocessing
import tensorflow
# Display the Keras version bundled with TensorFlow so the run records which
# API version the cells below were executed against.
tensorflow.keras.__version__

'2.4.0'

In [4]:
# Read data
# Load the winery / vintage / weather table from the project-relative
# Resources folder and preview the first rows.
weather = pd.read_csv('Resources/dataLargerSet-Copy1.csv')
weather.head()

Unnamed: 0.1,Unnamed: 0,index,winery,vintage,points,cities,MarchLowTemp,MarchHighTemp,MarchAvgTemp,MarchPrecip,...,JulyAvgTemp,JulyPrecip,JulyCloudiness,JulyHumidity,AugLowTemp,AugHighTemp,AugAvgTemp,AugPrecip,AugCloudiness,AugHumidity
0,0,6,Claiborne & Churchill,2009,88,San Luis Obispo,42.129032,68.419355,58.580645,1.5,...,82.935484,0.0,2.612903,41.612903,56.741935,89.129032,79.903226,0.1,4.419355,44.0
1,1,7,Claiborne & Churchill,2011,87,San Luis Obispo,43.806452,62.16129,55.258065,9.4,...,72.193548,0.1,1.83871,62.0,61.741935,79.16129,73.806452,0.0,2.548387,57.0
2,2,8,Claiborne & Churchill,2013,94,San Luis Obispo,47.354839,68.548387,60.516129,1.2,...,73.129032,0.2,5.322581,61.354839,61.290323,80.0,73.806452,0.0,2.516129,59.806452
3,3,9,Claiborne & Churchill,2014,93,San Luis Obispo,48.612903,68.322581,61.129032,1.8,...,74.677419,0.3,10.032258,63.612903,62.096774,80.290323,73.967742,0.0,2.612903,63.774194
4,4,10,Claiborne & Churchill,2014,91,San Luis Obispo,48.612903,68.322581,61.129032,1.8,...,74.677419,0.3,10.032258,63.612903,62.096774,80.290323,73.967742,0.0,2.612903,63.774194


In [5]:
# Clean data set: discard the two leftover index-like columns carried in by the CSV.
weatherClean = weather.drop(columns=["Unnamed: 0", "index"])
weatherClean.head()

Unnamed: 0,winery,vintage,points,cities,MarchLowTemp,MarchHighTemp,MarchAvgTemp,MarchPrecip,MarchCloudiness,MarchHumidity,...,JulyAvgTemp,JulyPrecip,JulyCloudiness,JulyHumidity,AugLowTemp,AugHighTemp,AugAvgTemp,AugPrecip,AugCloudiness,AugHumidity
0,Claiborne & Churchill,2009,88,San Luis Obispo,42.129032,68.419355,58.580645,1.5,12.870968,61.387097,...,82.935484,0.0,2.612903,41.612903,56.741935,89.129032,79.903226,0.1,4.419355,44.0
1,Claiborne & Churchill,2011,87,San Luis Obispo,43.806452,62.16129,55.258065,9.4,35.032258,76.903226,...,72.193548,0.1,1.83871,62.0,61.741935,79.16129,73.806452,0.0,2.548387,57.0
2,Claiborne & Churchill,2013,94,San Luis Obispo,47.354839,68.548387,60.516129,1.2,21.387097,61.677419,...,73.129032,0.2,5.322581,61.354839,61.290323,80.0,73.806452,0.0,2.516129,59.806452
3,Claiborne & Churchill,2014,93,San Luis Obispo,48.612903,68.322581,61.129032,1.8,22.290323,64.258065,...,74.677419,0.3,10.032258,63.612903,62.096774,80.290323,73.967742,0.0,2.612903,63.774194
4,Claiborne & Churchill,2014,91,San Luis Obispo,48.612903,68.322581,61.129032,1.8,22.290323,64.258065,...,74.677419,0.3,10.032258,63.612903,62.096774,80.290323,73.967742,0.0,2.612903,63.774194


In [6]:
# Points into bins 'pointsRange'
# np.digitize uses half-open intervals [edge_i, edge_i+1), so the last labelled
# range needs an upper edge of 101 for a perfect score of 100 to land in
# '95-100' rather than spilling into the overflow bucket '100+'.
bins = [0, 80, 85, 90, 95, 101]
names = ['<80','80-84', '85-89', '90-94', '95-100', '100+']

# digitize returns 1 for the first interval, so labels are keyed starting at 1.
d = dict(enumerate(names, 1))

weatherClean['pointsRange'] = np.vectorize(d.get)(np.digitize(weatherClean['points'], bins))

weatherClean.head()

Unnamed: 0,winery,vintage,points,cities,MarchLowTemp,MarchHighTemp,MarchAvgTemp,MarchPrecip,MarchCloudiness,MarchHumidity,...,JulyPrecip,JulyCloudiness,JulyHumidity,AugLowTemp,AugHighTemp,AugAvgTemp,AugPrecip,AugCloudiness,AugHumidity,pointsRange
0,Claiborne & Churchill,2009,88,San Luis Obispo,42.129032,68.419355,58.580645,1.5,12.870968,61.387097,...,0.0,2.612903,41.612903,56.741935,89.129032,79.903226,0.1,4.419355,44.0,85-89
1,Claiborne & Churchill,2011,87,San Luis Obispo,43.806452,62.16129,55.258065,9.4,35.032258,76.903226,...,0.1,1.83871,62.0,61.741935,79.16129,73.806452,0.0,2.548387,57.0,85-89
2,Claiborne & Churchill,2013,94,San Luis Obispo,47.354839,68.548387,60.516129,1.2,21.387097,61.677419,...,0.2,5.322581,61.354839,61.290323,80.0,73.806452,0.0,2.516129,59.806452,90-94
3,Claiborne & Churchill,2014,93,San Luis Obispo,48.612903,68.322581,61.129032,1.8,22.290323,64.258065,...,0.3,10.032258,63.612903,62.096774,80.290323,73.967742,0.0,2.612903,63.774194,90-94
4,Claiborne & Churchill,2014,91,San Luis Obispo,48.612903,68.322581,61.129032,1.8,22.290323,64.258065,...,0.3,10.032258,63.612903,62.096774,80.290323,73.967742,0.0,2.612903,63.774194,90-94


In [156]:
# Visualize data: number of distinct rows falling into each points category
weatherClean.drop_duplicates()['pointsRange'].value_counts()

85-89     564
90-94     552
80-84      89
95-100     37
Name: pointsRange, dtype: int64

In [157]:
# Visualize data: list the column names of the cleaned frame
# (identifier columns, monthly weather measurements, and the derived pointsRange).
weatherClean.columns

Index(['winery', 'vintage', 'points', 'cities', 'MarchLowTemp',
       'MarchHighTemp', 'MarchAvgTemp', 'MarchPrecip', 'MarchCloudiness',
       'MarchHumidity', 'AprilLowTemp', 'AprilHighTemp', 'AprilAvgTemp',
       'AprilPrecip', 'AprilCloudiness', 'AprilHumidity', 'MayLowTemp',
       'MayHighTemp', 'MayAvgTemp', 'MayPrecip', 'MayCloudiness',
       'MayHumidity', 'JuneLowTemp', 'JuneHighTemp', 'JuneAvgTemp',
       'JunePrecip', 'JuneCloudiness', 'JuneHumidity', 'JulyLowTemp',
       'JulyHighTemp', 'JulyAvgTemp', 'JulyPrecip', 'JulyCloudiness',
       'JulyHumidity', 'AugLowTemp', 'AugHighTemp', 'AugAvgTemp', 'AugPrecip',
       'AugCloudiness', 'AugHumidity', 'pointsRange'],
      dtype='object')

In [10]:
# Data Pre-Processing: separate the weather features (X) from the target category (y).
# Identifier columns, the raw score and the target itself are excluded from the features.
non_feature_columns = ['winery', 'vintage', 'points', 'cities', 'pointsRange']
X = weatherClean.drop(columns=non_feature_columns)
y = weatherClean.loc[:, 'pointsRange']

print(X.shape, y.shape)

(1328, 36) (1328,)


In [11]:
# Split into training (75%) and testing (25%) cohorts; 0.25 is the
# train_test_split default, spelled out here so the ratio is visible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

# Normalize features to [0, 1]. The scaler learns its ranges from the training
# cohort only and the same transformation is then applied to the test cohort.
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Check scaled shape of each data set
print(X_train_scaled.shape, X_test_scaled.shape)

(996, 36) (332, 36)


In [12]:
# Step 1: Label-encode the target categories.
# The encoder is fitted on the full label column so that a rare category that
# happens to fall only into the test cohort does not make transform() fail.
label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Step 2: Convert encoded labels to one-hot encoding.
# num_classes is passed explicitly so both cohorts get the same number of
# columns even if the highest-numbered class is missing from one of them.
num_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(encoded_y_train, num_classes=num_classes)
y_test_categorical = to_categorical(encoded_y_test, num_classes=num_classes)

print(y_train_categorical.shape, y_test_categorical.shape)

(996, 4) (332, 4)


In [13]:
# Create Deep Learning Model: a fully connected classifier with two hidden layers.

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()

# Only the first layer declares the input width; later layers infer theirs
# from the layer before them.
model.add(Dense(units=100, activation='relu', input_dim=X_train_scaled.shape[1]))
model.add(Dense(units=100, activation='relu'))

# One softmax output per target category, sized from the one-hot labels so the
# model stays in sync with the encoding step.
model.add(Dense(units=y_train_categorical.shape[1], activation='softmax'))

In [14]:
# Configure the optimizer, the multi-class loss and the metric to report.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train on the scaled training cohort; verbose=2 prints one line per epoch.
history = model.fit(
    x=X_train_scaled,
    y=y_train_categorical,
    epochs=250,
    verbose=2,
    shuffle=True,
)

Epoch 1/250
32/32 - 1s - loss: 1.1250 - accuracy: 0.4187
Epoch 2/250
32/32 - 0s - loss: 1.0315 - accuracy: 0.4629
Epoch 3/250
32/32 - 0s - loss: 1.0203 - accuracy: 0.4689
Epoch 4/250
32/32 - 0s - loss: 1.0136 - accuracy: 0.4900
Epoch 5/250
32/32 - 0s - loss: 1.0072 - accuracy: 0.5131
Epoch 6/250
32/32 - 0s - loss: 1.0048 - accuracy: 0.5131
Epoch 7/250
32/32 - 0s - loss: 1.0044 - accuracy: 0.5171
Epoch 8/250
32/32 - 0s - loss: 0.9986 - accuracy: 0.5030
Epoch 9/250
32/32 - 0s - loss: 0.9975 - accuracy: 0.5161
Epoch 10/250
32/32 - 0s - loss: 0.9912 - accuracy: 0.5432
Epoch 11/250
32/32 - 0s - loss: 1.0030 - accuracy: 0.4970
Epoch 12/250
32/32 - 0s - loss: 0.9952 - accuracy: 0.5291
Epoch 13/250
32/32 - 0s - loss: 0.9942 - accuracy: 0.5181
Epoch 14/250
32/32 - 0s - loss: 0.9853 - accuracy: 0.5382
Epoch 15/250
32/32 - 0s - loss: 0.9848 - accuracy: 0.5472
Epoch 16/250
32/32 - 0s - loss: 0.9846 - accuracy: 0.5432
Epoch 17/250
32/32 - 0s - loss: 0.9837 - accuracy: 0.5361
Epoch 18/250
32/32 - 0s

Epoch 142/250
32/32 - 0s - loss: 0.8975 - accuracy: 0.5863
Epoch 143/250
32/32 - 0s - loss: 0.8921 - accuracy: 0.5894
Epoch 144/250
32/32 - 0s - loss: 0.8973 - accuracy: 0.5803
Epoch 145/250
32/32 - 0s - loss: 0.8866 - accuracy: 0.5924
Epoch 146/250
32/32 - 0s - loss: 0.8949 - accuracy: 0.5904
Epoch 147/250
32/32 - 0s - loss: 0.8943 - accuracy: 0.5934
Epoch 148/250
32/32 - 0s - loss: 0.8906 - accuracy: 0.5894
Epoch 149/250
32/32 - 0s - loss: 0.8939 - accuracy: 0.5873
Epoch 150/250
32/32 - 0s - loss: 0.8930 - accuracy: 0.5994
Epoch 151/250
32/32 - 0s - loss: 0.8875 - accuracy: 0.5984
Epoch 152/250
32/32 - 0s - loss: 0.9008 - accuracy: 0.5944
Epoch 153/250
32/32 - 0s - loss: 0.8923 - accuracy: 0.5884
Epoch 154/250
32/32 - 0s - loss: 0.9021 - accuracy: 0.5733
Epoch 155/250
32/32 - 0s - loss: 0.8898 - accuracy: 0.5904
Epoch 156/250
32/32 - 0s - loss: 0.8859 - accuracy: 0.5884
Epoch 157/250
32/32 - 0s - loss: 0.8877 - accuracy: 0.6084
Epoch 158/250
32/32 - 0s - loss: 0.8926 - accuracy: 0.59

In [15]:
# Score the trained network on the held-out test cohort.
# evaluate() returns the loss followed by the compiled metric (accuracy).
evaluation = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
model_loss, model_accuracy = evaluation
print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

11/11 - 0s - loss: 0.9594 - accuracy: 0.5663
Normal Neural Network - Loss: 0.9594460129737854, Accuracy: 0.5662650465965271


In [16]:
# Predict a class index for every test row by taking the most probable softmax
# output. Sequential.predict_classes is deprecated and absent from newer
# TensorFlow releases; argmax over predict() is the documented replacement for
# a multi-class softmax model.
encoded_predictions = np.argmax(model.predict(X_test_scaled), axis=-1)

# Map the class indices back to their original category labels.
prediction_labels = label_encoder.inverse_transform(encoded_predictions).tolist()




In [17]:
# Plain Python list of the true test labels, so they can be compared
# position by position with the predicted labels.
y_test1 = list(y_test)

<class 'list'>


In [18]:
# Compare for percent accuracy: count the positions where the predicted label
# equals the true label, then print the count and the percentage.
numRight = sum(
    1
    for predicted, actual in zip(prediction_labels, y_test1)
    if predicted == actual
)
print(numRight, (100 * numRight / len(y_test1)))

188 56.626506024096386


In [19]:
# Compare Test with Predictions: true labels (keeping their original row index)
# side by side with the predicted labels.
dfAccur = y_test.to_frame(name="y_test").assign(y_predictions=prediction_labels)
dfAccur.head(100)


Unnamed: 0,y_test,y_predictions
452,90-94,85-89
705,90-94,90-94
989,85-89,85-89
115,85-89,85-89
1152,85-89,85-89
...,...,...
321,85-89,90-94
1231,90-94,90-94
1047,85-89,85-89
111,85-89,85-89
