In [1]:
# Run the preprocessing notebook so the objects it creates are available in this kernel.
# Later cells use names that are not defined in the visible part of this notebook
# (e.g. `summary_data_avg`, `CustomTransformer`); presumably they come from this run — confirm.

%run PreProcessing.ipynb

Current Progress: 100.0 %


In [2]:
# Import required libraries.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mlxtend.evaluate import bias_variance_decomp
from sklearn.base import clone
from sklearn.compose import make_column_transformer
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR

In [3]:
# Predictor columns and the regression target taken from the averaged summary data.
predictor_columns = [
    'pp2m2', 'heat_units', 'solar_radiation', 'organic_matter',
    'method', 'variety', 'inputs', 'protection',
]

X = summary_data_avg[predictor_columns]
y = summary_data_avg['mean_diameter']

X.head()

Unnamed: 0,pp2m2,heat_units,solar_radiation,organic_matter,method,variety,inputs,protection
0,38,1128,1063757,24.6,Drilled,Krypton,Conventional,Barley
1,41,1394,1236720,24.6,Drilled,Krypton,Conventional,Barley
2,44,1630,1348259,24.6,Drilled,Krypton,Conventional,Barley
3,45,1967,1506984,24.6,Drilled,Krypton,Conventional,Barley
4,40,2256,1619377,24.6,Drilled,Krypton,Conventional,Barley


In [4]:
# Encode the categorical variables that will be used as predictors.
# `CustomTransformer` is not defined in the visible part of this notebook (presumably it
# comes from PreProcessing.ipynb); the 'ordinal' argument suggests ordinal encoding — confirm.
# NOTE(review): the saved output of this cell is an AttributeError about a missing
# `column_list` attribute. The original author noted that an error message appears but
# the code seems to work — verify on a fresh kernel before relying on the encoded `X`.

trns = CustomTransformer('ordinal').fit(X)
X = trns.transform(X)

X.head()

AttributeError: 'CustomTransformer' object has no attribute 'column_list'

In [None]:
# Split 'X' and 'y' into separate train and test sets (20% held out for testing).
# The fixed random_state keeps the split reproducible between runs.
# If there is little benefit from specific indicators they can be removed.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)
X_train.head()

In [None]:
# Preview the first rows of the held-out test features.
X_test.head()

In [None]:
# Use the isolation forest algorithm to remove the most rogue 5% of datapoints
# (the share is set by `contamination`).

print(f"The old training dataframe shape: {X_train.shape}")

clf1 = IsolationForest(max_samples=500, random_state=10, contamination=0.05)
preds = clf1.fit_predict(X_train)  # rows labelled -1 are treated as outliers below

# Count the rows flagged as outliers.
totalOutliers = sum(1 for label in preds if label == -1)
print("Total number of outliers identified is: ", totalOutliers)

# Remove outliers from training data.
# Do not remove from test data as this is bad practice.
# Confirm removal by checking new df shape.

# Boolean keep-mask: True for every row that was not labelled -1.
filtered_array = [bool(label != -1) for label in preds]
X_train = X_train[filtered_array]
y_train = y_train[filtered_array]

print(f"The new training dataframe shape: {X_train.shape}")

In [None]:
# Hypertuning model to find best parameters.
# Print the best qualitative parameters to use within the model.
#
# The scaler is placed inside the pipeline that GridSearchCV cross-validates, so it is
# re-fitted on the training part of every fold. Scaling the whole training set up front
# would let each validation fold influence the min/max used to scale its own training
# data. Because the search now runs over a pipeline, the SVR parameters are addressed
# with the 'svr__' step prefix (and the printed best-parameter keys carry that prefix).

param_grid_1 = {'svr__kernel': ['linear', 'rbf', 'sigmoid'],
                'svr__shrinking': [True, False]}

search_pipeline = Pipeline([('scaler', MinMaxScaler()), ('svr', SVR())])

grid_svr = GridSearchCV(search_pipeline, param_grid_1, refit = True, verbose = 3, n_jobs=-1, cv = 2)

grid_svr.fit(X_train, y_train)

print(grid_svr.best_params_)

In [None]:
# Fit the semi-hypertuned model (min-max scaling followed by an RBF-kernel SVR)
# and print its score on the test set.

svr = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('svr', SVR(kernel='rbf', shrinking=True)),
])

svr.fit(X_train, y_train)
print(svr.score(X_test, y_test))

In [None]:
def hypertune_eval(param, seq_min, seq_max, seq_int, model, log = False):
    """Sweep one hyperparameter of a pipeline's final step and plot its effect.

    For every candidate value a copy of ``model`` is refitted on the
    notebook-level ``X_train``/``y_train`` and evaluated on
    ``X_test``/``y_test``. Four panels are drawn: the test score returned by
    ``score`` and the mean squared error, bias and variance returned by
    ``bias_variance_decomp`` (10 bootstrap rounds, fixed seed).

    Parameters
    ----------
    param : str
        Name of the parameter to set on the final step (``model[-1]``).
    seq_min, seq_max : float
        Limits of the sweep. With ``log=True`` they are the exponents passed
        to ``np.logspace``; otherwise the start and (excluded) stop passed to
        ``np.arange``.
    seq_int : int or float
        Number of points when ``log=True``; step size otherwise.
    model : pipeline-like estimator
        Must support ``model[-1]``, ``fit`` and ``score``. The sweep runs on a
        clone, so the caller's object keeps its parameters and fitted state.
    log : bool, default False
        Use a logarithmically spaced sweep (and a log-scaled x axis).

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    if log:
        sequence = np.logspace(seq_min, seq_max, seq_int)
    else:
        sequence = np.arange(seq_min, seq_max, seq_int)

    # Work on a clone: the loop changes the swept parameter and
    # bias_variance_decomp refits whatever estimator it is given on bootstrap
    # resamples, which would otherwise leave the caller's model with the last
    # sweep value and a fit that no longer reflects the full training set.
    candidate = clone(model)

    scores = []
    bias = []
    var = []
    mse = []

    for value in sequence:
        candidate[-1].set_params(**{param: value})
        candidate.fit(X_train, y_train)
        scores.append(candidate.score(X_test, y_test))

        mse_decomp, bias_decomp, var_decomp = bias_variance_decomp(
            candidate,
            X_train.values, y_train.values,
            X_test.values, y_test.values,
            loss='mse', num_rounds=10, random_seed=1,
        )

        bias.append(bias_decomp)
        var.append(var_decomp)
        mse.append(mse_decomp)

    fig, axs = plt.subplots(2, 2, figsize=(8, 5), constrained_layout=True, sharex=True)

    fig.suptitle(f'Effect of Changing {param} Parameter')
    fig.supxlabel(f'{param}')

    # `score` of a regressor is the coefficient of determination, not an
    # accuracy, so the panel is labelled accordingly.
    axs[0,0].plot(sequence, scores)
    axs[0,0].set_ylabel('test score ($R^2$)')

    axs[0,1].plot(sequence, mse)
    axs[0,1].set_ylabel('mean squared error')

    axs[1,0].plot(sequence, bias)
    axs[1,0].set_ylabel('bias')

    axs[1,1].plot(sequence, var)
    axs[1,1].set_ylabel('variance')

    if log:
        # Log-spaced values bunch up at the left on a linear axis.
        for ax in axs.flat:
            ax.set_xscale('log')

In [None]:
# Optimisation of the SVR 'C' parameter: 8 log-spaced values from 10^0 to 10^2.1.

hypertune_eval('C', 0, 2.1, 8, svr, log=True)

In [None]:
# Fix C at the chosen optimum, refit on the training data and print the test-set score.
svr.set_params(svr__C=50)
svr.fit(X_train, y_train)
print(svr.score(X_test, y_test))

In [None]:
# Optimisation of the SVR 'gamma' parameter: 8 log-spaced values from 10^-3 to 10^0.
hypertune_eval('gamma', -3, 0, 8, svr, log=True)

In [None]:
# Set parameter to optimum, refit on the full training set and print the test-set score.
svr[-1].set_params(gamma = 0.4)
svr.fit(X_train, y_train)
print(svr.score(X_test, y_test))

# Take the test predictions now, from the model fitted on the full training set.
svr_pred = svr.predict(X_test)

# bias_variance_decomp refits the estimator it receives on bootstrap resamples of the
# training data. Passing a clone keeps `svr` (and therefore `svr_pred` and the score
# printed above) tied to the full-training-set fit instead of the last bootstrap round.
mse_decomp, bias_decomp, var_decomp = bias_variance_decomp(
    clone(svr),
    X_train.values, y_train.values,
    X_test.values, y_test.values,
    loss='mse', num_rounds=30, random_seed=1,
)

print(mse_decomp**0.5)  # root of the decomposed mean squared error
print(bias_decomp)
print(var_decomp)


In [None]:
# Join the 'X' and 'y' test data together with the predictions to make visualisations easier.
# Please note... only the test data.

test_feature_columns = [
    'pp2m2',
    'heat_units',
    'solar_radiation',
    'organic_matter',
    'method',
    'variety',
    'inputs',
    'protection',
]

# Every piece gets a fresh 0..n-1 index so the columns line up row by row.
X_test_df = pd.DataFrame(X_test, columns=test_feature_columns).reset_index(drop=True)
y_test_df = y_test.to_frame().rename(columns={0: 'mean_diameter'}).reset_index(drop=True)
pred_df = pd.DataFrame(svr_pred, columns=['predicted_mean_diameter']).reset_index(drop=True)

test_df = X_test_df.assign(
    predicted_mean_diameter=pred_df['predicted_mean_diameter'],
    mean_diameter=y_test_df['mean_diameter'],
)

# Undo the encoding applied by `trns` so the frame can be read and plotted.
test_df = trns.inverse_transform(test_df)
test_df.head()

In [None]:
# Check the dimensions (rows, columns) of the combined test dataframe.
test_df.shape

In [None]:
# Scatterplot to show how predicted mean compares to actual mean.
# Points are coloured by 'method' to see if any indicator variables have a bias.
# The red line passes through (0, 0) and (1, 1): points on it are perfect predictions.

fig, ax = plt.subplots()
sns.scatterplot(
    data=test_df,
    x='predicted_mean_diameter',
    y='mean_diameter',
    hue='method',
    ax=ax,
)
ax.axline((0, 0), (1, 1), color='r')
ax.set_ylim(0)
ax.set_xlim(0)
plt.show()