In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

In [None]:
# Read the raw spreadsheet into a DataFrame and display it.
# NOTE(review): the path begins with '...' rather than '..' - confirm it resolves on a fresh checkout.
dataset = pd.read_excel(
    '.../data/RFdata.xlsx'
)
dataset

In [None]:
# Keep only the rows that have a value in the target column
# (row-wise, drop-on-any-missing are the dropna defaults).
dataset1 = dataset.dropna(subset=['phothermalerro_rev-til'])
dataset1.head()

In [None]:
# Preview the column at position 54, the same position later read as the target `y`.
dataset1.iloc[:, [54]].head()

In [None]:
# Preview the nine columns (by position) that are later selected as the model inputs `x`.
dataset1.iloc[:, [2,3,4,11,12,13,14,15,16]].head()

In [None]:
# Build the model inputs by column position.
# Target (dependent variable): error days, column 54.
# Features (independent variables), per the original notes: latitude, longitude, altitude,
#   average daily / maximum / minimum temperature, average sunshine hours, average rainfall,
#   and high-temperature-stress / low-temperature-damage days.
y = dataset1.iloc[:, 54].values
x = dataset1.iloc[:, [2, 3, 4, 11, 12, 13, 14, 15, 16]].values

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)
print(x_train.shape)  # (rows, columns) of the training features
print(x_test.shape)   # (rows, columns) of the test features

In [None]:
# Baseline forest with 200 trees; fit() returns the estimator itself, so it can be chained.
regressor = RandomForestRegressor(random_state=0, n_estimators=200).fit(x_train, y_train)
# Baseline predictions on the held-out rows.
y_pred = regressor.predict(x_test)

In [None]:
from sklearn.metrics import roc_auc_score
# Hold-out error metrics for the baseline forest (`regressor`).
# The MSE is computed once and reused for the RMSE line.
baseline_mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', baseline_mse)
print('Root Mean Squared Error:', np.sqrt(baseline_mse))
# Training-set predictions of the baseline model; not used by the metrics printed here.
train_pred = regressor.predict(x_train)
print('R^2:', metrics.r2_score(y_test, y_pred))

In [None]:
# Candidate values for the randomized hyper-parameter search of the random forest.

# Number of trees: 150 evenly spaced values between 200 and 500, truncated to int
rf_n_estimators = [int(value) for value in np.linspace(200, 500, 150)]
# Maximum number of levels in a tree: 11 values between 5 and 20 ...
rf_max_depth = [int(value) for value in np.linspace(5, 20, 11)]
# ... plus the library default (no depth limit)
rf_max_depth.append(None)
# Number of features to consider at every split.
# 1.0 (= all features) replaces the former 'auto' option, which meant the same thing
# for regressors and has been removed from recent scikit-learn releases.
rf_max_features = [1.0, 'sqrt', 'log2']
# Criterion to split on. 'squared_error' / 'absolute_error' are the current names of the
# former 'mse' / 'mae' options (renamed in scikit-learn 1.0, old names since removed).
rf_criterion = ['squared_error', 'absolute_error']
# Minimum number of samples required to split a node: 2..10
rf_min_samples_split = [int(value) for value in np.linspace(2, 10, 9)]
# Minimum decrease in impurity required for a split to happen
rf_min_impurity_decrease = [0.0, 0.05, 0.1]
# Whether each tree is trained on a bootstrap sample
rf_bootstrap = [True, False]
# Seeds offered to the search
rf_random_state = [0, 42]
# Assemble the search space, keyed by RandomForestRegressor parameter name
rf_grid = {
    'n_estimators': rf_n_estimators,
    'max_depth': rf_max_depth,
    'max_features': rf_max_features,
    'criterion': rf_criterion,
    'min_samples_split': rf_min_samples_split,
    'min_impurity_decrease': rf_min_impurity_decrease,
    'bootstrap': rf_bootstrap,
    'random_state': rf_random_state,
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Untuned estimator whose parameters the search will vary
rf_base = RandomForestRegressor()
# Randomized search: 200 sampled parameter sets, 3-fold cross-validation, all CPU cores
rf_random = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=rf_grid,
    n_iter=200,
    cv=3,
    verbose=2,
    random_state=0,
    n_jobs=-1,
)
# Run the search on the training split only
rf_random.fit(x_train, y_train)
# Best parameter combination found
rf_random.best_params_

In [None]:
from sklearn.linear_model import LinearRegression
# Final multiple linear regression, used as the comparison model
mlr_final = LinearRegression()
# Final random forest with hard-coded tuned hyper-parameters.
# 'squared_error' is the current name of the former 'mse' criterion, which has been
# removed from recent scikit-learn releases; the fitted model is the same.
rf_final = RandomForestRegressor(
    n_estimators=258,
    min_samples_split=6,
    min_impurity_decrease=0.0,
    max_features='sqrt',
    max_depth=5,
    criterion='squared_error',
    bootstrap=True,
    random_state=42,
)
# Train both models on the training split (70% of the rows)
mlr_final.fit(x_train, y_train)
rf_final.fit(x_train, y_train)

In [None]:
# R^2 of the tuned forest on the data it was trained on
train_pred = rf_final.predict(x_train)
metrics.r2_score(y_true=y_train, y_pred=train_pred)

In [None]:
# Mean squared error of the tuned forest on the held-out rows
test_pred = rf_final.predict(x_test)
mse_test = metrics.mean_squared_error(y_true=y_test, y_pred=test_pred)
mse_test

In [None]:
## Define a function that compares all final models
def final_comparison(models, test_features, test_labels):
    """Score several fitted regressors on the same hold-out data.

    Parameters
    ----------
    models : iterable
        Fitted estimators exposing ``predict``.
    test_features : array-like
        Feature matrix handed to each model's ``predict``.
    test_labels : array-like
        True target values. This input is never modified.

    Returns
    -------
    pandas.DataFrame
        One column per model (named ``str(model)``) and the rows
        'Mean Absolute Error', 'Mean Squared Error', 'R^2' and 'Accuracy',
        where 'Accuracy' is ``100 - MAPE`` (in percent). Values are rounded
        to 4 decimals.
    """
    labels = np.asarray(test_labels)
    # MAPE denominator: magnitude of the true value, with zeros replaced by 1 so the
    # ratio stays finite. It is built as a separate array so the caller's labels are
    # left untouched and every model is scored against the same targets.
    denominators = np.where(labels == 0, 1, np.abs(labels))

    row_names = ['Mean Absolute Error', 'Mean Squared Error', 'R^2', 'Accuracy']
    columns = {}
    for model in models:
        predictions = np.asarray(model.predict(test_features))
        mae = round(metrics.mean_absolute_error(labels, predictions), 4)
        mse = round(metrics.mean_squared_error(labels, predictions), 4)
        r2 = round(metrics.r2_score(labels, predictions), 4)
        errors = np.abs(predictions - labels)
        mape = 100 * np.mean(errors / denominators)
        accuracy = round(100 - mape, 4)
        columns[str(model)] = [mae, mse, r2, accuracy]
    # Build the frame once, with the row labels attached
    return pd.DataFrame(columns, index=row_names)

In [None]:
# Score the two final models (linear regression and tuned random forest) on the test split
final_scores = final_comparison(
    models=[mlr_final, rf_final],
    test_features=x_test,
    test_labels=y_test,
)
# Replace the estimator reprs with readable column headers (same order as the model list)
final_scores.columns = ['Linear Regression', 'Random Forest']
final_scores

In [None]:
# Per-feature importance scores of the fitted final random forest,
# in the same order as the feature columns it was trained on.
rf_final.feature_importances_

In [None]:
## Get numerical feature importances
importances = list(rf_final.feature_importances_)
print(importances)
# Display names for the model inputs, one per feature column and in the same order.
# NOTE(review): these labels are assumed to match column positions
# [2, 3, 4, 11, 12, 13, 14, 15, 16] of the dataset in this order - confirm against the sheet.
feature_list = ['lat','lon','alt','MeanTmax_Rev-Til','MeanTmin_Rev-Til','MeanTave_Rev-Til',
                'MeanSunhours_Rev-Til','MeanRain_Rev-Til','MeanHumidity_Rev-Til']
# zip() would silently drop entries on a length mismatch, so fail loudly instead
assert len(feature_list) == len(importances), 'feature_list must name every model feature'

# Pair every feature name with its importance, rounded to 2 decimals
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key=lambda pair: pair[1], reverse=True)
# Print one line per feature; a plain loop avoids leaving a list of None as the cell output
for feature, importance in feature_importances:
    print('Variable: {:20} Importance: {}'.format(feature, importance))

In [None]:
# Bar chart of the per-feature importances
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
# One bar position per feature
x_values = [*range(len(importances))]
plt.bar(x_values, importances, orientation='vertical')
# Feature names as rotated x tick labels; enlarge the y tick labels to match
plt.xticks(x_values, feature_list, rotation=90, fontsize=13)
plt.yticks(fontsize=13)
# Axis labels and title
plt.ylabel('Importance', fontsize=15)
plt.xlabel('Variable', fontsize=15)
plt.title('Variable Importances', fontsize=15)
plt.show()