In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.linear_model import perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Data Cleaning

In [18]:
excel = pd.ExcelFile('HainanData_kindofClean.xlsx')
hainan = excel.parse("Clean")
hainan.columns = hainan.columns.str.replace('\s+', '_')
hainan.columns = hainan.columns.str.replace('(', '')
hainan.columns = hainan.columns.str.replace(')', '')
hainan.drop(['Day', 'Year', 'Water/m3', 'Total_electricity_cons_kWh', 
             '50%_NaOH/kg', 'FeCl2/kg', 'PAM/kg', 'Defoamer/kg', 'day_#2'], axis = 1, inplace = True)
d = {'January':1, 'February':2, 'March':3, 'April':4, 'May':5, 'June':6,
     'July':7, 'August':8, 'September':9, 'October':10, 'November':11, 'December':12}
hainan.Month = hainan.Month.map(d)
hainan.BioCNG_Produced_Nm3 = hainan.BioCNG_Produced_Nm3.shift(-15)
hainan.drop(hainan.tail(15).index,inplace=True)
hainan = hainan[np.isfinite(hainan['Month'])]
hainan['BioCNG_cumsum'] = hainan.BioCNG_Produced_Nm3.cumsum()
print(hainan.columns)

Index(['Month', 'Month_#', 'Day_#', 'Raw_Biogas_Produced_m3',
       'BioCNG_Produced_Nm3', 'BioCNG_Sold_m3', 'Liquid_Fertilizer_Produced_t',
       'Liquid_fertilizer_sold_t', 'Solid_fertilizer_produced_t',
       'Solid_fertilizer_sold_t', 'Manure_input_t', 'Bagasse_1_input_t',
       'Lees_fermentation', 'Fish_waste_input_t', 'Alcowaste_input_t',
       'Cassava_input_t', 'Grass_input_t', 'Tea_waste_input_t',
       'Other_input_t', 'Total_input_t', 'Diesel_cons/L', 'BioCNG_cumsum'],
      dtype='object')


# Preliminary Work

In [19]:
hainan_train, hainan_test = train_test_split(hainan, test_size=0.2)
print('train data len:',len(hainan_train))
print('test data len:',len(hainan_test))

train data len: 814
test data len: 204


In [20]:
#ols
hainan_ols = ols("BioCNG_cumsum ~ Manure_input_t + Bagasse_1_input_t + Lees_fermentation +\
                    Fish_waste_input_t + Alcowaste_input_t + Cassava_input_t + Grass_input_t +\
                    Tea_waste_input_t + Other_input_t ", data=hainan_train).fit()
hainan_ols_summary = hainan_ols.summary()
hainan_ols_summary

0,1,2,3
Dep. Variable:,BioCNG_cumsum,R-squared:,0.386
Model:,OLS,Adj. R-squared:,0.379
Method:,Least Squares,F-statistic:,56.09
Date:,"Fri, 02 Mar 2018",Prob (F-statistic):,3.5500000000000006e-79
Time:,18:10:35,Log-Likelihood:,-12264.0
No. Observations:,814,AIC:,24550.0
Df Residuals:,804,BIC:,24600.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.363e+06,5.76e+04,23.674,0.000,1.25e+06,1.48e+06
Manure_input_t,4.035e+04,3667.781,11.000,0.000,3.31e+04,4.75e+04
Bagasse_1_input_t,-1.522e+04,1976.150,-7.701,0.000,-1.91e+04,-1.13e+04
Lees_fermentation,-1.428e+04,4385.996,-3.256,0.001,-2.29e+04,-5671.519
Fish_waste_input_t,1143.9100,5417.542,0.211,0.833,-9490.287,1.18e+04
Alcowaste_input_t,1412.3946,8052.111,0.175,0.861,-1.44e+04,1.72e+04
Cassava_input_t,-1200.2259,1177.602,-1.019,0.308,-3511.762,1111.311
Grass_input_t,1.371e+04,3935.271,3.485,0.001,5988.451,2.14e+04
Tea_waste_input_t,2.3e+04,1.89e+04,1.217,0.224,-1.41e+04,6.01e+04

0,1,2,3
Omnibus:,47.36,Durbin-Watson:,2.029
Prob(Omnibus):,0.0,Jarque-Bera (JB):,24.512
Skew:,0.247,Prob(JB):,4.76e-06
Kurtosis:,2.309,Cond. No.,102.0


In [21]:
#ols
hainan_ols = ols("Liquid_Fertilizer_Produced_t ~ Manure_input_t + Bagasse_1_input_t + Lees_fermentation +\
                    Fish_waste_input_t + Alcowaste_input_t + Cassava_input_t + Grass_input_t +\
                    Tea_waste_input_t + Other_input_t ", data=hainan_train).fit()
hainan_ols_summary = hainan_ols.summary()
hainan_ols_summary

0,1,2,3
Dep. Variable:,Liquid_Fertilizer_Produced_t,R-squared:,0.079
Model:,OLS,Adj. R-squared:,0.068
Method:,Least Squares,F-statistic:,7.623
Date:,"Fri, 02 Mar 2018",Prob (F-statistic):,8.74e-11
Time:,18:10:35,Log-Likelihood:,-4404.6
No. Observations:,814,AIC:,8829.0
Df Residuals:,804,BIC:,8876.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,21.8417,3.688,5.922,0.000,14.602,29.081
Manure_input_t,0.9273,0.235,3.946,0.000,0.466,1.389
Bagasse_1_input_t,0.0719,0.127,0.568,0.570,-0.177,0.320
Lees_fermentation,0.2768,0.281,0.985,0.325,-0.275,0.828
Fish_waste_input_t,1.0831,0.347,3.121,0.002,0.402,1.764
Alcowaste_input_t,-0.3898,0.516,-0.756,0.450,-1.402,0.623
Cassava_input_t,0.2692,0.075,3.568,0.000,0.121,0.417
Grass_input_t,-0.3033,0.252,-1.203,0.229,-0.798,0.192
Tea_waste_input_t,0.3954,1.211,0.327,0.744,-1.981,2.772

0,1,2,3
Omnibus:,228.012,Durbin-Watson:,2.07
Prob(Omnibus):,0.0,Jarque-Bera (JB):,506.614
Skew:,1.533,Prob(JB):,9.78e-111
Kurtosis:,5.354,Cond. No.,102.0


In [22]:
#ols
hainan_ols = ols("Solid_fertilizer_produced_t ~ Manure_input_t + Bagasse_1_input_t + Lees_fermentation +\
                    Fish_waste_input_t + Alcowaste_input_t + Cassava_input_t + Grass_input_t +\
                    Tea_waste_input_t + Other_input_t ", data=hainan_train).fit()
hainan_ols_summary = hainan_ols.summary()
hainan_ols_summary

0,1,2,3
Dep. Variable:,Solid_fertilizer_produced_t,R-squared:,0.091
Model:,OLS,Adj. R-squared:,0.081
Method:,Least Squares,F-statistic:,8.981
Date:,"Fri, 02 Mar 2018",Prob (F-statistic):,5.41e-13
Time:,18:10:35,Log-Likelihood:,-3233.2
No. Observations:,814,AIC:,6486.0
Df Residuals:,804,BIC:,6533.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.3758,0.875,0.430,0.668,-1.341,2.093
Manure_input_t,0.0470,0.056,0.844,0.399,-0.062,0.156
Bagasse_1_input_t,0.1660,0.030,5.527,0.000,0.107,0.225
Lees_fermentation,0.0631,0.067,0.947,0.344,-0.068,0.194
Fish_waste_input_t,0.3017,0.082,3.665,0.000,0.140,0.463
Alcowaste_input_t,-0.1323,0.122,-1.082,0.280,-0.372,0.108
Cassava_input_t,0.0380,0.018,2.125,0.034,0.003,0.073
Grass_input_t,-0.0407,0.060,-0.681,0.496,-0.158,0.077
Tea_waste_input_t,0.0390,0.287,0.136,0.892,-0.525,0.603

0,1,2,3
Omnibus:,682.122,Durbin-Watson:,2.089
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14941.885
Skew:,3.783,Prob(JB):,0.0
Kurtosis:,22.578,Cond. No.,102.0


From the ols results based on three different dependent variable, we can see that only the first one (cumulative sum of BioCNG) has a relatively high value for R-squared. So We decided to focus on the first one.

In [23]:
# Predict on the training data
X_train = hainan_train[['Manure_input_t', 'Bagasse_1_input_t',
       'Lees_fermentation', 'Fish_waste_input_t', 'Alcowaste_input_t',
       'Cassava_input_t', 'Grass_input_t', 'Tea_waste_input_t',
       'Other_input_t']]
y_train = hainan_train.BioCNG_cumsum
y_pred_train = hainan_ols.predict(X_train)

# Compute the root-mean-square of training data
rms_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
print('Training error:',rms_train/sum(y_train))

# Predict on the test data
X_test = hainan_test[['Manure_input_t', 'Bagasse_1_input_t',
       'Lees_fermentation', 'Fish_waste_input_t', 'Alcowaste_input_t',
       'Cassava_input_t', 'Grass_input_t', 'Tea_waste_input_t',
       'Other_input_t']]
y_test = hainan_test.BioCNG_cumsum
y_pred_test = hainan_ols.predict(X_test)

# Compute the root-mean-square of test data
rms_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
print('Testing error:',rms_test/sum(y_test))

hainan_train

Training error: 0.00146307722698
Testing error: 0.00583621962881


Unnamed: 0,Month,Month_#,Day_#,Raw_Biogas_Produced_m3,BioCNG_Produced_Nm3,BioCNG_Sold_m3,Liquid_Fertilizer_Produced_t,Liquid_fertilizer_sold_t,Solid_fertilizer_produced_t,Solid_fertilizer_sold_t,...,Lees_fermentation,Fish_waste_input_t,Alcowaste_input_t,Cassava_input_t,Grass_input_t,Tea_waste_input_t,Other_input_t,Total_input_t,Diesel_cons/L,BioCNG_cumsum
8,5.0,1,9,0.0,2108.0,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,21.30,40,18936.0
423,6.0,14,424,5169.0,4414.0,2077.00,0.00,0.00,0.00,0.00,...,0.00,8.28,0.00,0.00,0.00,0.00,0.00,24.48,0,1391217.0
1003,1.0,33,1004,6620.0,6411.0,4142.00,0.00,0.00,0.00,0.00,...,0.00,0.00,10.44,0.00,0.00,0.00,90.00,100.44,0,3643749.0
365,5.0,13,366,6402.0,7081.0,6932.00,0.00,0.00,38.32,38.32,...,0.00,0.00,0.00,0.00,0.00,0.00,1.52,26.60,0,1115278.0
172,10.0,6,173,0.0,2050.0,3976.12,0.00,0.00,5.00,5.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0,377506.0
75,7.0,3,76,0.0,3094.0,3764.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,51.76,0,177251.0
56,6.0,2,57,0.0,5215.0,3945.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,40.40,20,117573.0
919,11.0,31,920,3470.0,3400.0,7824.00,40.44,40.44,0.00,0.00,...,0.00,0.00,3.90,0.00,0.00,6.30,0.00,22.52,0,3053086.0
796,7.0,27,797,3902.0,1942.0,4740.00,43.40,43.40,0.00,0.00,...,11.84,0.00,0.00,0.00,0.00,0.00,0.00,17.32,0,2771038.0
357,4.0,12,358,5660.0,4022.0,36.00,40.42,40.42,10.36,10.36,...,0.00,8.82,0.00,0.00,0.00,0.00,5.58,21.70,0,1064296.0


# Training Algorithms

1. Logistic regression
2. SVM
3. Perceptron
4. kNN
5. Random Forest
6. xgBoost

In [37]:
#logistic regression
hainan['BioCNG_cumsum'] = pd.cut(hainan['BioCNG_cumsum'], bins=3, labels=[0, 1, 2])
hainan_train, hainan_test = train_test_split(hainan, test_size=0.2)
y_train = hainan_train.BioCNG_cumsum
y_test = hainan_test.BioCNG_cumsum
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_test = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(logreg.score(X_train, y_train)))
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred_test)
print(confusion_matrix)

Accuracy of logistic regression classifier on train set: 0.61
Accuracy of logistic regression classifier on test set: 0.57
[[109  12]
 [ 76   7]]


In [38]:
# SVM
from sklearn import svm
clf = svm.SVC()
clf.fit(X_train, y_train)  
y_pred_test = clf.predict(X_test)
print('Accuracy of SVM on train set: {:.2f}'.format(clf.score(X_train, y_train)))
print('Accuracy of SVM on test set: {:.2f}'.format(clf.score(X_test, y_test)))

Accuracy of SVM on train set: 0.87
Accuracy of SVM on test set: 0.54


In [39]:
# perceptron
from sklearn.linear_model import perceptron
net = perceptron.Perceptron(n_iter=100, verbose=0, random_state=None, fit_intercept=True, eta0=0.002)
net.fit(X_train, y_train)
y_pred_test = net.predict(X_test)
print('Accuracy of perceptron on train set: {:.2f}'.format(net.score(X_train, y_train)))
print('Accuracy of perceptron on test set: {:.2f}'.format(net.score(X_test, y_test)))

Accuracy of perceptron on train set: 0.54
Accuracy of perceptron on test set: 0.48




In [40]:
# kNN
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(hainan)
hainan_normalized = pd.DataFrame(np_scaled)
hainan_normalized_train, hainan_normalized_test = train_test_split(hainan_normalized, test_size=0.2)
print(hainan_normalized_train.head())

Xnor_train = hainan_normalized_train[[10,11,12,13,14,15,16,17,18]]
ynor_train = hainan_normalized_train[21]
Xnor_test = hainan_normalized_test[[10,11,12,13,14,15,16,17,18]]
ynor_test = hainan_normalized_test[21]
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=4)
knn.fit(Xnor_train, ynor_train)
y_pred_test = knn.predict(Xnor_test)
print('Accuracy of kNN on train set: {:.2f}'.format(knn.score(Xnor_train, ynor_train)))
print('Accuracy of kNN on test set: {:.2f}'.format(knn.score(Xnor_test, ynor_test)))

           0         1         2         3         4         5    6    7   \
736  0.363636  0.727273  0.722277  0.350015  0.180375  0.207199  0.0  0.0   
884  0.818182  0.878788  0.869480  0.126127  0.000000  0.226739  0.0  0.0   
87   0.545455  0.060606  0.085378  0.000000  0.167491  0.248941  0.0  0.0   
639  0.000000  0.606061  0.627085  0.821965  0.286067  0.566727  0.0  0.0   
492  0.727273  0.484848  0.482826  0.138829  0.049813  0.000000  0.0  0.0   

      8    9  ...    12   13        14        15   16   17       18        19  \
736  0.0  0.0 ...   0.0  0.0  0.000000  0.000000  0.0  0.0  0.00000  0.114866   
884  0.0  0.0 ...   0.0  0.0  0.000000  0.000000  0.0  0.0  0.02963  0.072000   
87   0.0  0.0 ...   0.0  0.0  0.172831  0.000000  0.0  0.0  0.00000  0.145553   
639  0.0  0.0 ...   0.0  0.0  0.000000  0.630526  0.0  0.0  0.00000  0.322789   
492  0.0  0.0 ...   0.0  0.0  0.000000  0.000000  0.0  0.0  0.00000  0.000000   

           20   21  
736  0.000000  1.0  
884  0.0

In [43]:
# random forest
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators = 1000)
random_forest.fit(X_train, y_train)
print('Accuracy of Random Forest on train set: {:.2f}'.format(random_forest.score(X_train, y_train)))
print('Accuracy of Random Forest on test set: {:.2f}'.format(random_forest.score(X_test, y_test)))

Accuracy of Random Forest on train set: 0.97
Accuracy of Random Forest on test set: 0.59


From the accuracy results of the above algorithms training, we can see that random forest and kNN have relatively better results for accuracy. Random forest gives the best results, offering an accuracy of 0.57 on test sets.