In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import warnings

warnings.filterwarnings('ignore') # Suppress warnings



# Data Cleaning

Consolidate some columns

In [2]:
shenzhen = pd.read_csv("Data/shenzhen_cleaned_data.csv")
shenzhen['acid_feed'] = shenzhen['1_acidification_hydrolysis_tank_feed_'] \
                                             + shenzhen['2_acidification_hydrolysis_tank_feed_']
shenzhen['acid_discharge']  = shenzhen['1_acidification_hydrolysis_tank_discharge_']\
                                             + shenzhen['2_acidification_hydrolysis_tank_discharge_']
shenzhen['anaerobic_feed'] = shenzhen['1_Anaerobic_tank_slurry_feed_'] \
                                             + shenzhen['2_Anaerobic_tank_slurry_feed_']
shenzhen['anaerobic_cumuprod'] = shenzhen['1_Anaerobic_tank_biogas_cumulative_production_'] \
                                             + shenzhen['2_anaerobic_tank_biogas_cumulative_production_']
shenzhen['anaerobic_dailyoutput'] = shenzhen['1_anaerobic_tank_biogas_daily_output_'] \
                                             + shenzhen['2_anaerobic_tank_biogas_daily_output_']
shenzhen = shenzhen.drop(['1_acidification_hydrolysis_tank_feed_','2_acidification_hydrolysis_tank_feed_',\
               '1_acidification_hydrolysis_tank_discharge_','2_acidification_hydrolysis_tank_discharge_',\
               '1_Anaerobic_tank_slurry_feed_','2_Anaerobic_tank_slurry_feed_',\
               '1_Anaerobic_tank_biogas_cumulative_production_','2_anaerobic_tank_biogas_cumulative_production_',\
               '1_anaerobic_tank_biogas_daily_output_','2_anaerobic_tank_biogas_daily_output_'],axis = 1)

Shift rows up to account for production delay

In [3]:
shenzhen.acid_feed = shenzhen.acid_feed.shift(-15)
shenzhen.acid_discharge = shenzhen.acid_discharge.shift(-15)
shenzhen.anaerobic_feed = shenzhen.anaerobic_feed.shift(-15)
shenzhen.anaerobic_cumuprod = shenzhen.anaerobic_cumuprod.shift(-15)
shenzhen.anaerobic_dailyoutput = shenzhen.anaerobic_dailyoutput.shift(-15)
shenzhen = shenzhen.drop(['Unnamed: 0'], axis=1)
shenzhen = shenzhen[:-15]
shenzhen.head()

Unnamed: 0,Kitchen_waste_,Fruit_and_vegetable_waste_,Bread_Paste_,Waste_oil_,Total_Waste_,Diesel_waste_water_,Flour_and_waste_oil_,Kitchen_waste_paste_,acid_feed,acid_discharge,anaerobic_feed,anaerobic_cumuprod,anaerobic_dailyoutput
0,92.03,0.0,7.95,13.54,99.98,54.0,17.0,167.0,135.0,200.0,200.0,464356.0,9295.0
1,78.43,0.0,13.3,16.1,91.73,36.0,8.0,100.0,91.0,155.0,155.0,472980.0,8624.0
2,81.84,0.0,8.35,6.6,90.19,22.0,34.0,120.0,50.0,160.0,160.0,481426.0,8446.0
3,81.05,0.0,9.1,17.48,90.15,22.0,20.0,92.0,95.0,150.0,150.0,490210.0,8784.0
4,36.86,0.0,9.4,12.47,46.26,44.0,35.0,122.0,9.0,150.0,150.0,497959.0,7749.0


# Preliminary Work

Add a moving average column

In [4]:
shenzhen['daymean'] = np.nan
for index,row in shenzhen.iterrows():
    if index == 0:
        shenzhen.daymean[index] = shenzhen.anaerobic_dailyoutput[index]
    elif index == 1:
        shenzhen.daymean[index] = shenzhen.anaerobic_dailyoutput[index]
    elif index == 2:
        shenzhen.daymean[index] = shenzhen.anaerobic_dailyoutput[index]    
    else:
        shenzhen.daymean[index] = (shenzhen.daymean[index-1] + shenzhen.daymean[index-2] +\
                                   shenzhen.daymean[index-3])/3

Bin the outputs

In [5]:
shenzhen.anaerobic_dailyoutput = pd.cut(shenzhen['anaerobic_dailyoutput'], bins=3)
shenzhen.anaerobic_dailyoutput = pd.factorize(shenzhen.anaerobic_dailyoutput)[0]

Add inverse, square, and log columns

In [6]:
for col in shenzhen.columns[:10]:
    shenzhen['1/'+col] = 1/(shenzhen[col])
for col in shenzhen.columns[:10]:
    shenzhen[col+"**2"] = (shenzhen[col])**2
for col in shenzhen.columns[:10]:
    shenzhen[col+"log"] = np.log(shenzhen[col])

In [7]:
shenzhen.replace(float('inf'), 0, inplace=True)
shenzhen.replace(float('-inf'), 0, inplace=True)

In [8]:
train_data, test_data = train_test_split(shenzhen, test_size=0.15)

Set the train and test data

In [9]:
COLUMNS = [ 'Kitchen_waste_', 'Fruit_and_vegetable_waste_', 'Bread_Paste_',\
            'Waste_oil_', 'Total_Waste_', 'Diesel_waste_water_',\
            'Flour_and_waste_oil_', 'Kitchen_waste_paste_', 'acid_feed',\
            'acid_discharge','daymean','1/Kitchen_waste_', '1/Fruit_and_vegetable_waste_',\
            '1/Bread_Paste_', '1/Waste_oil_', '1/Total_Waste_',\
            '1/Diesel_waste_water_', '1/Flour_and_waste_oil_',\
            '1/Kitchen_waste_paste_', '1/acid_feed', '1/acid_discharge',\
            'Kitchen_waste_**2', 'Fruit_and_vegetable_waste_**2', \
            'Bread_Paste_**2','Waste_oil_**2', 'Total_Waste_**2', \
            'Diesel_waste_water_**2','Flour_and_waste_oil_**2', \
            'Kitchen_waste_paste_**2', 'acid_feed**2','acid_discharge**2',\
            'Kitchen_waste_log','Fruit_and_vegetable_waste_log', \
            'Bread_Paste_log', 'Waste_oil_log','Total_Waste_log', \
            'Diesel_waste_water_log', 'Flour_and_waste_oil_log',\
            'Kitchen_waste_paste_log', 'acid_feedlog', 'acid_dischargelog' ]

X_train = train_data[COLUMNS]
y_train = train_data.anaerobic_dailyoutput
X_test = test_data[COLUMNS]
y_test = test_data.anaerobic_dailyoutput

# Modeling

### Random Forest

In [10]:
random_forest = RandomForestClassifier(n_estimators=50)
random_forest.fit(X_train, y_train)

print('Accuracy of Random Forest on training set: {:.2f}'.format(random_forest.score(X_train, y_train)))
print('Accuracy of Random Forest on test set: {:.2f}'.format(random_forest.score(X_test, y_test)))

Accuracy of Random Forest on training set: 1.00
Accuracy of Random Forest on test set: 0.54


### XGBoost

In [11]:
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=3,
    min_child_weight=5,
    gamma=0.2,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=8,
    scale_pos_weight=8,
    seed=27)
xgb1.fit(X_train, y_train)

print('Accuracy of XGBoost on training set: {:.2f}'.format(xgb1.score(X_train, y_train)))
print('Accuracy of XGBoost on test set: {:.2f}'.format(xgb1.score(X_test, y_test)))

Accuracy of XGBoost on training set: 1.00
Accuracy of XGBoost on test set: 0.57
