In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
import warnings

warnings.filterwarnings('ignore') # Suppress warnings

# Data Cleaning

Clean column names

In [2]:
hainan = pd.read_csv('data/hainan_cleaned_data.csv')

hainan.columns = hainan.columns.str.replace('  ', '_')
hainan.columns = hainan.columns.str.replace(' ', '_')
hainan.columns = hainan.columns.str.replace('(', '')
hainan.columns = hainan.columns.str.replace('（', '')
hainan.columns = hainan.columns.str.replace(')', '')

d = { 'January':1, 'February':2, 'March':3, 'April':4, 'May':5, 'June':6,
     'July':7, 'August':8, 'September':9, 'October':10, 'November':11, 'December':12 }

hainan.Month = hainan.Month.map(d)

Shift rows up to account for delay

In [3]:
hainan.BioCNG_Produced_Nm3 = hainan.BioCNG_Produced_m3.shift(-15)
hainan.drop(hainan.tail(15).index, inplace=True)

Clean up column values

In [4]:
hainan = hainan[np.isfinite(hainan['Month'])]
hainan = hainan[np.isfinite(hainan['Lemon_waste_t'])]
hainan = hainan[np.isfinite(hainan['Percolate_t'])]
hainan = hainan.replace(' ', 0)
hainan = hainan.replace('', 0)
hainan = hainan.replace('  ', 0)

Remove unnecessary columns

In [5]:
hainan = hainan.drop(['Year', 'Month', 'Day', 'Month_#', 'Day_#', 'Raw_Biogas_Produced_m3', 'BioCNG_Sold_m3', 'Vehicle_use_m3',
       'Liquid_Fertilizer_Produced_t', 'Solid_fertilizer_produced_t',
       'Wastewater_flow_to_WWTP_unit?', 'Solid_residues_kg','50%_NaOH/kg', 'FeCl2/kg', 'PAM/kg',
       'Defoamer/kg', 'Project_electricity_use/kWh',
       'Office_space_electricity_use/kWh', 'Water/m3', 'Diesel/L', 'extra'], axis=1)

hainan.head()

Unnamed: 0,BioCNG_Produced_m3,Pig_Manure_t,Cassava_t,Fish_waste_water_t,Kitchen_food_waste_t,Municipal_fecal_residue_t,Tea_waste_t,Chicken_litter_t,Bagasse_feed_t,Alcohol_waste_t,Chinese_medicine_waste_t,Energy_grass_t,Banana_fruit_shafts_t,Lemon_waste_t,Percolate_t,Other_waste_t
0,2024.0,6.82,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,26.44
1,2909.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,28.92
2,3020.0,6.66,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,26.9
3,3042.0,12.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2779.0,9.54,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Preliminary Work

Add an inverse column

In [6]:
for col in hainan.columns[1:]:
    hainan['1/'+col] = 1/(hainan[col])

hainan.replace(float('inf'), 0, inplace=True)

Add a squared column

In [7]:
for col in hainan.columns[1:]:
    hainan[col+"**2"] = (hainan[col])**2

hainan.head()

Unnamed: 0,BioCNG_Produced_m3,Pig_Manure_t,Cassava_t,Fish_waste_water_t,Kitchen_food_waste_t,Municipal_fecal_residue_t,Tea_waste_t,Chicken_litter_t,Bagasse_feed_t,Alcohol_waste_t,...,1/Tea_waste_t**2,1/Chicken_litter_t**2,1/Bagasse_feed_t**2,1/Alcohol_waste_t**2,1/Chinese_medicine_waste_t**2,1/Energy_grass_t**2,1/Banana_fruit_shafts_t**2,1/Lemon_waste_t**2,1/Percolate_t**2,1/Other_waste_t**2
0,2024.0,6.82,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,...,0.0,0.0,0.004444,0.0,0.0,0.0,0.0,0.0,0.0,0.00143
1,2909.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,...,0.0,0.0,0.004444,0.0,0.0,0.0,0.0,0.0,0.0,0.001196
2,3020.0,6.66,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,...,0.0,0.0,0.004444,0.0,0.0,0.0,0.0,0.0,0.0,0.001382
3,3042.0,12.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2779.0,9.54,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,...,0.0,0.0,0.012346,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Modeling

### KNN

Normalize data

In [8]:
COLUMNS = [ 'BioCNG_Produced_m3', 'Pig_Manure_t', 'Cassava_t', 'Fish_waste_water_t',
            'Kitchen_food_waste_t', 'Municipal_fecal_residue_t', 'Tea_waste_t',
            'Chicken_litter_t', 'Bagasse_feed_t', 'Alcohol_waste_t',
            'Chinese_medicine_waste_t', 'Energy_grass_t', 'Banana_fruit_shafts_t',
            'Lemon_waste_t', 'Percolate_t', 'Other_waste_t', '1/Pig_Manure_t',
            '1/Cassava_t', '1/Fish_waste_water_t', '1/Kitchen_food_waste_t',
            '1/Municipal_fecal_residue_t', '1/Tea_waste_t', '1/Chicken_litter_t',
            '1/Bagasse_feed_t', '1/Alcohol_waste_t', '1/Chinese_medicine_waste_t',
            '1/Energy_grass_t', '1/Banana_fruit_shafts_t', '1/Lemon_waste_t',
            '1/Percolate_t', '1/Other_waste_t', 'Pig_Manure_t**2', 'Cassava_t**2',
            'Fish_waste_water_t**2', 'Kitchen_food_waste_t**2',
            'Municipal_fecal_residue_t**2', 'Tea_waste_t**2', 'Chicken_litter_t**2',
            'Bagasse_feed_t**2', 'Alcohol_waste_t**2',
            'Chinese_medicine_waste_t**2', 'Energy_grass_t**2',
            'Banana_fruit_shafts_t**2', 'Lemon_waste_t**2', 'Percolate_t**2',
            'Other_waste_t**2', '1/Pig_Manure_t**2', '1/Cassava_t**2',
            '1/Fish_waste_water_t**2', '1/Kitchen_food_waste_t**2',
            '1/Municipal_fecal_residue_t**2', '1/Tea_waste_t**2',
            '1/Chicken_litter_t**2', '1/Bagasse_feed_t**2', '1/Alcohol_waste_t**2',
            '1/Chinese_medicine_waste_t**2', '1/Energy_grass_t**2',
            '1/Banana_fruit_shafts_t**2', '1/Lemon_waste_t**2', '1/Percolate_t**2',
            '1/Other_waste_t**2' ]

min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(hainan[COLUMNS])
hainan_normalized = pd.DataFrame(np_scaled)

Set the train and test data

In [9]:
train_data, test_data = train_test_split(hainan_normalized, test_size=0.2)
X_train = train_data.iloc[:, 1:62]    
y_train = train_data[0]
X_test = test_data.iloc[:, 1:62]
y_test = test_data[0]

In [10]:
knn = KNeighborsRegressor(n_neighbors=7)
knn.fit(X_train, y_train)

print('Accuracy of kNN on training set: {:.2f}'.format(knn.score(X_train, y_train)))
print('Accuracy of kNN on test set: {:.2f}'.format(knn.score(X_test, y_test)))

Accuracy of kNN on training set: 0.90
Accuracy of kNN on test set: 0.79


# Feature Importance

In [11]:
knn = KNeighborsRegressor(n_neighbors=7)
feature_scores = []
n = len(hainan.columns)-1
for i in range(1,n):
    Xi = X_train[i].reshape(-1, 1)
    Xj = X_test[i].reshape(-1, 1)    
    knn.fit(Xi, y_train.reshape(-1, 1))
    feature_scores.append(knn.score(Xj, y_test))

important_features = pd.DataFrame(feature_scores)
important_features['index'] = range(n-1)
important_features = important_features.sort_values(by=0, ascending=False).iloc[0:10]
features = hainan.columns[1:n]

f = []
for i in important_features['index']:
    f.append(features[i])
    
important_features['feature'] = f
important_features.drop(['index'], axis=1, inplace=True)
important_features.columns = ['accuracy_score', 'feature']
important_features = important_features.reindex_axis(['feature','accuracy_score'], axis=1)
important_features

Unnamed: 0,feature,accuracy_score
3,Kitchen_food_waste_t,0.530681
33,Kitchen_food_waste_t**2,0.528403
43,Percolate_t**2,0.495992
13,Percolate_t,0.493762
18,1/Kitchen_food_waste_t,0.493198
48,1/Kitchen_food_waste_t**2,0.492847
49,1/Municipal_fecal_residue_t**2,0.474362
19,1/Municipal_fecal_residue_t,0.473205
21,1/Chicken_litter_t,0.459929
51,1/Chicken_litter_t**2,0.458959
