# Wazihub Soil Moisture Prediction Challenge

Predict soil humidity using sensor data from low-cost DIY Internet of Things in Senegal

In [2]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from scipy import stats

In [3]:
# display options
%matplotlib inline
sns.set_style('whitegrid')
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('max_colwidth', 400)
np.set_printoptions(linewidth=500)

# random seed
np.random.seed(6)

### Prepare the data

In [4]:
# get data
Train1 = pd.read_csv("../data/Train1.csv")
Limits = pd.read_csv("../data/Limits.csv", index_col=0)

In [5]:
# choose data for training
start = Limits["field1"]["train_start"]
end   = Limits["field1"]["train_end"]
data = Train1[(Train1["timestamp"]>=start) & (Train1["timestamp"]<=end)]
print("Shape of the data:", data.shape)

Shape of the data: (8914, 9)


In [6]:
def AddLag(df, lag=3, y="Humidity1"):
    data = pd.DataFrame()
    data[y] = df[y]
    for col in df.columns.values:
        for p in range(1, lag+1):
            name = col + "(k-" + str(p) + ")"
            nans = np.empty(p)
            nans[:] = np.nan
            data[name] = np.concatenate((nans, df[col].values[p:]))
    return data.dropna()

In [7]:
# add lag to data
data_lag = AddLag(data.drop(columns="timestamp"))

### Additional preprocessing

In [8]:
# import packages
import sklearn.model_selection as ms
from sklearn.preprocessing import RobustScaler, StandardScaler

In [9]:
# get labels
labels = np.array(data_lag["Humidity1"])

# get features
features = np.array(data_lag.drop(columns="Humidity1"))

In [20]:
# divide data into train and dev sets
X_train, X_dev = features[:-1153], features[-1153:]
y_train, y_dev = labels[:-1153], labels[-1153:]

In [23]:
# scale numerical features
scaler = RobustScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_dev = scaler.transform(X_dev)

In [27]:
import tensorflow as tf
print(tf.__version__)

1.14.0


### Model training

In [24]:
# import packages
import sklearn.linear_model as lm
from sklearn.metrics import mean_squared_error

In [25]:
# Ridge
print("Ridge:")
    
#evaluation on dev set
mdl = lm.Ridge()
y_hat = mdl.fit(X_train, y_train).predict(X_dev)
rmse = np.sqrt(mean_squared_error(y_dev, y_hat))
print("RMSE =", rmse)

Ridge:
RMSE = 0.0010662638582832093
