In [29]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
from sklearn import preprocessing

In [30]:
def normalize_data(df,features):
    df_array = df.values
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(df_array)
    df = pd.DataFrame(x_scaled,columns=features)
    return df
    

In [31]:
def filter_data(data_path,labels_path):
    # load data and set index to city, year, weekofyear
    df = pd.read_csv(data_path, index_col=[0, 1, 2])

    # add labels to dataframe
    if labels_path:
        labels = pd.read_csv(labels_path, index_col=[0, 1, 2])
        df = df.join(labels)
   
    # drop missing values
    df = df.fillna(df.mean())
        
    # separate san juan and iquitos
    sj = df.loc['sj']
    iq = df.loc['iq']
    
    return sj, iq

In [32]:
def get_features_labels(dataset,count):
    train_subtrain = dataset.head(count)
    train_subtest = dataset.tail(dataset.shape[0] - count)
    
    return train_subtrain, train_subtest
    

In [33]:
#using lasso regression
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error

def lasso_regressor(train_data,validation_data, alpha,features,label):
    
    #training dataset features
    train_features = train_data[features]
    train_features = normalize_data(train_features,features)
    
    #training dataset lables
    train_labels = train_data[[label]]
    
    #validation dataset features
    valid_features = validation_data[features]
    valid_features = normalize_data(valid_features,features)
    
    #training dataset lables
    valid_labels = validation_data[[label]]
    
    #Fit the model
    lassoreg = Lasso(alpha=alpha,normalize=True, max_iter=1e5)
    lassoreg.fit(train_features,train_labels)
    y_pred = lassoreg.predict(valid_features)
    
    mae = mean_absolute_error(valid_labels,y_pred)
    
    print("Mean Absolute Error for :",alpha," ",mae)
    
    return lassoreg, mae

In [34]:
def get_best_model(train, validation,features,label,s,t):
    grid = 10 ** np.arange(s, t, dtype=np.float64)
                    
    best_alpha = []
    best_score = 1000
        
    for alpha in grid:
        model, mae = lasso_regressor(train,validation,alpha,features,label)

        if mae < best_score:
            best_alpha = alpha
            best_score = mae

    print('best alpha = ', best_alpha)
    print('best score = ', best_score)
            
    full_dataset = pd.concat([train, validation])
    fitted_model = lasso_regressor(full_dataset,validation,alpha,features,label)
    
    return fitted_model

In [35]:
train_sj, train_iq = filter_data("./data/dengue_features_train.csv","./data/dengue_labels_train.csv")
train_sj = train_sj.drop('week_start_date',axis=1)
train_iq = train_iq.drop('week_start_date',axis=1)

In [36]:
training_sj, validation_sj = get_features_labels(train_sj,800)
training_iq, validation_iq = get_features_labels(train_iq,400)

In [37]:
features = ['station_avg_temp_c','reanalysis_dew_point_temp_k','reanalysis_specific_humidity_g_per_kg','station_min_temp_c']
label = "total_cases"
model_sj,mae_sj = get_best_model(training_sj,validation_sj,features,label,-10,-1)


Mean Absolute Error for : 1e-10   22.49863836654915
Mean Absolute Error for : 1e-09   22.498637562492146
Mean Absolute Error for : 1e-08   22.49862952180735
Mean Absolute Error for : 1e-07   22.498549116159626
Mean Absolute Error for : 1e-06   22.497745050181706
Mean Absolute Error for : 1e-05   22.489704486170876
Mean Absolute Error for : 0.0001   22.40930230218315
Mean Absolute Error for : 0.001   21.61565956452088
Mean Absolute Error for : 0.01   20.098731004896088
best alpha =  0.01
best score =  20.098731004896088
Mean Absolute Error for : 0.01   19.04368553785062


In [38]:
model_iq,mae_iq = get_best_model(training_iq,validation_iq,features,label,-20,-10)

  positive)
  positive)


Mean Absolute Error for : 1e-20   6.851716568656809
Mean Absolute Error for : 1e-19   6.851716568656809
Mean Absolute Error for : 1e-18   6.851716568656989
Mean Absolute Error for : 1e-17   6.851716568656778
Mean Absolute Error for : 1e-16   6.851716568656642
Mean Absolute Error for : 1e-15   6.851716568656403
Mean Absolute Error for : 1e-14   6.851716568653456
Mean Absolute Error for : 1e-13   6.851716568623782
Mean Absolute Error for : 1e-12   6.851716568327221
Mean Absolute Error for : 1e-11   6.851716565361618
best alpha =  1e-11
best score =  6.851716565361618
Mean Absolute Error for : 1e-11   6.928975984434177


  positive)


In [39]:
deng_features_test_df = pd.read_csv("./data/dengue_features_test.csv", sep=",", index_col= [0,1,2])
pred_dataset = deng_features_test_df[features]
pred_dataset.head()


pred_dataset_sj = pred_dataset.loc['sj']
pred_dataset_sj.fillna(method='ffill', inplace=True)

pred_dataset_iq = pred_dataset.loc['iq']
pred_dataset_iq.fillna(method='ffill', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [40]:
pred_dataset_sj = normalize_data(pred_dataset_sj,features)
pred_dataset_iq = normalize_data(pred_dataset_iq,features)

In [41]:
predictions_sj = model_sj.predict(pred_dataset_sj).astype(int)
predictions_iq = model_iq.predict(pred_dataset_iq).astype(int)

In [42]:
submission = pd.read_csv("./data/submission_format.csv",
                         index_col=[0, 1, 2])

submission.total_cases = np.concatenate([predictions_sj, predictions_iq])
submission.to_csv("./data/submission_normalized.csv")