In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import os
import glob
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import json
import warnings
warnings.filterwarnings('ignore')

# MLP Classifier

### Load the MLP Model

In [3]:
from joblib import dump, load
# Kept for reference; presumably how the model file was produced: dump(clt, 'MLP_model.joblib')

# Load the previously saved model (presumably a fitted MLPClassifier) from the working directory.
# NOTE(review): joblib files are pickle-based, so loading one can execute arbitrary code;
# only load 'MLP_model.joblib' if it comes from a trusted source.
clt = load('MLP_model.joblib')

## Load the full data and merge with the testing datasets

### Load Full data

In [4]:
# Build `all_stations`: one frame holding, for every CSV under ./full, the
# 'Station' / 'Ob' identifier columns plus the standardized feature columns.
path = os.getcwd()
# glob returns files in arbitrary order; sorting keeps the row order of
# `all_stations` reproducible from run to run.
csv_files = sorted(glob.glob(os.path.join(path, "full/*.csv")))
if not csv_files:
    raise FileNotFoundError(
        "No CSV files found under " + os.path.join(path, "full")
    )

each_Station = {}

# load each full file
for f in csv_files:
    # Key each frame by its file name (lower-cased, extension removed). The
    # previous key was a fixed character slice of the absolute path, which only
    # worked for one specific working-directory length; with a shorter path
    # every file got the same empty key and all but the last were overwritten.
    station_key = os.path.splitext(os.path.basename(f))[0].lower()
    station_raw = pd.read_csv(f)

    # Feature columns are selected by position (columns 3..25 of the file);
    # the identifier columns are selected by name.
    feature_cols = station_raw.columns[3:26]
    station_ids = station_raw[['Station', 'Ob']]

    # Standardize the features of this file on their own, then replace missing
    # values with 0 (the column mean after standardization).
    # NOTE(review): the scaler is re-fit on every station file here; confirm
    # this matches how the features were scaled when the model was trained.
    station_scaled = pd.DataFrame(
        StandardScaler().fit_transform(station_raw[feature_cols]),
        columns=feature_cols,
    ).fillna(0)

    each_Station[station_key] = pd.concat(
        [station_ids, station_scaled], axis=1, join='inner'
    )

# concat the per-station dataframes, combining all rows
all_stations = pd.concat(list(each_Station.values()))

In [5]:
# Read the test rows from test.csv, resolved relative to the working directory.
test = pd.read_csv(filepath_or_buffer="./test.csv")

In [6]:
# Inner-join the test rows with the station data on the ('Station', 'Ob') pair;
# rows without a match on both sides are dropped.
join_keys = ['Station', 'Ob']
dfull = test.merge(all_stations, on=join_keys, how="inner")

In [7]:
# Model inputs: everything in the merged frame except the join keys and the
# 'value' / 'measure' columns.
non_feature_cols = ['Station', 'Ob', 'value', 'measure']
X = dfull.drop(columns=non_feature_cols)
X

Unnamed: 0,R_flag,I_flag,Z_flag,B_flag,temp_wxt,temp_hmp,rh_wxt,rh_hmp,ws10,wd10,...,sm,temp10,ws02,wd02,gust02,ws06,wd06,gust06,leafwetness,blackglobetemp
0,2,-1,2,0,0.839498,0.787337,-0.783049,-0.503103,0.090042,0.086811,...,0.982932,0.733379,-0.435431,-0.537254,-0.258877,0.182762,-0.306286,0.305015,-0.484017,1.496432
1,2,-1,-1,1,0.839498,0.787337,-0.783049,-0.503103,0.090042,0.086811,...,0.982932,0.733379,-0.435431,-0.537254,-0.258877,0.182762,-0.306286,0.305015,-0.484017,1.496432
2,2,-1,0,1,1.683680,1.665959,-0.552512,-0.431934,1.172638,-0.013927,...,-0.194344,1.605214,-0.201828,-0.188946,-0.071172,0.792664,-0.598164,2.297152,-0.475397,1.858563
3,2,-1,0,1,1.683680,1.665959,-0.305891,-0.555463,0.621299,0.014583,...,-0.194344,1.616265,-0.201828,-0.041178,-0.071172,0.155422,0.188553,0.698464,-0.475397,1.896822
4,3,0,-1,-1,1.683680,1.665959,-0.305891,-0.555463,0.621299,0.014583,...,-0.194344,1.616265,-0.201828,-0.041178,-0.071172,0.155422,0.188553,0.698464,-0.475397,1.896822
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1856101,0,0,2,-1,0.965494,0.968128,0.028117,0.011607,0.886390,1.412258,...,0.102690,1.060294,-0.625722,-1.947276,0.103367,1.203057,1.228551,0.921566,-0.687717,0.138841
1856102,0,0,2,-1,0.909523,0.908256,0.039687,0.053045,-0.139385,1.194936,...,0.102690,0.990324,0.109926,1.270578,0.103367,-0.300071,1.191845,-0.249791,-0.684616,0.098119
1856103,0,0,2,-1,0.685637,0.703578,0.637495,0.571206,0.786800,1.313111,...,0.071498,0.741859,0.530295,-2.278374,1.449815,0.145550,1.160238,0.305062,-0.648444,0.000599
1856104,0,0,2,-1,0.391788,0.422320,-0.029736,-0.046095,-0.188557,1.411257,...,-0.084457,0.467692,0.215018,1.405087,-0.064939,-0.157439,1.402902,-0.225256,-0.697018,-0.302678


### Predict y

In [8]:
# Predict a label for every row of X and hold the result in a one-column frame.
predicted_labels = clt.predict(X)
y_pred = pd.DataFrame({'y_pred': predicted_labels})
y_pred

Unnamed: 0,y_pred
0,False
1,False
2,False
3,False
4,False
...,...
1856101,False
1856102,False
1856103,False
1856104,False


### Predict y probabilities

In [12]:
# Per-class probabilities for every row of X. For scikit-learn classifiers the
# columns follow the order of clt.classes_ (assumes clt is such a classifier).
y_prob = clt.predict_proba(X) 

### Save prediction

In [13]:
# Write the second probability column to predictions.csv as a single 'target'
# column, without the row index.
target_prob = y_prob[:, 1]
pd.DataFrame({'target': target_prob}).to_csv('predictions.csv', index=False)