In [None]:
import tensorflow as tf

from imblearn.over_sampling import SMOTE
from collections import Counter
from numpy import * 
import os
import pandas as pd
import io
import requests
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping

In [None]:

def dataLoad(data_dir='/data/tropical/real_data'):
    """Read the train/validation/test CSV files and split them into features and target.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing ``training_real_tom_target.csv``,
        ``validation_real_tom_target.csv`` and ``test_real_tom_target.csv``.
        Defaults to the location the notebook was originally written against.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(x_train, y_train, x_val, y_val, x_test, y_test)``. Every ``x_*``
        frame holds the eight feature columns listed in ``feature_cols``;
        every ``y_*`` frame holds the single ``Real_tom_lsm`` column.

    Raises
    ------
    FileNotFoundError
        If one of the CSV files is missing (raised by ``pandas.read_csv``).
    KeyError
        If a file lacks one of the expected columns.
    """
    # Single source of truth for the column selection used on all three splits.
    feature_cols = ['vo', 'r', 'u_200', 'u_850', 'v_200', 'v_850', 'ttr', 'sst']
    target_cols = ['Real_tom_lsm']

    test = pd.read_csv(os.path.join(data_dir, 'test_real_tom_target.csv'))
    train = pd.read_csv(os.path.join(data_dir, 'training_real_tom_target.csv'))
    val = pd.read_csv(os.path.join(data_dir, 'validation_real_tom_target.csv'))
    print(train.head())

    # Split each frame into (features, target); the banners are progress markers only.
    x_train = train[feature_cols]
    print("******training features******")
    y_train = train[target_cols]
    print("******training target******")

    x_test = test[feature_cols]
    print("******test features******")
    y_test = test[target_cols]
    print("******test target******")

    x_val = val[feature_cols]
    print("******validation features******")
    y_val = val[target_cols]
    print("******validation target******")

    return x_train, y_train, x_val, y_val, x_test, y_test

In [None]:
# Load the three splits; dataLoad() returns them in (train, validation, test) order,
# each as a (features, target) pair.
x_train, y_train, x_val, y_val, x_test, y_test = dataLoad()


In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the numerical features (no SMOTE resampling is applied here).
# The scaler is fitted on the training split only, and the same fitted
# statistics are then applied to the validation and test splits.
num_cols = ['vo', 'r', 'u_200', 'u_850', 'v_200', 'v_850', 'ttr', 'sst']

# Work on copies so the unscaled x_train / x_val / x_test stay available.
X_train_scaled = x_train.copy()
X_valid_scaled = x_val.copy()
X_test_scaled = x_test.copy()

# StandardScaler computes mean and variance independently per column, so one
# fit over all columns yields the same scaling as one fit per column.
scaler = StandardScaler().fit(X_train_scaled[num_cols])

# transform the training data columns
X_train_scaled[num_cols] = scaler.transform(X_train_scaled[num_cols])

# transform the validation data columns
X_valid_scaled[num_cols] = scaler.transform(X_valid_scaled[num_cols])

# transform the testing data columns
X_test_scaled[num_cols] = scaler.transform(X_test_scaled[num_cols])

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix

#ExtraTreesClassifier_class_weight
def get_models_class_weight(method, random_state=None):
    """Build a grid of ExtraTreesClassifier models for one class-weighting mode.

    Parameters
    ----------
    method : str
        Either ``'balanced'`` or ``'balanced_subsample'``; forwarded to the
        classifier as ``class_weight``.
    random_state : int or None, optional
        Forwarded to every classifier as ``random_state``. The default
        ``None`` matches the classifier's own default, so existing callers
        are unaffected; pass an integer for reproducible runs.

    Returns
    -------
    dict
        Maps ``'<n>trees_<l>leafSamples_<m>maxFeat'`` to an unfitted
        classifier, in grid order (trees outermost, max_features innermost).
        Empty when ``method`` is not one of the two supported modes; in that
        case a message is printed instead of raising.
    """
    descriptions = {
        'balanced': "The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y))",
        'balanced_subsample': "The “balanced_subsample” mode is the same as “balanced” except that weights are computed based on the bootstrap sample for every tree grown.",
    }
    models_class_weight = dict()

    # Unsupported modes keep the original best-effort contract: report and
    # hand back an empty grid.
    if not (isinstance(method, str) and method in descriptions):
        print('the method is not callable')
        return models_class_weight

    print(descriptions[method])

    n_trees = [10, 50, 100, 500, 1000, 5000]
    # Only one leaf size is searched; a wider candidate grid would be
    # [1, 10, 50, 100, 500, 1000, 5000].
    min_samples_leaf = [10]
    max_features = ['sqrt', 'log2', None]

    for n in n_trees:
        for l in min_samples_leaf:
            for m in max_features:
                key = str(n) + 'trees_' + str(l) + 'leafSamples_' + str(m) + 'maxFeat'
                models_class_weight[key] = ExtraTreesClassifier(
                    n_estimators=n,
                    min_samples_leaf=l,
                    max_features=m,
                    class_weight=method,
                    random_state=random_state,
                )
    return models_class_weight

# Running count of evaluate_model() calls; printed after each call as a
# simple progress indicator.
num = 0


# evaluate a given model using validation
def evaluate_model(model, x_train, y_train, x_val, y_val):
    """Fit ``model`` on the training data and score it on the validation data.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict_proba`` and ``predict``.
    x_train, y_train : array-like
        Training features and labels passed to ``model.fit``.
    x_val, y_val : array-like
        Validation features and labels used for scoring.

    Returns
    -------
    tuple
        ``(score, conf)`` where ``score`` is the validation log-loss of the
        predicted probabilities and ``conf`` is the confusion matrix of the
        rounded predictions.

    Side effects: fits ``model`` in place, sets NumPy's print precision to 2,
    increments the module-level ``num`` counter and prints it.
    """
    # `num` is rebound below, so it has to be declared global; without this
    # declaration the augmented assignment raises UnboundLocalError.
    global num

    model.fit(x_train, y_train)
    score = log_loss(y_val, model.predict_proba(x_val))
    conf = confusion_matrix(y_val, np.round(model.predict(x_val)))
    np.set_printoptions(precision=2)
    num += 1
    print(num)
    return score, conf

In [None]:
# Build the candidate models for the 'balanced' class-weight mode; the result
# is a dict keyed by a string describing each hyper-parameter combination.
models_class_weight_balanced = get_models_class_weight('balanced')


In [None]:
import pickle

# Evaluate every 'balanced' candidate on the validation split and persist the
# log-loss scores, model names and confusion matrices as pickles in results/.
os.makedirs('results', exist_ok=True)
results2, names2, confs2 = list(), list(), list()

print("#------------#--------------- class_weight_balanced is doing ---------------#------------#")

for name2, model in models_class_weight_balanced.items():
    # Targets are single-column DataFrames; flatten them to 1-D for scikit-learn.
    res2 = evaluate_model(model, X_train_scaled, y_train.values.reshape(-1,), X_valid_scaled, y_val.values.reshape(-1,))
    print(name2)
    results2.append(res2[0])
    names2.append(name2)
    confs2.append(res2[1])

print("the class_weight_balanced is done ---->")

# One dump per artefact. The context manager closes the file even when
# pickling fails, and each message names the file that was actually written.
for label, sample_list in (("results", results2), ("names", names2), ("confs", confs2)):
    file_name = "results/" + label + "_class_weight_balanced.pkl"
    with open(file_name, "wb") as open_file:
        pickle.dump(sample_list, open_file)
    print("the " + label + " for class_weight_balanced is saved on " + label + "_class_weight_balanced.pkl")