In [None]:
import tensorflow as tf

from imblearn.over_sampling import SMOTE
from collections import Counter
from numpy import * 
import os
import pandas as pd
import io
import requests
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
def _print_class_summary(labels):
    """Print the distinct values in ``labels`` and, per value, its count and percentage.

    ``labels`` is the NumPy array obtained from the target DataFrame via
    ``to_numpy()``; the percentage is relative to ``len(labels)``.
    """
    classes = np.unique(labels)
    print(classes)
    total = len(labels)
    for c in classes:
        n_examples = len(labels[labels == c])
        percent = n_examples / total * 100
        print('> Class=%d : %d/%d (%.1f%%)' % (c, n_examples, total, percent))


def dataLoadSm(data_dir='/data/tropical/real_data', random_state=None):
    """Load the train/validation/test CSVs and oversample the training split with SMOTE.

    Parameters
    ----------
    data_dir : str, optional
        Directory holding ``training_real_tom_target.csv``,
        ``validation_real_tom_target.csv`` and ``test_real_tom_target.csv``.
        Defaults to the location that was previously hard-coded.
    random_state : int or None, optional
        Forwarded to ``SMOTE`` so the resampling can be made repeatable.
        ``None`` (the default) keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_train_sm, Y_train_sm, x_val, y_val, x_test, y_test)``. Only the
        training pair is resampled; the validation and test frames are the
        selected columns of their CSV files.
    """
    feature_cols = ['vo', 'r', 'u_200', 'u_850', 'v_200', 'v_850', 'ttr', 'sst']
    target_cols = ['Real_tom_lsm']

    test = pd.read_csv(os.path.join(data_dir, 'test_real_tom_target.csv'))
    train = pd.read_csv(os.path.join(data_dir, 'training_real_tom_target.csv'))
    val = pd.read_csv(os.path.join(data_dir, 'validation_real_tom_target.csv'))
    print(train.head())

    # Split every frame into features / target using the same column lists.
    x_train = train[feature_cols]
    print("******training features******")
    y_train = train[target_cols]
    print("******training target******")

    x_test = test[feature_cols]
    print("******test features******")
    y_test = test[target_cols]
    print("******test target******")

    x_val = val[feature_cols]
    print("******validation features******")
    y_val = val[target_cols]
    print("******validation target******")

    print('Before Smote')
    _print_class_summary(y_train.to_numpy())

    # Oversample the training split only; validation and test are left untouched.
    smt = SMOTE(random_state=random_state)
    X_train_sm, Y_train_sm = smt.fit_resample(x_train, y_train)

    print('After SMOTE')
    _print_class_summary(Y_train_sm.to_numpy())

    return X_train_sm, Y_train_sm, x_val, y_val, x_test, y_test

In [None]:
# Load the three splits. Only the training pair comes back SMOTE-resampled;
# the validation and test frames are returned without resampling.
x_train_sm, y_train_sm, x_val, y_val, x_test, y_test = dataLoadSm()

In [None]:
from sklearn.preprocessing import StandardScaler
# NOTE: this instance is never fitted or used in this cell; every column gets
# its own scaler inside the loop below.
scaler = StandardScaler()

# Work on copies so the unscaled frames stay available.
X_train_stand = x_train_sm.copy()  # SMOTE-resampled training features
X_valid_stand = x_val.copy()       # validation features (not resampled)
X_test_stand = x_test.copy()       # test features (not resampled)

num_cols = ['vo', 'r', 'u_200', 'u_850', 'v_200', 'v_850', 'ttr', 'sst']

# Standardize each numerical feature with statistics taken from the training split only.
for i in num_cols:
    # Fit on the training column before it is overwritten.
    scale = StandardScaler().fit(X_train_stand[[i]])

    # Apply the same fitted transform to the training, validation and test columns, in that order.
    for split_frame in (X_train_stand, X_valid_stand, X_test_stand):
        split_frame[i] = scale.transform(split_frame[[i]])

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix

# get a list of models to evaluate
def get_models():
    """Return the grid of unfitted ExtraTreesClassifier candidates, keyed by name.

    One estimator is created for every combination of tree count, minimum
    leaf size and ``max_features`` rule. Keys follow the pattern
    ``'<n>trees_<leaf>leafSamples_<rule>maxFeat'`` and are inserted with the
    tree count varying slowest and the feature rule varying fastest.
    """
    tree_counts = (10, 50, 100, 500, 1000, 5000)
    # Wider sweep kept for reference: 1, 10, 50, 100, 500, 1000, 5000
    leaf_sizes = (50,)
    feature_rules = ('sqrt', 'log2', None)

    return {
        '{}trees_{}leafSamples_{}maxFeat'.format(n_est, leaf, rule): ExtraTreesClassifier(
            n_estimators=n_est,
            min_samples_leaf=leaf,
            max_features=rule,
            n_jobs=-1,
        )
        for n_est in tree_counts
        for leaf in leaf_sizes
        for rule in feature_rules
    }

# evaluate a given model using validation
def evaluate_model(model, x_train, y_train, x_val, y_val):
    """Fit ``model`` on the training split and score it on the validation split.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``.
    x_train, y_train : array-like
        Training features and labels passed to ``model.fit``.
    x_val, y_val : array-like
        Validation features and labels used for scoring.

    Returns
    -------
    tuple
        ``(score, conf)`` where ``score`` is the log-loss of the predicted
        probabilities on the validation split and ``conf`` is the confusion
        matrix of the rounded hard predictions.
    """
    model.fit(x_train, y_train)

    # Hand log_loss the label set learnt during fit so the probability columns
    # can still be matched when the validation split lacks one of the classes.
    # Falls back to the default (labels inferred from y_val) if the model has
    # no ``classes_`` attribute.
    score = log_loss(y_val, model.predict_proba(x_val), labels=getattr(model, 'classes_', None))
    conf = confusion_matrix(y_val, np.round(model.predict(x_val)))

    # NOTE: this changes NumPy's global print precision for the whole session.
    np.set_printoptions(precision=2)
    return score, conf

In [None]:
# Build the name -> unfitted ExtraTreesClassifier mapping that the evaluation loop iterates over.
models = get_models()

In [None]:
import pickle

# Make sure the output directory exists before any model is trained.
os.makedirs('results', exist_ok=True)

print("#------------#--------------- Smote is doing ---------------#------------#")

# Targets are single-column DataFrames; flatten them once to 1-D label arrays.
y_train_flat = y_train_sm.values.reshape(-1,)
y_val_flat = y_val.values.reshape(-1,)

results1, names1, confs1 = list(), list(), list()
for name1, model in models.items():
    # Train on the standardized SMOTE training frame and score on the
    # standardized validation frame (X_valid_stand).
    res1 = evaluate_model(model, X_train_stand, y_train_flat, X_valid_stand, y_val_flat)
    results1.append(res1[0])
    names1.append(name1)
    confs1.append(res1[1])

print("the smote is done ---->")

# Persist scores, model names and confusion matrices as three separate pickles.
# The context manager closes each file even if pickling raises.
for label, stem, payload in (
    ('results', 'results_smote', results1),
    ('name', 'names_smote', names1),
    ('confs', 'confs_smote', confs1),
):
    file_name = "results/%s.pkl" % stem
    with open(file_name, "wb") as open_file:
        pickle.dump(payload, open_file)
    print("the %s for smote is saved on %s.pkl" % (label, stem))



In [None]:
# Prints a fixed string only; presumably a marker that the notebook ran to the end — TODO confirm it is still wanted.
print("yuhooo")