## From https://www.kaggle.com/parthsuresh/binary-classifier-using-keras-97-98-accuracy

In [98]:
# Reset variables: start from a clean namespace so stale names from earlier
# runs cannot leak into this one.
#   -s : soft reset (clears the namespace, keeps history)
#   -f : do not ask for confirmation
# `run_line_magic` is used instead of the deprecated `get_ipython().magic()`.
from IPython import get_ipython
get_ipython().run_line_magic('reset', '-sf')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.feature_selection import VarianceThreshold
from sklearn import preprocessing # To get MinMax Scaler function

# Enable multiple outputs per cell: with "all", every top-level expression
# statement in a cell is displayed, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# hide warnings
import warnings
#warnings.filterwarnings('ignore')

In [109]:
# Importing libraries for building the neural network
from keras.models import Sequential
from keras.layers import Dense
from keras.models import model_from_json
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [100]:
# Baseline model for the neural network. We choose a hidden layer of 10 neurons.
# The small number of neurons is meant to squeeze out redundancies in the data
# and keep the more important features.
def create_baseline(input_dim=52):
    """Build and compile the baseline binary classifier.

    Architecture: one hidden Dense layer (10 units, ReLU) followed by a single
    sigmoid output unit.

    Parameters
    ----------
    input_dim : int, default 52
        Number of input features. The default keeps the original hard-coded
        value; pass the real column count when the feature set changes, e.g.
        ``KerasClassifier(build_fn=create_baseline, input_dim=X.shape[1], ...)``
        (the wrapper forwards keyword arguments that match this signature).

    Returns
    -------
    keras.models.Sequential
        The compiled, untrained model.
    """
    model = Sequential()
    model.add(Dense(10, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    # Compile model. We use the logarithmic loss function and the Adam gradient optimizer.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [101]:
# Read the three pre-extracted splits (train / test / valid), in that order.
raw_frames = [
    pd.read_csv(csv_path, sep=',', encoding='utf-8')
    for csv_path in (
        './datasets/train_extracted.csv',
        './datasets/test_extracted.csv',
        './datasets/valid_extracted.csv',
    )
]

# Just in case of NaN values: discard every row that has at least one.
# Note that dropna() keeps the original row labels, so the index may have gaps.
df_train, df_test, df_valid = (frame.dropna() for frame in raw_frames)

# Alias kept from the notebook this one is derived from.
df = df_train

# Seed NumPy's global random generator.
seed = 5
np.random.seed(seed)

In [102]:
# Change label into a binary classification target: values >= 0.5 become 1,
# values < 0.5 become 0.
#
# The assignment goes through a single `frame.loc[mask, 'label']` call instead
# of the chained `frame.label.loc[mask] = ...`. Chained assignment writes to an
# intermediate Series, which raises SettingWithCopy warnings and silently does
# nothing under pandas Copy-on-Write.
for split_frame in (df_train, df_test, df_valid):
    # Both masks are computed before any write so they reflect the raw labels.
    is_positive = split_frame['label'] >= 0.5
    is_negative = split_frame['label'] < 0.5
    split_frame.loc[is_positive, 'label'] = 1
    split_frame.loc[is_negative, 'label'] = 0

# Pick out target data.
df_train_target = df_train['label']
df_test_target = df_test['label']
df_valid_target = df_valid['label']

In [103]:
# Pick out feature data: everything except the target ('label') and the
# 'statement' column.
df_train_feature = df_train.drop(columns=['label', 'statement'])
df_test_feature = df_test.drop(columns=['label', 'statement'])
df_valid_feature = df_valid.drop(columns=['label', 'statement'])

In [104]:
# remove low variance data
def variance_threshold(df, threshold=0.0):
    """Return ``df`` restricted to the columns whose variance exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature frame; it is not modified.
    threshold : float, default 0.0
        Columns whose variance is not above this value are dropped.

    Returns
    -------
    pandas.DataFrame
        The surviving columns of ``df``, in their original order and with
        their original values.

    Notes
    -----
    scikit-learn's ``VarianceThreshold`` raises ``ValueError`` when no column
    meets the threshold.
    """
    selector = VarianceThreshold(threshold=threshold)
    # Only the fitted support mask is needed; the transformed array that
    # fit_transform() would build was never used.
    selector.fit(df)
    # Select by boolean position mask so duplicate column labels cannot pull
    # in extra columns.
    return df.loc[:, selector.get_support()]

# The threshold is p * (1 - p) with p = 0.8, i.e. the variance of a 0/1
# feature that takes the same value in 80% of the rows.
# Any training column with variance lower than 0.16 is eliminated.
df_train_feature = variance_threshold(df_train_feature, threshold=.8 * (1 - .8))

In [105]:
# Remove from the test and valid sets the features that the variance filter
# dropped from the training set (columns present in test but no longer in train).
feature_removed = [
    column for column in df_test_feature.columns
    if column not in df_train_feature.columns
]
df_test_feature = df_test_feature.drop(columns=feature_removed)
df_valid_feature = df_valid_feature.drop(columns=feature_removed)

In [106]:
# Initializing the MinMaxScaler function
min_max_scaler = preprocessing.MinMaxScaler()

# Scaling dataset keeping the columns name.
#
# The scaler is fitted on the TRAINING features only and then applied unchanged
# to the validation and test features. Re-fitting on each split (as before)
# gives every split its own min/max, so the same raw value maps to different
# scaled values and held-out statistics leak into preprocessing.
# Consequence: validation/test values may fall slightly outside [0, 1].
#
# Validation/test columns are selected in training order so that the scaler
# sees exactly the features it was fitted on (a missing column raises KeyError).
#
# NOTE: building the frames from the scaled arrays resets the index to
# 0..n-1, while the *_target Series keep the labels left by dropna().
feature_columns = df_train_feature.columns
df_train_feature_scaled = pd.DataFrame(
    min_max_scaler.fit_transform(df_train_feature), columns=feature_columns)
df_valid_feature_scaled = pd.DataFrame(
    min_max_scaler.transform(df_valid_feature[feature_columns]), columns=feature_columns)
df_test_feature_scaled = pd.DataFrame(
    min_max_scaler.transform(df_test_feature[feature_columns]), columns=feature_columns)

In [107]:
# Design matrix and target vector used for cross-validation.
X, Y = df_train_feature_scaled, df_train_target

In [108]:
# Evaluate model using standardized dataset.
# The pipeline re-fits StandardScaler inside every fold (on that fold's
# training part only) before training a fresh baseline network.
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(build_fn=create_baseline, epochs=100, batch_size=5, verbose=0)),
]
pipeline = Pipeline(estimators)

# Pin the shuffled fold assignment to `seed` so the splits are the same on
# every run, regardless of how much of NumPy's global random stream earlier
# cells consumed. (Network weight initialisation may still vary between runs.)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

# cross_val_score returns one accuracy per fold; report mean and std in percent.
results = cross_val_score(pipeline, X, Y, cv=kfold)
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Results: 56.53% (1.27%)
