In [1]:
import numpy as np
from sklearn.neural_network import MLPClassifier

In [2]:
#read in provided data
provided_data = np.load('data/data_selected_1980_2010.npy').astype(np.float)
target = np.load('data/target_1980_2010.npy').astype(np.float)

In [3]:
#check the dimensions of provided data
provided_data.shape

(11300, 4)

In [4]:
#target has two columns and we only need one
target.shape

(11300, 2)

In [5]:
#make it so that there is only one column in target
target = target[:,1].reshape(11300,1)

In [6]:
target.shape

(11300, 1)

In [7]:
#there seems to be a problem with imbalanced target data 
#let's try to oversample under represented class with Synthetic Minority
# Oversampling technique (SMOTE)
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE().fit_sample(provided_data, target)

  y = column_or_1d(y, warn=True)


In [10]:
print X_resampled.shape
print y_resampled.shape

(21470, 4)
(21470,)


In [22]:
#let's split the date to train and test 
#since we have increased the number of samples required data spleat 
#will not do a good job
##########################################
#X_train = provided_data[:9497,:]
#X_test = provided_data[9497:,:]
#y_train = target[:9497,:]
#y_test = target[9497:,:]
##########################################
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.20, random_state=42)

In [23]:
#check the results
print X_train.shape
print X_test.shape
print y_train.shape
print y_test.shape

(17176, 4)
(4294, 4)
(17176,)
(4294,)


In [24]:
#now it is time to normalize the data (i.e. scale it)
from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
# give scaler some more info to work with
normalizer.fit(X_train)

Normalizer(copy=True, norm='l2')

In [25]:
#now transform the data
X_train = normalizer.transform(X_train)
X_test = normalizer.transform(X_test)

In [26]:
#try to train the model
mlp = MLPClassifier()
mlp.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [27]:
#let's try our model
predictions = mlp.predict(X_test)

In [28]:
#now is the time to evaluate how we did
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test,predictions))

[[1416  733]
 [ 306 1839]]
             precision    recall  f1-score   support

        0.0       0.82      0.66      0.73      2149
        1.0       0.72      0.86      0.78      2145

avg / total       0.77      0.76      0.76      4294



In [29]:
from sklearn import metrics
fpr,tpr,thresholds = metrics.roc_curve(y_test, predictions, pos_label=2)

In [30]:
print fpr
print tpr

[ 0.          0.59897531  1.        ]
[ nan  nan  nan]


In [31]:
roc_auc = metrics.auc(fpr,tpr)

In [32]:
import matplotlib.pyplot as plt
plt.figure()
lw = 2
plt.plot(fpr[2], tpr[2], color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2])
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()



IndexError: invalid index to scalar variable.