# Predicting a Pulsar Star

In [38]:
#load the necessary modules
import pandas as pd
import numpy as np
from scipy.io import loadmat
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

import datetime
import os

In [39]:
def model_fit_report(model,x_values,y_values_actual,y_values_predicted,y_values_probability):
    print("Model Fit Results")
    print("")
    print ("Accuracy: %0.3f" % accuracy_score(y_values_actual,y_values_predicted))
    print("")
    print("Confusion Matrix")
#     CM = plot_confusion_matrix(model,x_values,y_values_actual,values_format = 'n')
#     print (CM)
    print("")
#     print (confusion_matrix(y_values_actual,y_values_predicted))
    print("")
    print("Classification Report")
    print("")
    print (classification_report(y_values_actual,y_values_predicted))

    

In [40]:
path = '../ten-datasets/'

#load the dataset into a dataframe
df = pd.read_csv(path + 'pulsar_stars.csv')

In [41]:
df.head()

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [42]:
df.shape

(17898, 9)

In [43]:
df.target_class

0        0
1        0
2        0
3        0
4        0
        ..
17893    0
17894    0
17895    0
17896    0
17897    0
Name: target_class, Length: 17898, dtype: int64

In [44]:
y = df.iloc[:,-1]
X = df.iloc[:,0:-1]

X.shape
X.head()
y.head()

0    0
1    0
2    0
3    0
4    0
Name: target_class, dtype: int64

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size =0.2, random_state = 0, stratify=y)

In [48]:
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)


(14318, 8)
(14318,)
(3580, 8)
(3580,)


In [47]:
from sklearn.preprocessing import StandardScaler
standardScaler = StandardScaler()
X_train = standardScaler.fit_transform(X_train)
X_test = standardScaler.transform(X_test)

# Shallow

## LogisticRegression

In [11]:
from sklearn.linear_model import LogisticRegression


In [12]:
lr = LogisticRegression(C=100,max_iter = 100, tol=.1,solver='sag')


In [13]:
%%time
lr.fit(X_train, y_train)


CPU times: user 14.8 ms, sys: 1.83 ms, total: 16.6 ms
Wall time: 15.4 ms


LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='sag', tol=0.1, verbose=0,
                   warm_start=False)

In [14]:
y_train_predict = lr.predict_proba(X_train).argmax(axis = 1)
print("Logistic train Accuracy: %.3f" % accuracy_score(y_train,y_train_predict))
model_fit_report(lr,X_train,y_train,y_train_predict,lr.predict_proba(X_train))

#get results for test dataset
y_test_predict = lr.predict_proba(X_test).argmax(axis = 1)
print("Logistic Test Accuracy: %.3f" % accuracy_score(y_test,y_test_predict))
model_fit_report(lr,X_test,y_test,y_test_predict,lr.predict_proba(X_test))

Logistic train Accuracy: 0.976
Model Fit Results

Accuracy: 0.976

Confusion Matrix


Classification Report

              precision    recall  f1-score   support

           0       0.98      1.00      0.99     13007
           1       0.95      0.79      0.86      1311

    accuracy                           0.98     14318
   macro avg       0.96      0.89      0.92     14318
weighted avg       0.98      0.98      0.98     14318

Logistic Test Accuracy: 0.979
Model Fit Results

Accuracy: 0.979

Confusion Matrix


Classification Report

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3252
           1       0.95      0.81      0.88       328

    accuracy                           0.98      3580
   macro avg       0.97      0.90      0.93      3580
weighted avg       0.98      0.98      0.98      3580



In [17]:
# from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor

In [21]:
%%time

tree = DecisionTreeRegressor()
# forest = RandomForestClassifier(criterion='gini', max_leaf_nodes=50,
#                                 n_estimators = 50, random_state =1,
#                                 n_jobs =-1,verbose=True, oob_score=True)
# forest.fit(TwoDim_X_train,y_train)
tree.fit(X_train,y_train)
# forest.fit(reduced_X_train,y_train)
# tree.oob_score_

CPU times: user 122 ms, sys: 3.79 ms, total: 126 ms
Wall time: 128 ms


DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [24]:
%%time
# y_train_predict = forest.predict_proba(TwoDim_X_train).argmax(axis = 1)
y_train_predict = tree.predict(X_train)
# y_train_predict = forest.predict_proba(reduced_X_train).argmax(axis = 1)
print ("Logistic Train Accuracy: %.3f" % accuracy_score(y_train,y_train_predict))
y_train_predict

Logistic Train Accuracy: 1.000
CPU times: user 4.98 ms, sys: 1.9 ms, total: 6.88 ms
Wall time: 5.26 ms


array([0., 0., 0., ..., 0., 0., 0.])

In [27]:
%%time

# y_test_predict = forest.predict_proba(TwoDim_X_test).argmax(axis = 1)
y_test_predict = tree.predict(X_test)
# y_test_predict = forest.predict_proba(reduced_X_test).argmax(axis = 1)
print("Logistic Test Accuracy: %.3f" % accuracy_score(y_test,y_test_predict))

Logistic Test Accuracy: 0.966
CPU times: user 2.36 ms, sys: 1.02 ms, total: 3.39 ms
Wall time: 4.2 ms


# deep learning

In [32]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score


## ANN



In [108]:
X_train.shape

(14318, 8)

In [121]:
ann_model = Sequential()
ann_model.add(Dense(
        units = 4, kernel_initializer = 'uniform', activation = 'relu', input_dim = X_train.shape[1]))

ann_model.add(Dense(units = 4, kernel_initializer = 'uniform', activation = 'relu'))

ann_model.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
ann_model.summary()


Model: "sequential_23"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_35 (Dense)             (None, 4)                 36        
_________________________________________________________________
dense_36 (Dense)             (None, 4)                 20        
_________________________________________________________________
dense_37 (Dense)             (None, 1)                 5         
Total params: 61
Trainable params: 61
Non-trainable params: 0
_________________________________________________________________


In [125]:
ann_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
ann_model.fit(X_train, y_train, batch_size=64, epochs=5)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x1a364edc50>

In [123]:
result = ann_model.evaluate(X_test, y_test)



In [124]:
print(model.metrics_names)
print(result)

['loss', 'accuracy']
[0.1088016588428167, 0.9784916043281555]


## CNN

In [52]:
from keras.models import Sequential
from keras.layers import Input, Conv1D, Conv2D, BatchNormalization, Activation, MaxPooling1D, MaxPooling2D, Dropout, Flatten, Dense

In [53]:
X_train_resh = X_train[:, :, np.newaxis]
X_test_resh = X_test[:, :, np.newaxis]
print(X_train_resh.shape)
print(y_train.shape)

print(X_test_resh.shape)
print(y_test.shape)

(14318, 8, 1)
(14318,)
(3580, 8, 1)
(3580,)


In [75]:
X_train_resh[0]

array([[-0.63397712],
       [-0.9819066 ],
       [-0.03306956],
       [-0.07815054],
       [-0.37014816],
       [-0.63049162],
       [ 0.7005112 ],
       [ 0.48032751]])

> 第一层卷积层，Conv2D(32, kernel_size=(3, 2), input_shape=(8,8,1))，Param=(3*2*1+1)*32 = 224. 第二层卷积层，Conv2D(64, (2, 3), activation='relu')，经过第一个层32个卷积核的作用，第二层输入数据通道数为32，Param=(2*3*32+1)*64 = 12352. 第三层卷积层，Conv2D(64, (2, 2), activation='relu')，经过第二个层64个卷积核的作用，第二层输入数据通道数为64，Param=(2*2*64+1)*64 = 16448. total_params = (filter_height * filter_width * input_image_channels + 1) * number_of_filters

In [166]:
#bn_axis = 3

model = Sequential()

model.add(Conv1D(filters = 16, kernel_size = 3, activation='tanh', input_shape=X_train_resh.shape[1:],padding = 'same', strides=1))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.25))

model.add(Conv1D(filters = 32, kernel_size = 3,padding = 'same',activation='tanh',))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(1024, activation='tanh'))
model.add(Dropout(0.25))
        
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])          

model.summary()

Model: "sequential_32"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_42 (Conv1D)           (None, 8, 16)             64        
_________________________________________________________________
max_pooling1d_26 (MaxPooling (None, 4, 16)             0         
_________________________________________________________________
dropout_43 (Dropout)         (None, 4, 16)             0         
_________________________________________________________________
conv1d_43 (Conv1D)           (None, 4, 32)             1568      
_________________________________________________________________
max_pooling1d_27 (MaxPooling (None, 2, 32)             0         
_________________________________________________________________
dropout_44 (Dropout)         (None, 2, 32)             0         
_________________________________________________________________
flatten_18 (Flatten)         (None, 64)              

In [167]:
# X_train.shape

In [168]:
for i in range(len(model.layers)):
     print(model.get_layer(index=i).output)

Tensor("conv1d_42/Tanh:0", shape=(?, 8, 16), dtype=float32)
Tensor("max_pooling1d_26/Squeeze:0", shape=(?, 4, 16), dtype=float32)
Tensor("dropout_43/cond/Merge:0", shape=(?, 4, 16), dtype=float32)
Tensor("conv1d_43/Tanh:0", shape=(?, 4, 32), dtype=float32)
Tensor("max_pooling1d_27/Squeeze:0", shape=(?, 2, 32), dtype=float32)
Tensor("dropout_44/cond/Merge:0", shape=(?, 2, 32), dtype=float32)
Tensor("flatten_18/Reshape:0", shape=(?, ?), dtype=float32)
Tensor("dense_52/Tanh:0", shape=(?, 1024), dtype=float32)
Tensor("dropout_45/cond/Merge:0", shape=(?, 1024), dtype=float32)
Tensor("dense_53/Sigmoid:0", shape=(?, 1), dtype=float32)


In [169]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_resh, y_train, batch_size=64, epochs=10)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x1a3af060b8>

In [170]:
result = model.evaluate(X_test_resh, y_test)



In [171]:
print(model.metrics_names)
print(result)

['loss', 'accuracy']
[0.0684083207109787, 0.9815642237663269]
