# Predicting a Pulsar Star

In [213]:
#load the necessary modules
import pandas as pd
import numpy as np
from scipy.io import loadmat
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

import datetime
import os

In [214]:
def model_fit_report(model,x_values,y_values_actual,y_values_predicted,y_values_probability):
    print("Model Fit Results")
    print("")
    print ("Accuracy: %0.3f" % accuracy_score(y_values_actual,y_values_predicted))
    print("")
    print("Confusion Matrix")
#     CM = plot_confusion_matrix(model,x_values,y_values_actual,values_format = 'n')
#     print (CM)
    print("")
#     print (confusion_matrix(y_values_actual,y_values_predicted))
    print("")
    print("Classification Report")
    print("")
    print (classification_report(y_values_actual,y_values_predicted))

    

In [224]:
path = '../ten-datasets/'

#load the dataset into a dataframe
df = pd.read_csv(path + 'pulsar_stars.csv')

In [225]:
df.head()

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [226]:
df.shape

(17898, 9)

In [227]:
df.target_class

0        0
1        0
2        0
3        0
4        0
        ..
17893    0
17894    0
17895    0
17896    0
17897    0
Name: target_class, Length: 17898, dtype: int64

In [228]:
y = df.iloc[:,-1]
X = df.iloc[:,0:-1]

X.head()
# y.head()
X.shape


(17898, 8)

In [229]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size =0.2, random_state = 0, stratify=y)

In [230]:
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)


(14318, 8)
(14318,)
(3580, 8)
(3580,)


In [231]:
from sklearn.preprocessing import StandardScaler
standardScaler = StandardScaler()
X_train = standardScaler.fit_transform(X_train)
X_test = standardScaler.transform(X_test)

# Shallow

## LogisticRegression

In [232]:
from sklearn.linear_model import LogisticRegression


In [233]:
lr = LogisticRegression(C=100,max_iter = 100, tol=.1,solver='sag')


In [234]:
%%time
lr.fit(X_train, y_train)


CPU times: user 14.6 ms, sys: 7.85 ms, total: 22.4 ms
Wall time: 27.4 ms


LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='sag', tol=0.1, verbose=0,
                   warm_start=False)

In [235]:
y_train_predict = lr.predict_proba(X_train).argmax(axis = 1)
print("Logistic train Accuracy: %.3f" % accuracy_score(y_train,y_train_predict))
model_fit_report(lr,X_train,y_train,y_train_predict,lr.predict_proba(X_train))

#get results for test dataset
y_test_predict = lr.predict_proba(X_test).argmax(axis = 1)
print("Logistic Test Accuracy: %.3f" % accuracy_score(y_test,y_test_predict))
model_fit_report(lr,X_test,y_test,y_test_predict,lr.predict_proba(X_test))

Logistic train Accuracy: 0.977
Model Fit Results

Accuracy: 0.977

Confusion Matrix


Classification Report

              precision    recall  f1-score   support

           0       0.98      0.99      0.99     13007
           1       0.94      0.80      0.86      1311

    accuracy                           0.98     14318
   macro avg       0.96      0.90      0.93     14318
weighted avg       0.98      0.98      0.98     14318

Logistic Test Accuracy: 0.979
Model Fit Results

Accuracy: 0.979

Confusion Matrix


Classification Report

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3252
           1       0.94      0.82      0.88       328

    accuracy                           0.98      3580
   macro avg       0.96      0.91      0.93      3580
weighted avg       0.98      0.98      0.98      3580



## DecisionTree

In [236]:
# from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor

In [237]:
%%time

tree = DecisionTreeRegressor()
# forest = RandomForestClassifier(criterion='gini', max_leaf_nodes=50,
#                                 n_estimators = 50, random_state =1,
#                                 n_jobs =-1,verbose=True, oob_score=True)
# forest.fit(TwoDim_X_train,y_train)
forest.fit(X_train,y_train)
# forest.fit(reduced_X_train,y_train)
forest.oob_score_

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.2s finished


CPU times: user 920 ms, sys: 68.3 ms, total: 989 ms
Wall time: 360 ms


0.9793267216091633

In [238]:
%%time
# y_train_predict = forest.predict_proba(TwoDim_X_train).argmax(axis = 1)
y_train_predict = forest.predict_proba(X_train).argmax(axis = 1)
# y_train_predict = forest.predict_proba(reduced_X_train).argmax(axis = 1)
print ("Logistic Train Accuracy: %.3f" % accuracy_score(y_train,y_train_predict))
y_train_predict

Logistic Train Accuracy: 0.985
CPU times: user 72.6 ms, sys: 17.3 ms, total: 90 ms
Wall time: 110 ms


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished


array([0, 0, 0, ..., 0, 0, 0])

In [239]:
%%time

# y_test_predict = forest.predict_proba(TwoDim_X_test).argmax(axis = 1)
y_test_predict = forest.predict_proba(X_test).argmax(axis = 1)
# y_test_predict = forest.predict_proba(reduced_X_test).argmax(axis = 1)
print("Logistic Test Accuracy: %.3f" % accuracy_score(y_test,y_test_predict))

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished


Logistic Test Accuracy: 0.980
CPU times: user 30.8 ms, sys: 12.4 ms, total: 43.3 ms
Wall time: 110 ms


# deep learning

In [258]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from keras.layers import Conv1D,GlobalMaxPooling1D,MaxPooling1D,GlobalAveragePooling1D,Input,MaxPool1D


## CNN



In [256]:
X_train.shape

(14318, 8)

In [242]:

model = Sequential()
# size = (2, 10)
model = Sequential()
model.add(Conv1D(16, 16,strides=2, activation='relu',input_shape=(14318,8)))


# model.add(Conv1D(filters=64, kernel_size = 4, padding='same',
#                  activation='relu', input_shape=(8,3),name='layer1'))
# model.add(Conv1D(64, 3, activation='relu'))
# model.add(MaxPooling1D(3))
# model.add(Conv1D(128, 3, activation='relu'))
# model.add(Conv1D(128, 3, activation='relu'))
# model.add(GlobalAveragePooling1D())
# model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.summary()


model.summary()

Model: "sequential_65"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_26 (Conv1D)           (None, 7152, 16)          2064      
_________________________________________________________________
dense_43 (Dense)             (None, 7152, 1)           17        
Total params: 2,081
Trainable params: 2,081
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_65"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_26 (Conv1D)           (None, 7152, 16)          2064      
_________________________________________________________________
dense_43 (Dense)             (None, 7152, 1)           17        
Total params: 2,081
Trainable params: 2,081
Non-trainable params: 0
_________________________________________________________________


In [243]:
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(X_train, y_train, batch_size=16, epochs=10)
score = model.evaluate(X_test, y_test, batch_size=16)

ValueError: Error when checking input: expected conv1d_26_input to have 3 dimensions, but got array with shape (14318, 8)

In [129]:
# ann_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
sgd = SGD(lr=0.01, nesterov=True, decay=1e-6, momentum=0.9)
model.compile(loss='categorical_crossentropy',optimizer=sgd,metrics=['accuracy'])

ann_model.fit(X_train, y_train, batch_size=64, epochs=5)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x1a2f09dfd0>

In [130]:
result = ann_model.evaluate(X_test, y_test)



In [131]:
print(model.metrics_names)
print(result)

['loss', 'accuracy']
[0.07597529138932681, 0.9779329895973206]


## ANN

In [179]:
model = Sequential()
model.add(Dense(units=16, activation='relu', input_shape=(8,), name ="layer01"))
model.add(Dropout(0.2))
model.add(Dense(units=16, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(units=4, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))
model.summary()


Model: "sequential_43"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer01 (Dense)              (None, 16)                144       
_________________________________________________________________
dropout_12 (Dropout)         (None, 16)                0         
_________________________________________________________________
dense_30 (Dense)             (None, 16)                272       
_________________________________________________________________
dropout_13 (Dropout)         (None, 16)                0         
_________________________________________________________________
dense_31 (Dense)             (None, 4)                 68        
_________________________________________________________________
dense_32 (Dense)             (None, 1)                 5         
Total params: 489
Trainable params: 489
Non-trainable params: 0
_______________________________________________________

In [180]:
X_train.shape

(14318, 8)

In [181]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=64, epochs=5)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x1a319c6240>

In [182]:
result = model.evaluate(X_test, y_test)



In [106]:
print(model.metrics_names)
print(result)

['loss', 'accuracy']
[0.07366992108648716, 0.9784916043281555]


In [190]:
testing = pd.read_csv("./leaf-classification/train.csv")

In [191]:
testing.head()

Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,1,Acer_Opalus,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0.0,...,0.007812,0.0,0.00293,0.00293,0.035156,0.0,0.0,0.004883,0.0,0.025391
1,2,Pterocarya_Stenoptera,0.005859,0.0,0.03125,0.015625,0.025391,0.001953,0.019531,0.0,...,0.000977,0.0,0.0,0.000977,0.023438,0.0,0.0,0.000977,0.039062,0.022461
2,3,Quercus_Hartwissiana,0.005859,0.009766,0.019531,0.007812,0.003906,0.005859,0.068359,0.0,...,0.1543,0.0,0.005859,0.000977,0.007812,0.0,0.0,0.0,0.020508,0.00293
3,5,Tilia_Tomentosa,0.0,0.003906,0.023438,0.005859,0.021484,0.019531,0.023438,0.0,...,0.0,0.000977,0.0,0.0,0.020508,0.0,0.0,0.017578,0.0,0.047852
4,6,Quercus_Variabilis,0.005859,0.003906,0.048828,0.009766,0.013672,0.015625,0.005859,0.0,...,0.09668,0.0,0.021484,0.0,0.0,0.0,0.0,0.0,0.0,0.03125
