In [1]:
# Execute the helper script in this kernel's namespace.
# NOTE(review): `FermiSources` (and `MLspec`, used further down) are never imported in
# this notebook, so they are presumably defined by this script — confirm.
%run pylib/fermi_sources-new
# Load the source catalogue; the constructor reports how many rows were read and selected.
fs_data = FermiSources('files/fermi_sources_v2.csv')
#train = DataSet('log_nbb log_epeak pindex curvature category'.split(), 'bll fsrq psr'.split())

Read 6699 source entries from `files/fermi_sources_v2.csv`, selected 6491 with criteria 'delta<0.25 & curvature<1.01'

In [2]:
# Feature table `X` and class labels `ay` for the object's own `mlspec`.
# The cells below compare `ay` entries with the strings 'bll', 'fsrq' and 'psr'.
X,ay = fs_data.getXy(fs_data.mlspec)

In [3]:
# Integer code for each source class. The order (bll, fsrq, psr) is the order the
# plotting cells list their three ListedColormap colours in.
LABEL_CODES = {'bll': 0, 'fsrq': 1, 'psr': 2}

# Vectorised encoding of the string labels. This replaces a per-element loop that
# indexed the Series with bare integers (`ay[t]`), which pandas treats as label
# lookups and only falls back to positions with a deprecation warning.
# Labels that are not in LABEL_CODES pass through unchanged, as the loop did.
y = ay.map(lambda label: LABEL_CODES.get(label, label))

Make 10 plots (one per classifier), each with one of the training variables on each axis, and color the points with 3 colors according to the source classification.


Step 1: Train the program and isolate all the unidentified sources

Step 2: Select 2 variables to plot

Step 3: Be able to have 3 categories

Step 4: Make scatter plot of sources

Step 5: Test accuracy of sources using different classifiers

In [4]:
import os, sys, glob
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.inspection import DecisionBoundaryDisplay

<h1> Purple: BLL | Green: FSRQ | Red: PSR </h1>

In [9]:
# Train every classifier on each pair of features and save one figure per classifier
# showing its decision regions, the train/test points and the test accuracy.

# Output directory. parents=True so the cell also works when `new_plots/` is missing.
p = Path('new_plots/classifier_results_grids')
p.mkdir(parents=True, exist_ok=True)

# One seed for the train/test split and for the stochastic classifiers, so that a
# rerun reproduces the same scores.
SEED = 42

names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    #"Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    #GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(random_state=SEED),
    RandomForestClassifier(n_estimators=100, max_features=2, random_state=SEED),
    MLPClassifier(alpha=1, max_iter=1000, random_state=SEED),
    AdaBoostClassifier(random_state=SEED),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

# the data sets: every pair of feature columns, selected by explicit position.
# (The previous stepped slices such as `::2` picked the same columns only because
# X has exactly four columns.)
datasets = [
    (X.iloc[:, [0, 1]], y),  # log_nbb and log_epeak
    (X.iloc[:, [0, 2]], y),  # log_nbb and pindex
    (X.iloc[:, [0, 3]], y),  # log_nbb and curvature
    (X.iloc[:, [1, 2]], y),  # epeak and pindex
    (X.iloc[:, [1, 3]], y),  # epeak and curvature
    (X.iloc[:, [2, 3]], y),  # pindex and curvature
]

# Colour maps do not depend on the data set, so build them once.
cm = plt.cm.rainbow
cm_bright = ListedColormap(["#5F00DB", "#5EDA94", "#DB0000"])  # in order: bll, fsrq, psr

figure = plt.figure(figsize=(11, 16))

# raw data: one panel per feature pair, training points opaque, test points faded
for ds_cnt, ds in enumerate(datasets):
    i = ds_cnt + 1  # 1-based subplot slot

    # split into training and test part
    theX, they = ds
    X_train, X_test, y_train, y_test = train_test_split(
        theX, they, test_size=0.4, random_state=SEED
    )

    x_min, x_max = theX.iloc[:, 0].min() - 0.5, theX.iloc[:, 0].max() + 0.5
    y_min, y_max = theX.iloc[:, 1].min() - 0.5, theX.iloc[:, 1].max() + 0.5

    ax = plt.subplot(len(datasets), 3, i)
    if ds_cnt == 0:
        ax.set_title("Input data")

    # Plot the training points
    ax.scatter(X_train.iloc[:, 0], X_train.iloc[:, 1], c=y_train, cmap=cm_bright, edgecolors="k")

    # Plot the testing points
    ax.scatter(
        X_test.iloc[:, 0], X_test.iloc[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, edgecolors="k"
    )
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())

filename = p/'raw.png'
plt.savefig(filename, bbox_inches='tight')
# Start the first classifier on an empty figure; otherwise its panels are drawn on
# top of the raw-data axes that were just saved.
figure.clear()


# iterate over classifiers
for name, clf in zip(names, classifiers):
    for ds_cnt, ds in enumerate(datasets):
        i = ds_cnt + 1  # 1-based subplot slot
        theX, they = ds
        X_train, X_test, y_train, y_test = train_test_split(
            theX, they, test_size=0.4, random_state=SEED
        )

        x_min, x_max = theX.iloc[:, 0].min() - 0.5, theX.iloc[:, 0].max() + 0.5
        y_min, y_max = theX.iloc[:, 1].min() - 0.5, theX.iloc[:, 1].max() + 0.5

        ax = plt.subplot(len(datasets), 3, i)

        # Build a fresh scaler+classifier pipeline for every panel under its own
        # name. Rebinding `clf` here would wrap the previous pipeline in another
        # pipeline on each pass through the loop.
        model = make_pipeline(StandardScaler(), clf)
        model.fit(X_train, y_train.to_numpy(dtype='int64'))
        score = model.score(X_test, y_test.to_numpy(dtype='int64'))
        DecisionBoundaryDisplay.from_estimator(
            model, theX, cmap=cm, alpha=0.8, ax=ax, eps=0.5
        )

        # Plot the training points
        ax.scatter(
            X_train.iloc[:, 0], X_train.iloc[:, 1], c=y_train, cmap=cm_bright, edgecolors="k"
        )
        # Plot the testing points
        ax.scatter(
            X_test.iloc[:, 0],
            X_test.iloc[:, 1],
            c=y_test,
            cmap=cm_bright,
            edgecolors="k",
            alpha=0.6,
        )

        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)
        ax.set_xticks(())
        ax.set_yticks(())
        if ds_cnt == 0:
            ax.set_title(name)
        # Test accuracy in the lower-right corner, without the leading zero
        ax.text(
            x_max - 0.3,
            y_min + 0.3,
            ("%.2f" % score).lstrip("0"),
            size=15,
            horizontalalignment="right",
        )

    filename = p/f'{name}'
    plt.tight_layout()
    plt.savefig(f'{filename}_plt.png', bbox_inches='tight')

    # Empty the figure so the next classifier starts from a clean canvas
    figure.clear()
    print(name)
    


Nearest Neighbors
Linear SVM
RBF SVM
Decision Tree
Random Forest
Neural Net
AdaBoost
Naive Bayes
QDA


<Figure size 1100x1600 with 0 Axes>

<h2>With epeak</h2>

AdaBoost: 78.66%

Decision Tree: 71.17%

Gaussian Process: 80.5%

Linear SVM: 75.83%

Naive Bayes: 76.66%

Nearest Neighbor: 75.33%

Neural Net: 79.17%

QDA: 77.5%

Random Forest: 74.67%

RBF SVM: 79%

AdaBoost: 78.66%

Decision Tree: 72.5%

Gaussian Process: 80.5%

Linear SVM: 79.33%

Naive Bayes: 79.16%

Nearest Neighbor: 77%

Neural Net: 80.66%

QDA: 79.5%

Random Forest: 76%

RBF SVM: 80.83%

In [5]:
# Build the feature table of the sources whose `association` is 'unid'.
unid =  FermiSources('files/fermi_sources_v2.csv')
# `('unid')` is just the string 'unid'; the trailing comma makes it the one-element
# tuple that a sequence-of-names argument calls for.
# NOTE(review): MLspec is defined outside this notebook — confirm it expects a sequence here.
specs = MLspec(features=('log_nbb','log_epeak','pindex','curvature'), target='association', target_names=('unid',))
# getXy returns (features, labels); only the features are kept.
X_te = unid.getXy(specs)[0]

Read 6699 source entries from `files/fermi_sources_v2.csv`, selected 6491 with criteria 'delta<0.25 & curvature<1.01'

In [None]:
# For every classifier: predict the class of each source in X_te from all features,
# then save one image per feature pair showing the 2-D decision regions with the
# X_te sources drawn on top, coloured by that prediction.

# Output root. parents=True so the cell also works when `new_plots/` is missing.
p = Path('new_plots/single_sources_plots')
p.mkdir(parents=True, exist_ok=True)

# Seed for the stochastic classifiers so that a rerun reproduces the same plots.
SEED = 42

names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    #"Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    #GaussianProcessClassifier(),
    DecisionTreeClassifier(random_state=SEED),
    RandomForestClassifier(n_estimators=100, max_features=2, random_state=SEED),
    MLPClassifier(alpha=1, max_iter=1000, random_state=SEED),
    AdaBoostClassifier(random_state=SEED),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

# Column positions of every feature pair, shared by the training and the X_te tables
# so the two lists below cannot drift apart.
feature_pairs = [
    [0, 1],  # log_nbb and log_epeak
    [0, 2],  # log_nbb and pindex
    [0, 3],  # log_nbb and curvature
    [1, 2],  # epeak and pindex
    [1, 3],  # epeak and curvature
    [2, 3],  # pindex and curvature
]

# the data sets
datasets = [(X.iloc[:, pair], y) for pair in feature_pairs]

# Same feature pairs taken from X_te. Only element [0] of each tuple is used below;
# `y` (the training labels) is kept as the second element purely to preserve the
# original tuple layout.
testval = [(X_te.iloc[:, pair], y) for pair in feature_pairs]

# Integer code per predicted label, in the colour order of `cm_bright`.
label_codes = {'bll': 0, 'fsrq': 1, 'psr': 2}

cm = plt.cm.rainbow
cm_bright = ListedColormap(["#5F00DB", "#5EDA94", "#DB0000"])  # in order: bll, fsrq, psr

figure = plt.figure(figsize=(22, 32)) #grid size:11,16 | single size

# iterate over classifiers
for name, clf in zip(names, classifiers):
    # One sub-directory per classifier; create it so savefig cannot fail on a
    # missing folder.
    newp = p / name
    newp.mkdir(parents=True, exist_ok=True)

    # Full-feature model. Its inputs do not depend on the feature pair, so it is
    # fitted once per classifier instead of once per panel, and it is always the
    # same scaler+classifier pipeline. clone() gives it its own estimator so the
    # 2-D fits below cannot overwrite it.
    model = make_pipeline(StandardScaler(), clone(clf))
    model.fit(X, ay)
    ynew = model.predict(X_te)

    # Predicted string labels -> integer colour codes
    ycopy = np.array([label_codes[label] for label in ynew], dtype=int)

    for ds_cnt, ds in enumerate(datasets):
        i = ds_cnt + 1  # 1-based subplot slot, also used in the file name
        j = ds_cnt      # index of the matching X_te feature pair
        theX, they = ds
        Xnew = testval[j][0]

        x_min, x_max = theX.iloc[:, 0].min() - 0.5, theX.iloc[:, 0].max() + 0.5
        y_min, y_max = theX.iloc[:, 1].min() - 0.5, theX.iloc[:, 1].max() + 0.5

        ax = plt.subplot(len(datasets), 3, i)

        # Separate 2-D model, used only to draw the decision regions of this pair
        panel_model = make_pipeline(StandardScaler(), clone(clf))
        panel_model.fit(theX, they.to_numpy(dtype='int64'))
        DecisionBoundaryDisplay.from_estimator(
            panel_model, theX, cmap=cm, alpha=0.8, ax=ax, eps=0.5
        )

        # Plot the X_te sources, coloured by the full-feature prediction
        ax.scatter(
            Xnew.iloc[:, 0],
            Xnew.iloc[:, 1],
            c=ycopy,
            marker = '*',
            cmap=cm_bright,
            edgecolors="k",
            alpha=0.8,
        )

        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)
        ax.set_xticks(())
        ax.set_yticks(())
        if ds_cnt == 0:
            ax.set_title(name)

        filename = newp/f'{name}'
        plt.tight_layout()
        plt.savefig(f'{filename}_plt_{i}.png', bbox_inches='tight')
        # One image per panel: empty the figure before the next feature pair
        figure.clear()

    print(name)
    


Nearest Neighbors
Linear SVM


In [30]:
# Re-read the catalogue CSV and scatter the feature pair left in `theX` / `they`
# by the plotting loops above.
# NOTE(review): this cell currently raises
#   ValueError: Could not interpret value `plex` for parameter `style`
# The intended `style` column cannot be determined from this notebook — fix or remove.
# NOTE(review): `pd` and `sns` are not imported in the notebook's import cell;
# presumably they come from the `%run` script — confirm.
df = pd.read_csv("files/fermi_sources_v2.csv")

sns.scatterplot(df, x=theX.iloc[:, 0], y=theX.iloc[:, 1], hue=they, 
                style='plex', s=100, alpha=0.6)

ValueError: Could not interpret value `plex` for parameter `style`

In [13]:
# Inspect the integer class codes left in `ycopy` by the single-source plotting cell.
# NOTE(review): depends on a loop variable leaked from that cell; looks like a leftover check.
ycopy.astype(int)

array([0, 2, 2, ..., 1, 0, 1])