In [1]:
import pandas as pd
import numpy as np
from astropy.cosmology import Planck13
from astropy.io import fits
from astropy.table import Table
from tqdm import tqdm
%matplotlib inline
import matplotlib.pyplot as plt

import disperse

In [2]:
cosmo = Planck13
H0 = cosmo.H0.value
Om = cosmo.Om0
Ol = 0.69288
Ok = 0.0
print(f'H0 = {cosmo.H0.value}')
print(f'Om = {cosmo.Om0}')
print(f'Ol = {0.69288}')

H0 = 67.77
Om = 0.30712
Ol = 0.69288


In [3]:
coma_DR12 = pd.read_csv('coma_DR12_filtered_RaDecZ.csv')
sdss_opt_gr_full = pd.read_csv('sdss_opt_gr_filtered_RaDecZ.csv')

min_n_gals = 6
sdss_opt_gr = sdss_opt_gr_full[sdss_opt_gr_full['n_gal'] >= min_n_gals]
sdss_opt_gr.reset_index(drop=True, inplace=True)
sdss_opt_gr['R'] = sdss_opt_gr['R200']
sdss_opt_gr

DPS = disperse.Disperse3D(
    coma_DR12, '_disperse_03/bin/',
    H0, Om, Ol, Ok,
    clusters=sdss_opt_gr
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sdss_opt_gr['R'] = sdss_opt_gr['R200']


In [4]:
DPS.count_cart_coords()

In [5]:
SIGMA = 5.0
SMOOTH = 1
BOARD = 'smooth'
ANGLE = 30

In [6]:
sigmas = [
         0.2, 0.4, 0.6, 0.8,
    1.0, 1.2, 1.4, 1.6, 1.8,
    2.0, 2.2, 2.4, 2.6, 2.8,
    3.0, 3.2, 3.4, 3.6, 3.8, 
    4.0, 4.2, 4.4, 4.6, 4.8, 
    5.0, 5.2, 5.4, 5.6, 5.8, 
    6.0, 6.2, 6.4, 6.6, 6.8,
    7.0, 7.2, 7.4, 7.6, 7.8,
    8.0
]

In [7]:
true_cl_dists = []
false_cl_dists = []
for sigma in tqdm(sigmas):
    DPS = disperse.Disperse3D.read(f'coma_dumps/{sigma}/')
    DPS.gen_random_clusters()
    
    cl_conn, fil_conn, cl_dists \
        = DPS.count_conn([3] * DPS.clusters.shape[0])
    true_cl_dists.append(np.array(cl_dists)[None,:])
    
    cl_conn, fil_conn, cl_dists \
        = DPS.count_conn(
            [3] * DPS.clusters.shape[0], 
            DPS.random_clusters[0]
        )
    false_cl_dists.append(np.array(cl_dists)[None,:])

true_dists = np.concatenate(true_cl_dists, axis=0)
false_dists = np.concatenate(false_cl_dists, axis=0)

  0%|          | 0/40 [00:00<?, ?it/s]
100%|██████████| 1046/1046 [00:00<00:00, 13977.76it/s]

  0%|          | 0/1046 [00:00<?, ?it/s][A

>>> Generate random clusters


100%|██████████| 1046/1046 [00:00<00:00, 13284.65it/s]

100%|██████████| 1046/1046 [00:00<00:00, 13000.35it/s]

100%|██████████| 1046/1046 [00:00<00:00, 13480.95it/s]

100%|██████████| 1046/1046 [00:00<00:00, 13998.54it/s]
100%|██████████| 40/40 [00:35<00:00,  1.11it/s]


In [8]:
true_dists.shape, false_dists.shape

((40, 1046), (40, 1046))

In [9]:
X = np.concatenate((true_dists.T, false_dists.T), axis=0)
Y = np.array([1] * DPS.clusters.shape[0] + [0] * DPS.clusters.shape[0])

In [10]:
X.shape, Y.shape

((2092, 40), (2092,))

In [11]:
sigmas = np.array(sigmas)

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [13]:
with open('true_sigmas.npy', 'rb') as f:
    true_sigmas = np.load(f)
with open('false_sigmas.npy', 'rb') as f:
    false_sigmas = np.load(f)

In [34]:
scores = np.concatenate((true_sigmas[2], false_sigmas[0][2]))

In [35]:
roc_auc_score(Y, scores)

0.9143303817876715

In [16]:
scores = []
for i in range(X.shape[0]):
    reg = LinearRegression(fit_intercept=False).fit(sigmas.reshape(-1, 1), X[i].reshape(-1, 1))
    scores.append(reg.coef_[0][0])
scores = np.array(scores)

In [17]:
roc_auc_score(Y, -scores)

0.8251848981278036

In [18]:
scores = []
for i in range(X.shape[0]):
    reg = LinearRegression(fit_intercept=True).fit(sigmas.reshape(-1, 1), X[i].reshape(-1, 1))
    scores.append(reg.coef_[0][0])
scores = np.array(scores)

In [19]:
roc_auc_score(Y, -scores)

0.6164689118886845

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, stratify=Y, test_size=0.33
)

In [21]:
clf = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, Y_train)
scores = clf.predict_proba(X_test)[:,1].reshape(-1)

In [22]:
roc_auc_score(Y_test, scores)

0.9480103878696491

In [23]:
import pickle
with open('coma_log_reg.pkl', 'wb') as f:
    pickle.dump(clf, f)

In [24]:
scaler = StandardScaler().fit(X_train)

In [25]:
clf = LogisticRegression(random_state=0, max_iter=1000).fit(scaler.transform(X_train), Y_train)
scores = clf.predict_proba(scaler.transform(X_test))[:,1].reshape(-1)

In [26]:
roc_auc_score(Y_test, scores)

0.9514618413336684

In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [28]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(40, 32)
        self.fc2 = nn.Linear(32, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)

        out = F.sigmoid(x)
        return out

In [29]:
torch.manual_seed(0)
model = Net()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(),lr=0.01)
number_of_epochs = 4000
for epoch in range(number_of_epochs):
    Y_pred = model(torch.Tensor(X_train))
    loss = criterion(Y_pred, torch.Tensor(Y_train).view(-1, 1))
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    if (epoch+1)%200 == 0:
        Y_pred = model(torch.Tensor(X_test))
        test_loss = criterion(Y_pred, torch.Tensor(Y_test).view(-1, 1))
        print(f'epoch: {epoch+1}, train_loss={loss.item()}, test_loss={test_loss}')



epoch: 200, train_loss=0.45080432295799255, test_loss=0.44894078373908997
epoch: 400, train_loss=0.4041898250579834, test_loss=0.4051772952079773
epoch: 600, train_loss=0.37194156646728516, test_loss=0.3764004111289978
epoch: 800, train_loss=0.3476908206939697, test_loss=0.3561490774154663
epoch: 1000, train_loss=0.3438173532485962, test_loss=0.3566409945487976
epoch: 1200, train_loss=0.32735317945480347, test_loss=0.34268078207969666
epoch: 1400, train_loss=0.3138446807861328, test_loss=0.33203232288360596
epoch: 1600, train_loss=0.30566471815109253, test_loss=0.3268688917160034
epoch: 1800, train_loss=0.30024221539497375, test_loss=0.3237744867801666
epoch: 2000, train_loss=0.2945757508277893, test_loss=0.3199467360973358
epoch: 2200, train_loss=0.28886693716049194, test_loss=0.31524229049682617
epoch: 2400, train_loss=0.28261449933052063, test_loss=0.3112972676753998
epoch: 2600, train_loss=0.2795086205005646, test_loss=0.3095717132091522
epoch: 2800, train_loss=0.27569580078125, te

In [30]:
scores = model(torch.Tensor(X_test)).view(-1).detach().numpy()

In [31]:
roc_auc_score(Y_test, scores)

0.9444165200636676

In [32]:
clf = RandomForestClassifier(
    max_depth=4, 
    random_state=0,
    n_estimators=500,
    max_features='auto'
).fit(X_train, Y_train)
scores = clf.predict_proba(X_test)[:,1].reshape(-1)

In [33]:
roc_auc_score(Y_test, scores)

0.9532336432939599