In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import scipy as sc
from imblearn.over_sampling import RandomOverSampler

## Loading Dataset:
Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.

Donated by:
P. Savicky
Institute of Computer Science, AS of CR
Czech Republic
savicky '@' cs.cas.cz

In [41]:
# Column names for the headerless data file: ten features followed by the label.
cols = [
    "fLength", "fWidth", "fSize", "fConc", "fConc1",
    "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist",
    "class",
]
df = pd.read_csv("./magic+gamma+telescope/magic04.data", names=cols)

# Preview the first rows whose label is anything other than 'g'.
not_gamma = df["class"] != 'g'
df[not_gamma].head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
12332,93.7035,37.9432,3.1454,0.168,0.1011,53.2566,89.0566,11.8175,14.1224,231.9028,h
12333,102.0005,22.0017,3.3161,0.1064,0.0724,-54.0862,43.0553,-15.0647,88.4636,274.9392,h
12334,100.2775,21.8784,3.11,0.312,0.1446,-48.1834,57.6547,-9.6341,20.7848,346.433,h
12335,91.6558,18.8293,2.7097,0.4386,0.342,-52.6841,-97.8373,-17.0955,63.8834,130.7998,h
12336,38.0195,12.6736,2.8747,0.4084,0.1928,-51.484,8.3471,7.962,24.5402,163.8674,h


In [42]:
# Encode the label as an integer: 1 where the class is 'g', 0 otherwise.
# The dtype guard makes this cell safe to re-run: once the column is numeric,
# comparing it to 'g' again would be False everywhere and silently zero every label.
if not pd.api.types.is_numeric_dtype(df["class"]):
    df["class"] = (df["class"] == 'g').astype(int)

## Exploring Dataset:

In [22]:
# One figure per feature column (everything in `cols` except the trailing label),
# overlaying the normalised distribution of class 1 and class 0.
gamma_rows = df[df["class"] == 1]
hadron_rows = df[df["class"] == 0]
shared_hist_kwargs = dict(bins=100, alpha=0.7, density=True)

for feature in cols[:-1]:
    plt.hist(gamma_rows[feature], label='gamma', color='blue', **shared_hist_kwargs)
    plt.hist(hadron_rows[feature], label='hadron', color='red', **shared_hist_kwargs)
    plt.title(feature)
    plt.ylabel("Probability")
    plt.xlabel(feature)
    plt.legend()
    plt.show()

## Train, Validation and Test Datasets

In [52]:
# Shuffle the rows once, then cut at the 60% and 80% marks to obtain
# train / validation / test pieces (60/20/20).
# Positional slicing keeps every piece a DataFrame without routing through
# np.split, which relies on the deprecated DataFrame.swapaxes.
# The fixed seed makes the split reproducible across kernel restarts.
SPLIT_SEED = 42
shuffled_df = df.sample(frac=1, random_state=SPLIT_SEED)
train_end = int(0.6 * len(df))
validation_end = int(0.8 * len(df))

train = shuffled_df.iloc[:train_end]
validation = shuffled_df.iloc[train_end:validation_end]
test = shuffled_df.iloc[validation_end:]

In [53]:
def scale_dataset(dataframe, oversample=False, scaler=None, random_state=None):
    """Scale the feature columns of ``dataframe`` and optionally oversample it.

    Every column except the last is treated as a feature; the last column is
    treated as the label.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Features in all but the last column, label in the last column.
    oversample : bool, default False
        When True, the scaled data is resampled with ``RandomOverSampler``.
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler. When given it is applied as-is, so a
        validation or test split can be scaled with the statistics learned
        from the training split. When omitted, a new ``StandardScaler`` is
        fitted on ``dataframe`` itself (the original behaviour).
    random_state : int or None, default None
        Forwarded to ``RandomOverSampler`` so the resampling can be repeated.

    Returns
    -------
    data : numpy.ndarray
        ``X`` with ``y`` appended as the final column.
    X : numpy.ndarray
        Scaled (and possibly resampled) features.
    y : numpy.ndarray
        Labels matching the rows of ``X``.
    """
    X = dataframe.iloc[:, :-1].values
    y = dataframe.iloc[:, -1].values

    if scaler is None:
        # No scaler supplied: fit a fresh one on this very split.
        X = StandardScaler().fit_transform(X)
    else:
        # Reuse the caller's fitted scaler; never refit on this split.
        X = scaler.transform(X)

    if oversample:
        ros = RandomOverSampler(random_state=random_state)
        X, y = ros.fit_resample(X, y)

    # Append the labels as one extra column to the right of the features.
    data = np.hstack((X, np.reshape(y, (-1, 1))))

    return data, X, y
    

In [54]:
# Turn each split into its scaled array form plus separate feature / label arrays.
# Only the training split is oversampled; validation and test are left unresampled.
# NOTE: each call fits its own scaler on the split it is given.
# NOTE: train / validation / test are rebound from DataFrames to arrays here, and
# scale_dataset indexes its input with .iloc, so re-run the split cell before
# running this cell a second time.
train, x_train, y_train = scale_dataset(train, oversample=True)
validation, x_validation, y_validation = scale_dataset(validation)
test, x_test, y_test = scale_dataset(test)