In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from IPython.display import clear_output
from six.moves import urllib
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn import decomposition
from sklearn import linear_model
import seaborn as sns

print("Imports Ok")

Imports Ok


In [2]:
# Import Data
cols = ["fLength", "fWidth", "fSize", "fConc", "fConc1", "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist", "class"]
df = pd.read_csv("magic04.data", names=cols)

In [3]:
# Exclude useless data
#df = df.drop(columns=[ "fSize", "fConc", "fConc1", "fDist", "fM3Trans"])

In [4]:
def init_features(dataframe):
    for column in dataframe.columns:
        if not pd.api.types.is_numeric_dtype(dataframe[column]):
            voc = dataframe[column].unique()
            voc.sort()
            mapping = {val: i for i, val in enumerate(voc)}
            dataframe[column] = dataframe[column].map(mapping)
                

init_features(df)

In [5]:
#plot data
def plotfeatclass(dataframe, classname):
    for cl in classname:
        for label in df.columns[:-1]:
            fig, (ax1, ax2) = plt.subplots(1, 2)
            ax1.hist(df[df[cl] == 1][label], color='blue', label='gamma')
            ax2.hist(df[df[cl] == 0][label], color='red', label='hadron')
            plt.title(label)
            plt.show()

#plotfeatclass(df, ["class"])

In [6]:

def ScaleDataframe(x_frame, ranges=(0,1)):
    numeric = x_frame.select_dtypes(include=[np.number])
    not_binary_cols = [col for col in numeric.columns if not set(numeric[col].dropna().unique()).issubset({0, 1})]
    binary = numeric.drop(columns=not_binary_cols)
    
    scaler = MinMaxScaler(feature_range=ranges)
    scaled = scaler.fit_transform(numeric[not_binary_cols])
    scaled_df = pd.DataFrame(scaled, columns=not_binary_cols, index=x_frame.index)
    
    final_df = pd.concat([scaled_df, binary], axis=1)

    return final_df


df_scaled = ScaleDataframe(df,(0,1))
df_scaled.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,0.074306,0.062415,0.208043,0.43039,0.293229,0.470032,0.620576,0.512493,0.445467,0.163066,0
1,0.082815,0.045727,0.170668,0.587794,0.558601,0.468649,0.623756,0.507944,0.070677,0.412679,0
2,0.478241,0.530579,0.626818,0.027617,0.027263,0.556215,0.468201,0.41654,0.855111,0.516926,0
3,0.059212,0.037338,0.117445,0.683714,0.580679,0.469558,0.57063,0.515219,0.1161,0.233582,0
4,0.214774,0.120603,0.360674,0.345153,0.271003,0.43787,0.63205,0.590373,0.051644,0.718582,0


In [7]:
# Split Data
train, temp = train_test_split(df_scaled, test_size=0.4, random_state=42)
valid, test = train_test_split(temp, test_size=0.5, random_state=42)

def splitdata(dataframe, Oversampling=False):
    X = dataframe[dataframe.columns[:-1]].values
    y = dataframe[dataframe.columns[-1]].values

    if Oversampling:
        ros = RandomOverSampler()
        X, y = ros.fit_resample(X, y)

    data = np.hstack((X, np.reshape(y, (-1, 1))))

    return data, X, y

train, X_train, y_train = splitdata(train, True)
valid, X_valid, y_valid = splitdata(valid)
test, X_test, y_test = splitdata(test)

In [8]:
pca = decomposition.PCA(n_components=10)
pca_data = pca.fit_transform(X_train)

knn_model = KNeighborsClassifier(n_neighbors=55, algorithm='auto', weights='distance', p = 1 )
knn_model.fit(X_train, y_train)
print(f"Accuracy : {knn_model.score(X_test, y_test)}")

Accuracy : 0.852260778128286


In [9]:
# logistic Regression
log_model = linear_model.LogisticRegression()
log_model.fit(X_train, y_train)
print(f"Accuracy : {log_model.score(X_test, y_test)}")

Accuracy : 0.7941640378548895


In [10]:
# SVM model
from sklearn.svm import SVC
svm_model = SVC()
svm_model.fit(X_train, y_train)
print(f"Accuracy : {svm_model.score(X_test, y_test)}")

Accuracy : 0.8622502628811777


In [31]:
# NN model
import tensorflow as tf

nn_model = tf.keras.Sequential([
    tf.keras.layers.Dense(64,activation='relu',input_shape=(len(pd.array(X_train[-1,:])),)),
    tf.keras.layers.Dense(32,activation='relu'),
    tf.keras.layers.Dense(32,activation='relu'),
    tf.keras.layers.Dense(1,activation='sigmoid')
])

nn_model.compile(optimizer=tf.keras.optimizers.Adam(0.002), loss='binary_crossentropy', metrics=['accuracy'])

history = nn_model.fit(X_train, y_train, epochs=120, validation_split=0.5)

Epoch 1/120
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7237 - loss: 0.5500 - val_accuracy: 0.7158 - val_loss: 0.5335
Epoch 2/120
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8202 - loss: 0.4049 - val_accuracy: 0.6601 - val_loss: 0.6639
Epoch 3/120
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8183 - loss: 0.4068 - val_accuracy: 0.7980 - val_loss: 0.4232
Epoch 4/120
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8396 - loss: 0.3680 - val_accuracy: 0.7009 - val_loss: 0.5971
Epoch 5/120
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8353 - loss: 0.3712 - val_accuracy: 0.7938 - val_loss: 0.4394
Epoch 6/120
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8396 - loss: 0.3635 - val_accuracy: 0.8106 - val_loss: 0.4100
Epoch 7/120
[1m232/23

In [32]:
loss, acc = nn_model.evaluate(X_test, y_test, verbose=1)

[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 856us/step - accuracy: 0.8783 - loss: 0.3183
