# CLASSIFICATION


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

### dataset:
Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.

Donated by: P. Savicky Institute of Computer Science, AS of CR Czech Republic savicky '@' cs.cas.cz

In [None]:
# Names for the columns of the data file; they are passed to read_csv explicitly via `names=`.
cols = [
    "fLength",
    "fWidth",
    "fSize",
    "fConc",
    "fConc1",
    "fAsym",
    "fM3Long",
    "fM3Trans",
    "fAlpha",
    "fDist",
    "class",
]
df = pd.read_csv("magic04.data", names=cols)
df.head()

Each row in the dataset is one data point.

Each data point has ten feature values (every column except `class`) and a class label.

In [None]:
# Encode the target as integers: "g" (gamma particles) -> 1, every other value
# (in this data set "h", hadron particles) -> 0.
# The guard makes the cell safe to re-run: once the column is already integer,
# comparing it with "g" again would silently turn every label into 0.
if not pd.api.types.is_integer_dtype(df["class"]):
  df["class"] = (df["class"] == "g").astype(int)

In [None]:
# Preview the frame now that the class column holds 0/1 integers.
# (This is a binary label encoding of a single column, not one-hot encoding.)
df.head()

Now plotting each feature as a histogram, split by class.



In [None]:
# One figure per feature column: overlaid, density-normalised histograms of the
# two classes (class 1 drawn in blue, class 0 in red).
gamma_rows = df[df["class"] == 1]
hadron_rows = df[df["class"] == 0]
for feature in cols[:-1]:
  plt.hist(gamma_rows[feature], color='blue', label='gamma', alpha=0.7, density=True)
  plt.hist(hadron_rows[feature], color='red', label='hadron', alpha=0.7, density=True)
  plt.title(feature)
  plt.xlabel(feature)
  plt.ylabel("Probability")
  plt.legend()
  plt.show()



### Train, validation, test datasets


In [None]:
# Shuffle the rows once with a fixed seed so the split is reproducible, then cut
# the shuffled frame 60% / 20% / 20% into train / validation / test.
# Positional slicing with .iloc is used instead of np.split, which relies on the
# deprecated DataFrame.swapaxes when handed a DataFrame.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(shuffled))
valid_end = int(0.8 * len(shuffled))
# The name `vaild` (sic) is kept because later cells refer to it.
train = shuffled.iloc[:train_end]
vaild = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [None]:
def scale_dataset(dataframe, oversample=False, scaler=None, random_state=None):
  """Standardize the feature columns of `dataframe` and optionally oversample it.

  Every column except the last is treated as a feature; the last column is
  treated as the target.

  Args:
    dataframe: pandas DataFrame whose last column holds the target values.
    oversample: when True, the scaled features and targets are passed through
      RandomOverSampler.fit_resample before being returned.
    scaler: optional, already fitted scaler object exposing `transform`
      (for example a StandardScaler fitted on the training features). When
      given, it is only applied, so validation/test data can be scaled with
      the training statistics. When None (the default) a new StandardScaler
      is fitted on `dataframe` itself, which is the original behaviour.
    random_state: forwarded to RandomOverSampler so the resampling can be
      made reproducible; None (the default) keeps the original behaviour.

  Returns:
    A tuple `(data, X, y)`: `X` is the scaled (and possibly resampled)
    feature matrix, `y` the matching targets, and `data` is `X` with `y`
    appended as a final column.
  """
  x = dataframe[dataframe.columns[:-1]].values
  y = dataframe[dataframe.columns[-1]].values

  if scaler is None:
    # No scaler supplied: fit a fresh one on this data.
    X = StandardScaler().fit_transform(x)
  else:
    # Reuse the caller's fitted scaler without refitting it.
    X = scaler.transform(x)

  if oversample:
    ros = RandomOverSampler(random_state=random_state)
    X, y = ros.fit_resample(X, y)

  # y is reshaped into a column so it can be stacked next to the features.
  data = np.hstack((X, np.reshape(y, (-1, 1))))

  return data, X, y


Number of items of each class in the training set:



In [None]:
# Row counts per class in the training split, before any oversampling.
gamma_train = train[train["class"] == 1]
hadron_train = train[train["class"] == 0]
print(len(gamma_train))  # gammas
print(len(hadron_train))  # hadrons

In [None]:
# Scale each split; only the training split is oversampled.
# NOTE(review): every scale_dataset call fits its own StandardScaler, so the
# three splits are standardized with different statistics - confirm this is intended.
# NOTE(review): `train`, `vaild` and `test` are rebound from DataFrames to arrays
# here, so this cell cannot be run a second time without re-running the split.
train, X_train, y_train = scale_dataset(dataframe=train, oversample=True)
vaild, X_vaild, y_vaild = scale_dataset(dataframe=vaild)
test, X_test, y_test = scale_dataset(dataframe=test)

### KNN model
- A new point takes the label held by the majority of its nearest neighbours.


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [None]:
# k-nearest-neighbours classifier voting over the 5 closest training points.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X=X_train, y=y_train)

In [None]:
# Predicted classes for the test features.
y_pred = knn_model.predict(X=X_test)

In [None]:
# Per-class precision / recall / f1 of the KNN predictions on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

### Naive Bayes
- Uses conditional probability (Bayes' rule).

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian naive Bayes, fitted on the (oversampled) training data.
nb_model = GaussianNB().fit(X_train, y_train)

In [None]:
# Evaluate on the held-out test split, like every other model in this notebook;
# scoring on the training data would overstate the model's performance.
y_pred = nb_model.predict(X_test)
print(classification_report(y_test, y_pred))

### Logistic regression
- Models the probability of the positive class by passing a linear combination of the features through a sigmoid function.

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with scikit-learn's default settings, fitted on the training data.
lg_model = LogisticRegression().fit(X_train, y_train)

In [None]:
# Score the logistic-regression model on the test split.
y_pred = lg_model.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

### Support Vector Machine
- Divides the data into two classes using a hyperplane.



In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with scikit-learn's default settings, fitted on the training data.
svm_model = SVC().fit(X_train, y_train)

In [None]:
# Score the SVM on the test split.
y_pred = svm_model.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

# Neural network
- Built with TensorFlow



In [None]:
import tensorflow as tf


- Plot of the loss and accuracy per epoch

In [None]:
def plot_history(history):
  """Plot training/validation loss and accuracy per epoch, side by side.

  Args:
    history: object whose `history` attribute is a mapping containing the
      keys "loss", "val_loss", "accuracy" and "val_accuracy", each a
      per-epoch sequence (the value returned by `fit` in `train_model`).
  """
  fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

  # Left panel: loss curves. The training curve is labelled 'loss'
  # (it was previously mislabelled 'accuracy').
  ax1.plot(history.history["loss"], label='loss')
  ax1.plot(history.history['val_loss'], label='val_loss')
  ax1.set_xlabel('epoch')
  ax1.set_ylabel('binary crossentropy')
  ax1.grid(True)
  ax1.legend()  # without a legend the curve labels are never displayed

  # Right panel: accuracy curves.
  ax2.plot(history.history["accuracy"], label='accuracy')
  ax2.plot(history.history['val_accuracy'], label='val_accuracy')
  ax2.set_xlabel('epoch')
  ax2.set_ylabel('accuracy')
  ax2.grid(True)
  ax2.legend()

  plt.show()

- Defining the neural network model

In [None]:
def train_model(X_train, y_train, num_nodes, dropout_prob, lr, batch_size, epochs):
  """Build, compile and fit a two-hidden-layer binary classifier.

  Args:
    X_train: 2-D feature array; its second dimension sets the input width.
    y_train: target values matching the rows of `X_train`.
    num_nodes: number of units in each of the two hidden Dense layers.
    dropout_prob: rate passed to the Dropout layer after each hidden layer.
    lr: learning rate passed to the Adam optimizer.
    batch_size: batch size passed to `fit`.
    epochs: number of epochs passed to `fit`.

  Returns:
    A tuple `(nn_model, history)`: the fitted Keras model and the object
    returned by `fit`.
  """
  # Derive the input width from the data instead of hard-coding 10, so the
  # function also works when the number of feature columns changes.
  num_features = X_train.shape[1]

  nn_model = tf.keras.Sequential([
      tf.keras.layers.Dense(num_nodes, activation='relu', input_shape=(num_features,)),
      tf.keras.layers.Dropout(dropout_prob),
      tf.keras.layers.Dense(num_nodes, activation='relu'),
      tf.keras.layers.Dropout(dropout_prob),
      # Single sigmoid unit: the output is a value between 0 and 1.
      tf.keras.layers.Dense(1, activation= 'sigmoid')
  ])

  nn_model.compile(optimizer=tf.keras.optimizers.Adam(lr),
                   loss='binary_crossentropy',
                   metrics = ['accuracy'])
  # NOTE(review): validation_split holds out a fraction of the data passed in
  # here; if X_train was oversampled, the held-out part may not be
  # representative - confirm against the Keras `fit` documentation.
  history = nn_model.fit(
      X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=0
  )

  return nn_model, history

In [None]:
# Exhaustive search over the hyper-parameter grid: train one network per
# combination and keep the one with the lowest loss on the validation split.
least_val_loss = float('inf')
least_loss_model = None
epochs = 100

# Flattened grid; the iteration order matches four nested loops
# (nodes -> dropout -> learning rate -> batch size).
param_grid = [
    (nodes, dropout, rate, batch)
    for nodes in [16,32,64]
    for dropout in [0,0.2]
    for rate in [0.01, 0.005, 0.001]
    for batch in [32,64,128]
]

for num_nodes, dropout_prob, lr, batch_size in param_grid:
  print(f"{num_nodes} nodes, dropout {dropout_prob}, lr {lr}, batch size {batch_size} ")
  model, history = train_model(
      X_train=X_train,
      y_train=y_train,
      num_nodes=num_nodes,
      dropout_prob=dropout_prob,
      lr=lr,
      batch_size=batch_size,
      epochs=epochs,
  )
  plot_history(history)
  # evaluate() returns [loss, accuracy] for this model; index 0 is the loss.
  val_loss = model.evaluate(X_vaild, y_vaild)[0]
  if val_loss < least_val_loss:
    least_val_loss, least_loss_model = val_loss, model

In [None]:
# Turn the best network's sigmoid outputs into hard 0/1 labels (threshold 0.5)
# and flatten them to a 1-D array.
y_prob = least_loss_model.predict(X_test)
y_pred = (y_prob > 0.5).astype(int).reshape(-1)

In [None]:
# Per-class precision / recall / f1 of the selected neural network on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))