<a href="https://colab.research.google.com/github/TanishaSharma01/MAGIC-gamma-hadron-classification/blob/main/MAGIC__Telescope_ML_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

## Dataset:
Bock, R. (2004). MAGIC Gamma Telescope [Dataset]. UCI Machine Learning Repository. https://doi.org/10.24432/C52C8B.

In [None]:
# Column names for the data file: ten image features followed by the label
cols = [
    "fLength", "fWidth", "fSize", "fConc", "fConc1",
    "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist",
    "class",
]

# Load the dataset, supplying the column names explicitly
df = pd.read_csv("magic04.data", names=cols)

# Preview the first five rows
df.head()

In [None]:
# 'g' is for gamma and 'h' is for hadron.
# Encode the label column as an integer: gamma -> 1, anything else (hadron) -> 0.
#
# The guard keeps this cell idempotent: on a second run the column is already
# numeric, and comparing integers with "g" would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df["class"]):
  # unique raw labels in column "class", shown before they are replaced
  print(df["class"].unique())
  df["class"] = (df["class"] == "g").astype(int)

In [None]:
df.head()

In [None]:
# One overlaid, density-normalised histogram per feature (every column except the label).
for label in cols[:-1]:
  # Compute the bin edges once from the whole column so that both classes are
  # binned identically; otherwise each call picks its own range and the two
  # sets of bars are not directly comparable.
  bin_edges = np.histogram_bin_edges(df[label], bins=10)
  plt.hist(df[df["class"]==1][label], bins=bin_edges, color='blue', label='gamma', alpha=0.7, density=True)
  plt.hist(df[df["class"]==0][label], bins=bin_edges, color='red', label='hadron', alpha=0.7, density=True)
  plt.title(label)
  plt.ylabel("Probability")
  plt.xlabel(label)
  plt.legend()
  plt.show()

# Train, Validation and Test Data Sets

In [None]:
# Shuffle the rows, then cut them into 60% train / 20% validation / 20% test.
# A fixed seed makes the shuffle (and therefore every downstream score)
# reproducible across "Restart & Run All".
RANDOM_SEED = 42
shuffled_rows = df.sample(frac=1, random_state=RANDOM_SEED).values

# 0.6 marks the end of the training rows, 0.8 marks the end of the validation rows
train_end = int(0.6 * len(df))
valid_end = int(0.8 * len(df))

train, valid, test = [
    pd.DataFrame(part, columns=df.columns)
    for part in np.split(shuffled_rows, [train_end, valid_end])
]

In [None]:
def scale_dataset(dataframe, oversample = False, scaler = None, random_state = None):
  """Standardize the features of ``dataframe`` and optionally oversample it.

  The last column of ``dataframe`` is treated as the target; every other
  column is treated as a feature.

  Parameters
  ----------
  dataframe : pandas.DataFrame
    Features in all columns but the last, target in the last column.
  oversample : bool, default False
    When True, the scaled features and targets are resampled with
    ``RandomOverSampler`` so the classes are balanced.
  scaler : object with a ``transform`` method, optional
    An already-fitted scaler. When given, it is only applied (never refit),
    which lets validation and test data be scaled with the statistics of the
    training data. When omitted, a new ``StandardScaler`` is fit on
    ``dataframe`` itself, as before.
  random_state : optional
    Forwarded to ``RandomOverSampler`` so the oversampling can be made
    reproducible. Ignored when ``oversample`` is False.

  Returns
  -------
  data : numpy.ndarray
    Scaled features with the target appended as the last column.
  X : numpy.ndarray
    Scaled (and possibly oversampled) feature matrix.
  y : numpy.ndarray
    Target vector matching the rows of ``X``.
  """
  # inputs: every column except the last
  X = dataframe[dataframe.columns[:-1]].values

  # output/target: the last column
  y = dataframe[dataframe.columns[-1]].values

  if scaler is None:
    # fit() learns each feature's mean and standard deviation,
    # transform() applies the standardization with those statistics
    X = StandardScaler().fit_transform(X)
  else:
    # reuse the caller's statistics instead of refitting on this split
    X = scaler.transform(X)

  if oversample:
    ros = RandomOverSampler(random_state=random_state)
    X, y = ros.fit_resample(X, y)

  # X is 2-D but y is 1-D, so y is reshaped into a column vector
  # before the two are stacked horizontally
  data = np.hstack((X, np.reshape(y, (-1, 1))))

  return data, X, y

In [None]:
# Class balance of the training split: gammas (1) outnumber hadrons (0)
n_gamma = int((train["class"] == 1).sum())
n_hadron = int((train["class"] == 0).sum())
print(n_gamma)
print(n_hadron)

In [None]:
# Scale every split; only the training split is oversampled so that gammas and
# hadrons are evenly represented during fitting.
# Note: train/valid/test are rebound from DataFrames to the stacked arrays
# returned by scale_dataset, so re-running this cell requires re-running the
# split cell first.
train, X_train, y_train = scale_dataset(train, oversample=True)
valid, X_valid, y_valid = scale_dataset(valid, oversample=False)
test, X_test, y_test = scale_dataset(test, oversample=False)

# Size of the rebalanced training set, then the per-class counts
print(len(y_train))
print(np.count_nonzero(y_train == 1))
print(np.count_nonzero(y_train == 0))

# kNN: K nearest neighbours

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [None]:
# k-nearest-neighbours classifier (k = 11) fitted on the scaled, oversampled training data
knn_model = KNeighborsClassifier(n_neighbors=11).fit(X_train, y_train)
knn_model

In [None]:
# Predict class labels for the test features with the fitted kNN model
y_pred = knn_model.predict(X_test)

Precision: Out of all the samples we predicted as a given class, how many actually belong to that class

Recall: Out of all the samples that actually belong to a given class, how many we predicted correctly

F1-score: the harmonic mean of precision and recall

Accuracy: the fraction of all samples that were classified correctly

In [None]:
# Per-class precision / recall / F1 and overall accuracy on the test split
knn_report = classification_report(y_test, y_pred)
print(knn_report)

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian Naive Bayes fitted on the scaled, oversampled training data
nb_model = GaussianNB().fit(X_train, y_train)

In [None]:
# Evaluate Naive Bayes on the test split
y_pred = nb_model.predict(X_test)
nb_report = classification_report(y_test, y_pred)
print(nb_report)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with default settings, fitted on the scaled, oversampled training data
lg_model = LogisticRegression().fit(X_train, y_train)

In [None]:
# Evaluate logistic regression on the test split
y_pred = lg_model.predict(X_test)
lg_report = classification_report(y_test, y_pred)
print(lg_report)

# Support Vector Machine (SVM)

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with default settings, fitted on the scaled, oversampled training data
svm_model = SVC().fit(X_train, y_train)

In [None]:
# Evaluate the SVM on the test split
y_pred = svm_model.predict(X_test)
svm_report = classification_report(y_test, y_pred)
print(svm_report)

In [None]:
# Neural Net

In [None]:
import tensorflow as tf