## Magic Gamma Telescope Dataset

In [1]:
# importing libraries
import pandas as pd
import numpy as np

In [2]:
# importing libraries for plotting
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# importing libraries for preprocessing (feature scaling) and class re-balancing (over-/under-sampling)
from sklearn import neighbors
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# importing libraries for train/test splitting, cross-validation and evaluation metrics
from sklearn import metrics 
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix

## Dataset

In [None]:
# accessing the data

# Column names for the data file; the last entry ("class") is the target label.
# The third feature is "fSize" (the previous spelling "fSzie" was a typo).
cols = ["fLength", "fWidth", "fSize", "fConc", "fConc1", "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist", "class"]

# names= is supplied without header=, so pandas treats every line of the file
# as data and labels the columns with cols
tel_df = pd.read_csv("./data/magic04.data", names = cols)

In [None]:
# Display the first five rows as a quick sanity check that the file loaded
# and the column names line up with the values
tel_df.head()

In [None]:
# Print the whole frame as text; to_string() renders every row without
# truncation, so expect very long output
print(tel_df.to_string())

In [None]:
# Distinct label values present in the class column
tel_df["class"].unique()

In [None]:
# displaying the number of gamma instances (rows labelled "g")
print((tel_df["class"] == "g").sum())

In [None]:
# displaying the number of hadron instances (rows labelled "h")
print((tel_df["class"] == "h").sum())

In [None]:
# encode the label as an integer: "g" -> 1, every other value -> 0
tel_df["class"] = tel_df["class"].eq("g").astype(int)

In [None]:
# Re-inspect the first rows after the class column was converted to integers
tel_df.head()

In [None]:
# Pie chart of the class balance.
# Labels are derived from the value_counts() index instead of a fixed positional
# list: value_counts() orders by frequency, so a hard-coded ["Gamma", "Hadron"]
# is only correct while gamma (1) happens to be the more frequent class.
class_names = {1: "Gamma", 0: "Hadron"}
class_counts = tel_df["class"].value_counts()
labels = [class_names[code] for code in class_counts.index]
class_counts.plot(kind = "pie", autopct = "%1.2f%%", labels = labels)
plt.title("Pie distribution of Gamma and Hadron")
plt.show()

In [None]:
# checking for duplicates: boolean mask with one entry per row, True where the
# row repeats an earlier row (pandas' default keep="first")
tel_df_duplicates = tel_df.duplicated()

In [None]:
# show only the rows flagged by the duplicate mask
print(tel_df.loc[tel_df_duplicates])

In [None]:
# removing duplicates: drop_duplicates() returns a new frame, so rebind tel_df;
# the original row index is kept (not renumbered)
tel_df = tel_df.drop_duplicates()

In [None]:
# (rows, columns) remaining after duplicate removal
tel_df.shape

In [None]:
# One figure per feature column (every entry of cols except the trailing "class"),
# overlaying density-normalised histograms of the two classes.
gamma_rows = tel_df[tel_df["class"] == 1]
hadron_rows = tel_df[tel_df["class"] == 0]
series_styles = ((gamma_rows, "blue", "gamma"), (hadron_rows, "purple", "hadron"))

for label in cols[:-1]:
    for rows, colour, legend_name in series_styles:
        plt.hist(rows[label], color = colour, label = legend_name, alpha = 0.7, density = True)
    plt.title(label)
    plt.ylabel("Probability")
    plt.xlabel(label)
    plt.legend()
    plt.show()
    

In [None]:
# separate the predictors from the label
y = tel_df["class"]                     # target vector
x = tel_df.drop(columns = ["class"])    # feature matrix (every column except the label)

In [None]:
# feature-matrix shape, then target-vector shape, one per line
print(x.shape, y.shape, sep = "\n")

## Normalizing the Dataset

In [None]:
# create a MinMaxScaler object
scaler = MinMaxScaler()

# Scale only the feature matrix and write the result back to x, so the models
# trained below actually receive normalized features. x and y were extracted
# from tel_df before this cell, so scaling tel_df alone never reached them,
# and the class label does not need scaling.
# NOTE(review): the scaler is fit on all rows before the train/test split;
# fitting on the training rows only would avoid leaking test-set ranges.
x = pd.DataFrame(scaler.fit_transform(x), columns = x.columns, index = x.index)

# Scaled features plus the untouched class column, kept as a DataFrame
normalized_data = x.join(y)

In [None]:
# normalized_data may be a bare NumPy array (that is what a scaler's
# fit_transform returns by default); wrap it so tel_df stays a DataFrame with
# its column names and row index instead of silently becoming an ndarray
tel_df = pd.DataFrame(normalized_data, columns = tel_df.columns, index = tel_df.index)

## Random Undersampling

In [None]:
# Balance the classes by randomly discarding majority-class rows.
# random_state is fixed so the sampled subset (and every metric computed from
# it) is reproducible, using the same seed as train_test_split below.
# NOTE(review): resampling happens before the train/test split, so the test
# set is resampled too and no longer reflects the original class ratio.
rus = RandomUnderSampler(random_state = 42)
x, y = rus.fit_resample(x, y)
ax = y.value_counts().plot.pie(autopct = '%.2f')
_ = ax.set_title("Under-sampling")

In [None]:
# Rows per class after under-sampling
y.value_counts()

## Random Oversampling

In [None]:
# Balance the classes by randomly duplicating minority-class rows.
# random_state is fixed so the result is reproducible, using the same seed as
# train_test_split below.
# NOTE(review): this runs on the x, y already produced by the under-sampling
# cell; if those are already balanced this step presumably adds no rows -
# confirm whether both resamplers are meant to be applied in sequence.
# NOTE(review): over-sampling before the train/test split can place copies of
# the same row in both the training and the test set.
ros = RandomOverSampler(random_state = 42)
x, y = ros.fit_resample(x, y)
ax = y.value_counts().plot.pie(autopct = '%.2f')
_ = ax.set_title("Over-sampling")

In [None]:
# Rows per class after over-sampling
y.value_counts()

## Train and test datasets

In [None]:
# 70 % / 30 % train/test partition with a fixed seed so the split is repeatable
_parts = train_test_split(x, y, train_size = 0.7, test_size = 0.3, random_state = 42)
x_train, x_test, y_train, y_test = _parts
# display the shape of each of the four pieces
tuple(piece.shape for piece in _parts)

## Training the model

## K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier that votes among the 3 closest training points
knn_model = KNeighborsClassifier(n_neighbors = 3)
# fit on the training split; as the last expression of the cell, the fitted
# estimator is also displayed
knn_model.fit(x_train, y_train)

In [None]:
# predicted class labels for the held-out test features (KNN)
y_pred = knn_model.predict(x_test)

In [None]:
# per-class precision / recall / F1 summary for the KNN predictions
print(classification_report(y_test, y_pred))

In [None]:
# Score the KNN predictions against the true test labels
knn_model_accuracy_score = accuracy_score(y_test, y_pred)
knn_model_precision_score = precision_score(y_test, y_pred)
knn_model_recall_score = recall_score(y_test, y_pred)
knn_model_f1_score = f1_score(y_test, y_pred)

# Report each metric as a whole-number percentage
print(f"Accuracy of KNN Model:{knn_model_accuracy_score * 100:.0f}%\n")
print(f"Precision of KNN Model:{knn_model_precision_score * 100:.0f}%\n")
print(f"Recall of KNN Model:{knn_model_recall_score * 100:.0f}%\n")
print(f"F1 Score of KNN Model:{knn_model_f1_score * 100:.0f}%")

## Support Vector Machine Classifier

In [None]:
from sklearn.svm import SVC

In [None]:
# training the SVM model with scikit-learn's default SVC settings
svm_model = SVC()
# fit on the training split; as the last expression of the cell, the fitted
# estimator is also displayed
svm_model.fit(x_train, y_train)

In [None]:
# predicted class labels for the test features (SVM); overwrites the previous y_pred
y_pred = svm_model.predict(x_test)

In [None]:
# per-class precision / recall / F1 summary for the SVM predictions
print(classification_report(y_test, y_pred))

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with default hyper-parameters.
# random_state is fixed (same seed as train_test_split) because the forest's
# bootstrap and feature sampling are random; without a seed the reported
# metrics change from run to run.
rf_model = RandomForestClassifier(random_state = 42)
rf_model = rf_model.fit(x_train, y_train)

In [None]:
# predicted class labels for the test features (random forest); overwrites the previous y_pred
y_pred = rf_model.predict(x_test)

In [None]:
# per-class precision / recall / F1 summary for the random forest predictions
print(classification_report(y_test, y_pred))

## Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree with default hyper-parameters.
# random_state is fixed (same seed as train_test_split) so that the tree that
# is built, and therefore the reported metrics, are reproducible across runs.
dt_model = DecisionTreeClassifier(random_state = 42)
dt_model = dt_model.fit(x_train, y_train)

In [None]:
# predicted class labels for the test features (decision tree); overwrites the previous y_pred
y_pred = dt_model.predict(x_test)

In [None]:
# per-class precision / recall / F1 summary for the decision tree predictions
print(classification_report(y_test, y_pred))

## Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian naive Bayes with default settings; fit() returns the estimator
# itself, so construction and training are chained into one assignment
nb_model = GaussianNB().fit(x_train, y_train)

In [None]:
# predicted class labels for the test features (naive Bayes); overwrites the previous y_pred
y_pred = nb_model.predict(x_test)

In [None]:
# per-class precision / recall / F1 summary for the naive Bayes predictions
print(classification_report(y_test, y_pred))

## Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting with default hyper-parameters.
# random_state is fixed (same seed as train_test_split) so the fitted ensemble
# and the reported metrics are reproducible across runs.
gb_model = GradientBoostingClassifier(random_state = 42)
gb_model = gb_model.fit(x_train, y_train)

In [None]:
# predicted class labels for the test features (gradient boosting); overwrites the previous y_pred
y_pred = gb_model.predict(x_test)

In [None]:
# per-class precision / recall / F1 summary for the gradient boosting predictions
print(classification_report(y_test, y_pred))

In [None]:
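# NOTE: disabled confusion-matrix heatmap. The names and title below refer to
# "Logistic Regression", but no logistic regression model is trained in the cells
# above; at this point y_pred holds the Gradient Boosting predictions. Update the
# variable name and title before re-enabling.
# lg_cm = metrics.confusion_matrix(y_test, y_pred)
# cmap = "prism"
# f, ax = plt.subplots(figsize = (5,5))
# sns.heatmap(lg_cm, annot = True, linewidth = 1, fmt = ".0f", ax = ax, cmap = cmap)
# plt.title("Confusion Matrix of Logistic Regression\n")
# plt.xlabel("Predicted label")
# plt.ylabel("True label")
# plt.show()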