In [25]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn import model_selection
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Data Handling

In [26]:
# Read the star-classification CSV and encode the textual `class` label as an integer:
# "GALAXY" -> 0, "STAR" -> 1, every other value -> 2.
df = pd.read_csv("data/star_classification.csv")
df["class"] = df["class"].map({"GALAXY": 0, "STAR": 1}).fillna(2).astype("int64")

In [27]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
0,1.237661e+18,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,3606,301,2,79,6.543777e+18,0,0.634794,5812,56354,171
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014e+19,0,0.779136,10445,58158,427
2,1.237661e+18,142.18879,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.1522e+18,0,0.644195,4576,55592,299
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,4192,301,3,214,1.030107e+19,0,0.932346,9149,58039,775
4,1.23768e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891865e+18,0,0.116123,6121,56187,842


## Dealing with duplicates & irrelevant observations

In [28]:
# Drop exact duplicate rows (keeping the first occurrence), then list any rows
# still flagged as duplicates -- an empty frame means none remain.
df = df.drop_duplicates(keep="first")
df.loc[df.duplicated(keep="first")]

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID


In [29]:
# Count the missing values in each column.
df.isna().sum()

obj_ID         0
alpha          0
delta          0
u              0
g              0
r              0
i              0
z              0
run_ID         0
rerun_ID       0
cam_col        0
field_ID       0
spec_obj_ID    0
class          0
redshift       0
plate          0
MJD            0
fiber_ID       0
dtype: int64

In [30]:
# Score every row with a Local Outlier Factor model (default hyper-parameters).
# NOTE(review): the model is fitted on every column of `df`, including the
# identifier columns and the target `class`, without scaling -- confirm this is intended.
clf = LocalOutlierFactor()
y_pred = clf.fit_predict(df)
x_score = clf.negative_outlier_factor_
# Key the scores by df's own index labels. The labels collected from this frame
# are later passed to `df.drop(...)`, which matches on labels, so a fresh 0..n-1
# index would point at the wrong rows whenever df's index has gaps
# (e.g. after drop_duplicates() has removed rows).
outlier_score = pd.DataFrame({"score": x_score}, index=df.index)

In [31]:
# Flag the rows whose negative outlier factor is below -1.5, collect their
# index labels, and show how many rows were flagged.
filter_outlier = outlier_score["score"].lt(-1.5)
outlier_index = outlier_score.index[filter_outlier.to_numpy()].tolist()
len(outlier_index)

15256

In [32]:
# Remove the flagged rows (matched by index label) and preview the result.
# The index is not reset, so the removed labels appear as gaps.
df = df.drop(index=outlier_index)
df.head()

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
0,1.237661e+18,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,3606,301,2,79,6.543777e+18,0,0.634794,5812,56354,171
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014e+19,0,0.779136,10445,58158,427
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,4192,301,3,214,1.030107e+19,0,0.932346,9149,58039,775
4,1.23768e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891865e+18,0,0.116123,6121,56187,842
6,1.237679e+18,23.234926,11.418188,21.46973,21.17624,20.92829,20.60826,20.42573,7773,301,2,462,1.246262e+19,2,0.586455,11069,58456,113


## Feature Selection

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df.
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(df.corr(), annot=True, fmt=".1f", linewidths=0.7, ax=ax)
plt.show()

In [None]:
# Correlation of every column with the target `class`, sorted in ascending order.
correlation = df.corr()
correlation.loc[:, "class"].sort_values(ascending=True)

In [None]:
# Discard the columns judged uninformative for predicting `class`: per the
# correlation ranking, those whose correlation with `class` lies between
# -0.5 and 0.5, or is NaN.
df = df.drop(columns=["field_ID", "run_ID", "obj_ID", "alpha", "cam_col", "fiber_ID", "delta", "rerun_ID"])

In [None]:
# Preview the frame after the column drop.
df.head()

## Data Imbalance

In [None]:
# Features `x`: every column except `class`. Target `y`: the `class` values as a NumPy array.
x = df.drop(columns=["class"])
y = df["class"].to_numpy()

In [None]:
# Balance the classes with SMOTE, which synthesizes new minority-class samples
# (it does not simply duplicate existing rows); print the class counts before and after.
# NOTE(review): resampling is done before the train/test split further down, so
# synthetic samples can end up on both sides of the split -- confirm this is acceptable.
sm = SMOTE(random_state=39)
print(f"Original Dataset Shape {Counter(y)}")
x, y = sm.fit_resample(x, y)
print(f"Resampled Dataset Shape {Counter(y)}")

In [None]:
# Bar chart of the number of samples per class after resampling.
# The labels are passed as `x=` explicitly: current seaborn releases interpret the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x=y)
plt.title("Class")
plt.show()

## PCA

In [None]:
# Standardize every feature of `x` to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test split.
scaler = StandardScaler().fit(x)
x = scaler.transform(x)

In [None]:
# Project the standardized features onto the principal components that together
# explain 95% of the variance, then attach the class labels to build `final_DF`.
y = pd.DataFrame(y, columns=["class"])
pca = PCA(0.95)
p_Components = pca.fit_transform(x)
# PCA(0.95) decides the number of components from the data, so the column names
# are generated from the actual component count instead of assuming exactly four.
pc_columns = [f"Principal Components {i}" for i in range(1, p_Components.shape[1] + 1)]
p_DF = pd.DataFrame(data=p_Components, columns=pc_columns)
final_DF = pd.concat([p_DF, y[["class"]]], axis=1)
final_DF

## Train & Test Data Split

In [None]:
# Hold out 25% of the PCA-transformed samples for testing (seeded split),
# then report the shape of each piece.
x_train, x_test, y_train, y_test = train_test_split(p_DF, y, random_state=42, test_size=0.25)
print(f"x_train: {x_train.shape}")
print(f"x_test: {x_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test: {y_test.shape}")

# Modelling

## Logistic Regression Model

In [None]:
# Multiclass logistic regression with the lbfgs solver.
# The `multi_class` argument is deprecated in recent scikit-learn releases; with
# lbfgs the default already uses the multinomial formulation for multiclass targets.
lr = LogisticRegression(solver='lbfgs')
# y_train is a single-column frame; flatten it to the 1-D target scikit-learn
# expects (avoids a DataConversionWarning).
lr.fit(x_train, np.ravel(y_train))

In [None]:
# Predict the class of every held-out sample with the logistic-regression model
# and display the predictions.
y_predicted = lr.predict(x_test)
y_predicted

In [None]:
# Evaluate the logistic-regression model: confusion matrix with the true labels
# as rows and the predicted labels as columns.
# (`confusion_matrix` is imported in the notebook's top import cell.)
cm = confusion_matrix(y_test, y_predicted)
print(cm)

In [None]:
# Fraction of held-out samples the logistic-regression model classified correctly.
# (`accuracy_score` is imported in the notebook's top import cell.)
print(accuracy_score(y_test, y_predicted))

## Support Vector Machine (SVM) Model

In [None]:
# RBF-kernel support vector classifier with C=4.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt; kernel
# SVC training is presumably too slow on the full resampled training set --
# consider a subsample or a linear SVM, and confirm before relying on `svm` below.
svm = SVC(kernel="rbf", C=4)
# y_train is a single-column frame; flatten it to the 1-D target scikit-learn expects.
svm.fit(x_train, np.ravel(y_train))

KeyboardInterrupt: 

In [None]:
# Predict the held-out samples with the SVM and compute its accuracy.
# Arguments follow the (y_true, y_pred) order used for the logistic-regression
# score; accuracy is symmetric, so the value itself is unchanged.
svm_pred = svm.predict(x_test)
accuracy_score(y_test, svm_pred)

## Random Forest Model

# Ensemble Model

In [None]:
# 5-fold cross-validation splitter. The shuffle is seeded so the folds, and
# therefore the reported score, are the same on every run (42 matches the seed
# used for the train/test split).
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
# (name, estimator) pairs that make up the voting ensemble.
estimators = [("logistic", lr), ("svm", svm)]

In [None]:
# Majority-vote ensemble of the listed estimators, scored with cross-validation
# on the training split only.
# NOTE(review): with just two voters every disagreement is a tie -- check how
# VotingClassifier breaks ties before interpreting the score.
ensemble_model = VotingClassifier(estimators, voting='hard')
# y_train is a single-column frame; flatten it to the 1-D target scikit-learn expects.
results = model_selection.cross_val_score(ensemble_model, x_train, np.ravel(y_train), cv=kfold, n_jobs=1)

In [None]:
# Average the per-fold scores into a single accuracy figure and report it.
ENSM_score = results.mean()
print("Accuracy: {}".format(ENSM_score))