In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Data Handling

In [None]:
# Read the star-classification CSV and replace the textual class label with an integer code:
# "GALAXY" -> 0, "STAR" -> 1, any other label -> 2.
df = pd.read_csv("data/star_classification.csv")
class_codes = {"GALAXY": 0, "STAR": 1}
df["class"] = df["class"].map(lambda label: class_codes.get(label, 2))

In [None]:
# Preview the first five rows to sanity-check the load and the encoded class column.
df.head(n=5)

## Dealing with duplicates & irrelevant observations

In [None]:
# Keep only the first occurrence of every fully duplicated row, then list any rows that are
# still flagged as duplicates (an empty frame means the clean-up worked).
df = df.drop_duplicates(keep="first")
df.loc[df.duplicated(keep="first")]

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Score every row with Local Outlier Factor (default settings).
# negative_outlier_factor_ is close to -1 for inliers and more negative for outliers.
clf = LocalOutlierFactor()
y_pred = clf.fit_predict(df)
x_score = clf.negative_outlier_factor_
# Index the scores by df's own row labels. After drop_duplicates() those labels can have
# gaps, so a default 0..n-1 index would not line up with df and the labels later handed
# to df.drop() could remove the wrong rows (or raise KeyError).
outlier_score = pd.DataFrame({"score": x_score}, index=df.index)

In [None]:
# Flag the rows whose LOF score is below the -1.5 cut-off, collect their index labels,
# and show how many rows were flagged.
filter_outlier = outlier_score["score"].lt(-1.5)
outlier_index = outlier_score.loc[filter_outlier].index.tolist()
len(outlier_index)

In [None]:
# Remove the flagged outlier rows and renumber the remaining rows 0..n-1.
# drop=True discards the old index; without it reset_index() inserts the old labels as an
# "index" column, which is not among the columns dropped during feature selection and
# would therefore be fed to SMOTE, PCA and the classifier as if it were a real feature.
df = df.drop(index=outlier_index).reset_index(drop=True)
df.head()

## Feature Selection

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(df.corr(), annot=True, fmt=".1f", linewidths=0.7, ax=ax)
plt.show()

In [None]:
# Correlation of every column with the target, sorted from most negative to most positive.
correlation = df.corr()
correlation.loc[:, "class"].sort_values(ascending=True)

In [None]:
# Based on the correlation table, drop the columns judged to carry no useful signal for
# `class`: those with only a weak correlation (the original note gives the band
# -0.5 .. 0.5) or a NaN correlation with the target.
df.drop(
    columns=[
        "field_ID",
        "run_ID",
        "obj_ID",
        "alpha",
        "cam_col",
        "fiber_ID",
        "delta",
        "rerun_ID",
    ],
    inplace=True,
)

In [None]:
# Preview the frame again now that the low-signal columns are gone.
df.head(n=5)

## Data Imbalance

In [None]:
# Separate the feature matrix (every column except `class`) from the target values
# (the `class` column as a NumPy array).
x = df.drop(columns=["class"])
y = df["class"].values

In [None]:
# Balance the classes with SMOTE, which synthesises new minority-class samples rather than
# duplicating existing rows; the class counts are printed before and after resampling.
# NOTE(review): resampling happens before the train/test split further down, so synthetic
# points derived from rows that later land in the test set can end up in the training
# set. Consider splitting first and resampling only the training portion.
print("Original Dataset Shape %s" % Counter(y))
sm = SMOTE(random_state=39)
x, y = sm.fit_resample(x, y)
print("Resampled Dataset Shape %s" % Counter(y))

In [None]:
# Bar chart of the per-class sample counts after resampling.
# The labels are passed by keyword: positional use was deprecated and, in current seaborn
# releases, the first positional argument is `data` (x and y are keyword-only), so
# countplot(y) would no longer count the label values.
sns.countplot(x=y)
plt.title("Class")
plt.show()

## PCA

In [None]:
# Standardise every feature to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the whole resampled dataset, i.e. before the
# train/test split, so test-set statistics influence the scaling. Fitting on the
# training portion only would avoid that.
scaler = StandardScaler()
scaler.fit(x)
x = scaler.transform(x)

In [None]:
# Project the standardised features onto the first 4 principal components and attach the
# class labels, producing `final_DF`.
y = pd.DataFrame(y, columns=["class"])
# A fixed component count is used on purpose: per the original author's note,
# PCA(0.95) (keep 95 % of the variance) selected 5 components instead of 4.
pca = PCA(n_components=4)
p_Components = pca.fit_transform(x)
# Generate the column labels from the actual number of components so they stay consistent;
# the previous hand-written list misspelled the second label as "Principle Components 2".
pc_columns = ["Principal Components %d" % (k + 1) for k in range(p_Components.shape[1])]
p_DF = pd.DataFrame(data=p_Components, columns=pc_columns)
final_DF = pd.concat([p_DF, y[["class"]]], axis=1)
final_DF

## Train & Test Data Split

In [None]:
# Hold out 33 % of the samples for testing (fixed seed for a repeatable split) and print
# the shape of each resulting part.
splits = train_test_split(p_DF, y, test_size=0.33, random_state=42)
x_train, x_test, y_train, y_test = splits
print("x_train: {}".format(x_train.shape))
print("x_test: {}".format(x_test.shape))
print("y_train: {}".format(y_train.shape))
print("y_test: {}".format(y_test.shape))

# Modelling - RF

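Fitting a Random Forest with scikit-learn's default hyper-parameters (listed below); the only value overridden in the code is `random_state`, which is fixed so the run is repeatable.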

n_estimators=100,
criterion='gini',
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
min_weight_fraction_leaf=0.0,
max_features='auto',
max_leaf_nodes=None,
min_impurity_decrease=0.0,
bootstrap=True,
oob_score=False,
n_jobs=None,
random_state=None,
verbose=0,
warm_start=False,
class_weight=None,
ccp_alpha=0.0,
max_samples=None

In [None]:
# Baseline classifier: a random forest with library defaults and a fixed seed so that
# repeated runs build the same trees.
rf_model = RandomForestClassifier(random_state=123_456)

In [None]:
# Train the baseline forest on the training split.
# y_train is a one-column DataFrame; np.ravel hands sklearn the 1-D label array it expects
# instead of relying on the implicit conversion (which emits a DataConversionWarning that
# the global warnings filter hides).
rf_model.fit(x_train, np.ravel(y_train))

In [None]:
# Predict the class of every sample in the held-out test split.
y_predicted = rf_model.predict(X=x_test)

## Accuracy and report of RF with default parameters

In [None]:
# Fraction of test samples whose predicted class matches the true class.
accuracy_score(y_true=y_test, y_pred=y_predicted)

In [None]:
# Confusion matrix of the test predictions (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=y_predicted)

In [None]:
# Draw the confusion matrix as an annotated heatmap. The tick labels follow the integer
# encoding applied when the data was loaded: 0 = GALAXY, 1 = STAR, 2 = every other label
# (shown here as QSO).
class_names = ['GALAXY','STAR','QSO']
ax = sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names)
ax.set_xlabel('Predicted label', fontsize=14)
ax.set_ylabel('True label', fontsize=14)
ax.set_title('Confusion Matrix', fontsize=16)
plt.show()

In [None]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_true=y_test, y_pred=y_predicted))

## RF with Hyper-Parameter Optimization

Default RF parameters

n_estimators=100, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None,

In [None]:
# Hyper-parameter search space for the random forest.
# NOTE(review): every candidate limits the trees to max_depth <= 9 and max_leaf_nodes <= 9,
# so the unconstrained default model (max_depth=None, max_leaf_nodes=None) is not part of
# the space and the search cannot reproduce the baseline.
param = {
    'n_estimators': [25, 50, 100, 150],
    'criterion' : ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
        }
# 4 * 2 * 3 * 3 * 3 = 216 combinations, each cross-validated on 5 folds (1080 fits).
# n_jobs=-1 runs the fits on all available CPU cores instead of one after another.
rf_grid_search = GridSearchCV(estimator = rf_model, param_grid = param, cv=5, n_jobs=-1)

In [None]:
# Exhaustive search over the whole grid. This is the slowest cell of the notebook: the
# original run was stopped by hand (KeyboardInterrupt) before it finished.
# np.ravel passes the labels as a 1-D array instead of a one-column DataFrame.
rf_grid_search.fit(x_train, np.ravel(y_train))

Since the exhaustive grid search took too long to finish (the run above was interrupted), a randomized search that evaluates only a few sampled parameter combinations is used below instead.

In [None]:
# Randomized alternative to the grid search: evaluates n_iter=2 parameter combinations
# sampled from `param` (seeded for repeatability), each with 5-fold cross-validation.
# n_jobs=-1 runs the cross-validation fits on all available CPU cores.
# NOTE(review): two samples cover under 1 % of the grid; raise n_iter for a meaningful search.
rf_random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param,
    cv=5,
    n_iter=2,
    random_state=6,
    n_jobs=-1,
)

In [None]:
# Run the randomized search on the training split.
# np.ravel passes the labels as a 1-D array instead of a one-column DataFrame.
rf_random_search.fit(x_train, np.ravel(y_train))

In [None]:
# Mean cross-validated score of the best parameter combination found by the randomized search.
rf_random_search.best_score_

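In this run hyper-parameter tuning did not improve on the baseline: the randomized search tried only two configurations, all drawn from a grid of shallow, heavily constrained trees, so the default parameters are kept as the best model.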

In [None]:
# Compare the baseline forest's accuracy on the training split with its accuracy on the
# test split; a large gap between the two would point to over-fitting.
y_predicted_train = rf_model.predict(X=x_train)

training_accuracy = accuracy_score(y_true=y_train, y_pred=y_predicted_train)
testing_accuracy = accuracy_score(y_true=y_test, y_pred=y_predicted)

print("training accuracy : ",training_accuracy)
print("testing accuracy : ",testing_accuracy)
