In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

# Data Prep

In [2]:
df_train = pd.read_csv('D://datasets/sky_server/Skyserver_04_24_2023 12_30_26 PM.csv', low_memory=False)
df_train.shape

(100000, 18)

In [3]:
df_train

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,camcol,field,specobjid,class,redshift,plate,mjd,fiberid
0,1.237680e+18,342.697634,11.107615,19.55744,17.46571,16.44621,15.97754,15.57301,7773,301,2,197,5.685810e+18,GALAXY,0.082067,5050,56215,66
1,1.237680e+18,7.273359,12.007943,19.51252,17.58586,16.59778,16.18047,15.75792,7773,301,2,358,6.969350e+18,GALAXY,0.092747,6190,56210,100
2,1.237660e+18,347.324134,0.064354,18.22867,16.31122,15.43749,14.96799,14.55907,4263,301,4,31,7.635110e+17,GALAXY,0.015748,678,52884,548
3,1.237680e+18,327.676605,9.454218,18.80359,16.91130,16.02542,15.61801,15.29591,7773,301,2,97,4.610700e+18,GALAXY,0.071726,4095,55497,492
4,1.237660e+18,173.838263,44.396655,19.56951,18.29280,17.76531,17.58614,17.52179,3813,301,3,174,3.619790e+18,STAR,-0.000070,3215,54861,74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1.237660e+18,157.401770,55.280518,18.85913,17.81282,17.39851,17.21817,17.17015,2735,301,6,83,3.577010e+18,STAR,-0.000221,3177,54833,107
99996,1.237660e+18,165.798849,56.876348,18.37093,17.50092,17.22220,17.11626,17.08536,2735,301,6,116,7.996220e+18,STAR,-0.000739,7102,56666,284
99997,1.237670e+18,37.736250,25.176344,18.10809,16.26831,15.53559,15.24569,15.07602,5817,301,4,120,2.678560e+18,STAR,-0.000072,2379,53762,177
99998,1.237670e+18,205.885357,0.729868,17.48545,16.50522,16.26882,16.18110,16.15477,6174,301,4,97,3.368010e+17,STAR,-0.000338,299,51671,572


In [4]:
df_train['class'].value_counts()

GALAXY    51240
STAR      38160
QSO       10600
Name: class, dtype: int64

In [5]:
df_train.drop(['run', 'rerun', 'camcol', 'field', 'objid', 'specobjid', 'fiberid'], axis = 1, inplace = True)

df_train

Unnamed: 0,ra,dec,u,g,r,i,z,class,redshift,plate,mjd
0,342.697634,11.107615,19.55744,17.46571,16.44621,15.97754,15.57301,GALAXY,0.082067,5050,56215
1,7.273359,12.007943,19.51252,17.58586,16.59778,16.18047,15.75792,GALAXY,0.092747,6190,56210
2,347.324134,0.064354,18.22867,16.31122,15.43749,14.96799,14.55907,GALAXY,0.015748,678,52884
3,327.676605,9.454218,18.80359,16.91130,16.02542,15.61801,15.29591,GALAXY,0.071726,4095,55497
4,173.838263,44.396655,19.56951,18.29280,17.76531,17.58614,17.52179,STAR,-0.000070,3215,54861
...,...,...,...,...,...,...,...,...,...,...,...
99995,157.401770,55.280518,18.85913,17.81282,17.39851,17.21817,17.17015,STAR,-0.000221,3177,54833
99996,165.798849,56.876348,18.37093,17.50092,17.22220,17.11626,17.08536,STAR,-0.000739,7102,56666
99997,37.736250,25.176344,18.10809,16.26831,15.53559,15.24569,15.07602,STAR,-0.000072,2379,53762
99998,205.885357,0.729868,17.48545,16.50522,16.26882,16.18110,16.15477,STAR,-0.000338,299,51671


In [6]:
labels = {'STAR' :1, 'GALAXY': 2, 'QSO': 3}

df_train.replace({'class':labels}, inplace = True)

df_train

Unnamed: 0,ra,dec,u,g,r,i,z,class,redshift,plate,mjd
0,342.697634,11.107615,19.55744,17.46571,16.44621,15.97754,15.57301,2,0.082067,5050,56215
1,7.273359,12.007943,19.51252,17.58586,16.59778,16.18047,15.75792,2,0.092747,6190,56210
2,347.324134,0.064354,18.22867,16.31122,15.43749,14.96799,14.55907,2,0.015748,678,52884
3,327.676605,9.454218,18.80359,16.91130,16.02542,15.61801,15.29591,2,0.071726,4095,55497
4,173.838263,44.396655,19.56951,18.29280,17.76531,17.58614,17.52179,1,-0.000070,3215,54861
...,...,...,...,...,...,...,...,...,...,...,...
99995,157.401770,55.280518,18.85913,17.81282,17.39851,17.21817,17.17015,1,-0.000221,3177,54833
99996,165.798849,56.876348,18.37093,17.50092,17.22220,17.11626,17.08536,1,-0.000739,7102,56666
99997,37.736250,25.176344,18.10809,16.26831,15.53559,15.24569,15.07602,1,-0.000072,2379,53762
99998,205.885357,0.729868,17.48545,16.50522,16.26882,16.18110,16.15477,1,-0.000338,299,51671


# Performance Calculation

## Imbalance Data

In [7]:
x = df_train.drop('class', axis = 1)
y = df_train['class']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 123)

### KNN

In [9]:
knn = KNeighborsClassifier(n_neighbors=5)

In [10]:
knn.fit(X_train, y_train)

In [11]:
yhat = knn.predict(X_test)

In [12]:
acc = accuracy_score(y_test, yhat)
print("Accuracy:", acc)
prec = precision_score(y_test, yhat, average='macro')
print("Precision:", prec)
rec = recall_score(y_test, yhat, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test, yhat, average='macro')
print("F1 Score:", f1)

Accuracy: 0.8333666666666667
Precision: 0.8136597030727479
Recall: 0.7226751888538239
F1 Score: 0.7509487529948077


### Logistic Regression

In [13]:
lr = LogisticRegression(class_weight = {1:10, 2:38, 3:42 }, max_iter = 1000)

In [14]:
lr.fit(X_train, y_train)

In [15]:
yhat = lr.predict(X_test)

In [16]:
acc = accuracy_score(y_test, yhat)
print("Accuracy:", acc)
prec = precision_score(y_test, yhat, average='macro')
print("Precision:", prec)
rec = recall_score(y_test, yhat, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test, yhat, average='macro')
print("F1 Score:", f1)

Accuracy: 0.6372666666666666
Precision: 0.6799576997747776
Recall: 0.6399397759192383
F1 Score: 0.567548516867753


### ADABoost

In [17]:
adb =  AdaBoostClassifier()

In [18]:
adb.fit(X_train, y_train)

In [19]:
yhat = adb.predict(X_test)

In [20]:
acc = accuracy_score(y_test, yhat)
print("Accuracy:", acc)
prec = precision_score(y_test, yhat, average='macro')
print("Precision:", prec)
rec = recall_score(y_test, yhat, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test, yhat, average='macro')
print("F1 Score:", f1)

Accuracy: 0.8126
Precision: 0.8959335353411254
Recall: 0.650212007334976
F1 Score: 0.6825804777658563


### Naive Bayes

In [21]:
nb = GaussianNB()

In [22]:
nb.fit(X_train, y_train)

In [23]:
yhat = nb.predict(X_test)

In [24]:
acc = accuracy_score(y_test, yhat)
print("Accuracy:", acc)
prec = precision_score(y_test, yhat, average='macro')
print("Precision:", prec)
rec = recall_score(y_test, yhat, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test, yhat, average='macro')
print("F1 Score:", f1)

Accuracy: 0.7658333333333334
Precision: 0.7680569136811726
Recall: 0.7872751078629969
F1 Score: 0.7650764104422393


### MLP

In [25]:
mlp = MLPClassifier()

In [26]:
mlp.fit(X_train, y_train)

In [27]:
yhat = mlp.predict(X_test)

In [28]:
acc = accuracy_score(y_test, yhat)
print("Accuracy:", acc)
prec = precision_score(y_test, yhat, average='macro')
print("Precision:", prec)
rec = recall_score(y_test, yhat, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test, yhat, average='macro')
print("F1 Score:", f1)

Accuracy: 0.7449
Precision: 0.7397487915556709
Recall: 0.7370007144157257
F1 Score: 0.7379245502312015


## SMOTE

In [29]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='minority')

In [30]:
x_resample_1, y_resample_1 = smote.fit_resample(x, y)
pd.DataFrame(x_resample_1).to_csv('D://datasets/sky_server/processed/x_resample_1.csv', index = False)
pd.DataFrame(y_resample_1).to_csv('D://datasets/sky_server/processed/y_resample_1.csv', index = False)

In [31]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(x_resample_1, y_resample_1, test_size=0.3, random_state=123)

In [32]:
y_resample_1.value_counts()

2    51240
3    51240
1    38160
Name: class, dtype: int64

### KNN

In [33]:
knn.fit(X_train_1, y_train_1)

In [34]:
yhat_1 = knn.predict(X_test_1)

In [35]:
acc = accuracy_score(y_test_1, yhat_1)
print("Accuracy:", acc)
prec = precision_score(y_test_1, yhat_1, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_1, yhat_1, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_1, yhat_1, average='macro')
print("F1 Score:", f1)

Accuracy: 0.8667282897231703
Precision: 0.8678965099951613
Recall: 0.8552149718757821
F1 Score: 0.858859721383471


### Logistic Regression

In [36]:
lr= LogisticRegression(max_iter=1000000)
lr.fit(X_train_1, y_train_1)

In [37]:
yhat_1 = lr.predict(X_test_1)

In [38]:
acc = accuracy_score(y_test_1, yhat_1)
print("Accuracy:", acc)
prec = precision_score(y_test_1, yhat_1, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_1, yhat_1, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_1, yhat_1, average='macro')
print("F1 Score:", f1)

Accuracy: 0.7623483124762989
Precision: 0.7558974034835234
Recall: 0.7330075448289747
F1 Score: 0.7281441432396507


### ADABoost

In [39]:
adb =  AdaBoostClassifier(n_estimators=3)
adb.fit(X_train_1, y_train_1)

In [40]:
yhat_1 = adb.predict(X_test_1)

In [41]:
acc = accuracy_score(y_test_1, yhat_1)
print("Accuracy:", acc)
prec = precision_score(y_test_1, yhat_1, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_1, yhat_1, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_1, yhat_1, average='macro')
print("F1 Score:", f1)

Accuracy: 0.9810864618885097
Precision: 0.9818855240655934
Recall: 0.9824775141610439
F1 Score: 0.982104085311267


### Naive Bayes

In [42]:
nb.fit(X_train_1, y_train_1)

In [43]:
yhat_1 = nb.predict(X_test_1)

In [44]:
acc = accuracy_score(y_test_1, yhat_1)
print("Accuracy:", acc)
prec = precision_score(y_test_1, yhat_1, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_1, yhat_1, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_1, yhat_1, average='macro')
print("F1 Score:", f1)

Accuracy: 0.796833522942738
Precision: 0.7954125228454011
Recall: 0.7699127609402149
F1 Score: 0.7682192627230112


### MLP

In [45]:
mlp.fit(X_train_1, y_train_1)

In [46]:
yhat_1 = mlp.predict(X_test_1)

In [47]:
acc = accuracy_score(y_test_1, yhat_1)
print("Accuracy:", acc)
prec = precision_score(y_test_1, yhat_1, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_1, yhat_1, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_1, yhat_1, average='macro')
print("F1 Score:", f1)

Accuracy: 0.8340443686006825
Precision: 0.8292992841541015
Recall: 0.8199426183752228
F1 Score: 0.8228405244619079


## ADASYN

In [48]:
from imblearn.over_sampling import ADASYN
adasyn = ADASYN(sampling_strategy='not minority')

In [49]:
x_resample_2, y_resample_2 = adasyn.fit_resample(x, y)
pd.DataFrame(x_resample_2).to_csv('D://datasets/sky_server/processed/x_resample_2.csv', index = False)
pd.DataFrame(y_resample_2).to_csv('D://datasets/sky_server/processed/y_resample_2.csv', index = False)

In [50]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(x_resample_2, y_resample_2, test_size=0.25, random_state=1)

### KNN

In [51]:
knn.fit(X_train_2, y_train_2)

In [52]:
yhat_2 = knn.predict(X_test_2)

In [53]:
acc = accuracy_score(y_test_2, yhat_2)
print("Accuracy:", acc)
prec = precision_score(y_test_2, yhat_2, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_2, yhat_2, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_2, yhat_2, average='macro')
print("F1 Score:", f1)

Accuracy: 0.8195467521939872
Precision: 0.8093880015315212
Recall: 0.6963680988844053
F1 Score: 0.7254480198707723


### Logistic Regression

In [54]:
lr.fit(X_train_2, y_train_2)

In [55]:
yhat_2 = lr.predict(X_test_2)

In [56]:
acc = accuracy_score(y_test_2, yhat_2)
print("Accuracy:", acc)
prec = precision_score(y_test_2, yhat_2, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_2, yhat_2, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_2, yhat_2, average='macro')
print("F1 Score:", f1)

Accuracy: 0.7412328622281746
Precision: 0.7517396100580962
Recall: 0.7250190128320592
F1 Score: 0.736931334607493


### ADABoost

In [57]:
adb.fit(X_train_2, y_train_2)

In [58]:
yhat_2 = adb.predict(X_test_2)

In [59]:
acc = accuracy_score(y_test_2, yhat_2)
print("Accuracy:", acc)
prec = precision_score(y_test_2, yhat_2, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_2, yhat_2, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_2, yhat_2, average='macro')
print("F1 Score:", f1)

Accuracy: 0.9035350509286998
Precision: 0.6067405507842231
Recall: 0.6632030603804798
F1 Score: 0.632651999009522


### Naive Bayes

In [60]:
nb.fit(X_train_2, y_train_2)

In [61]:
yhat_2 = nb.predict(X_test_2)

In [62]:
acc = accuracy_score(y_test_2, yhat_2)
print("Accuracy:", acc)
prec = precision_score(y_test_2, yhat_2, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_2, yhat_2, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_2, yhat_2, average='macro')
print("F1 Score:", f1)

Accuracy: 0.7524759454410883
Precision: 0.765203634009052
Recall: 0.7971783897400475
F1 Score: 0.7663272154164975


### MLP

In [63]:
mlp.fit(X_train_2, y_train_2)

In [64]:
yhat_2 = mlp.predict(X_test_2)

In [65]:
acc = accuracy_score(y_test_2, yhat_2)
print("Accuracy:", acc)
prec = precision_score(y_test_2, yhat_2, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_2, yhat_2, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_2, yhat_2, average='macro')
print("F1 Score:", f1)

Accuracy: 0.4544108835865083
Precision: 0.4895137048983305
Recall: 0.335459794672877
F1 Score: 0.21401797802583053


## Borderline SMOTE

In [66]:
from imblearn.over_sampling import BorderlineSMOTE
boderline_smote = BorderlineSMOTE(sampling_strategy='minority')

In [67]:
x_resample_3, y_resample_3 = boderline_smote.fit_resample(x, y)
pd.DataFrame(x_resample_3).to_csv('D://datasets/sky_server/processed/x_resample_3.csv', index = False)
pd.DataFrame(y_resample_3).to_csv('D://datasets/sky_server/processed/y_resample_3.csv', index = False)

In [68]:
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(x_resample_3, y_resample_3, test_size=0.25, random_state=1)

### KNN

In [69]:
knn.fit(X_train_3, y_train_3)

In [70]:
yhat_3 = knn.predict(X_test_3)

In [71]:
acc = accuracy_score(y_test_3, yhat_3)
print("Accuracy:", acc)
prec = precision_score(y_test_3, yhat_3, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_3, yhat_3, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_3, yhat_3, average='macro')
print("F1 Score:", f1)

Accuracy: 0.8802332195676905
Precision: 0.8829239732672014
Recall: 0.8677501141197306
F1 Score: 0.8718030311816709


### Logistic Regression

In [72]:
lr.fit(X_train_3, y_train_3)

In [73]:
yhat_3 = lr.predict(X_test_3)

In [74]:
acc = accuracy_score(y_test_3, yhat_3)
print("Accuracy:", acc)
prec = precision_score(y_test_3, yhat_3, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_3, yhat_3, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_3, yhat_3, average='macro')
print("F1 Score:", f1)

Accuracy: 0.865358361774744
Precision: 0.8607082658012745
Recall: 0.857429330735416
F1 Score: 0.8586820709379753


### ADABoost

In [75]:
adb.fit(X_train_3, y_train_3)

In [76]:
yhat_3 = adb.predict(X_test_3)

In [77]:
acc = accuracy_score(y_test_3, yhat_3)
print("Accuracy:", acc)
prec = precision_score(y_test_3, yhat_3, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_3, yhat_3, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_3, yhat_3, average='macro')
print("F1 Score:", f1)

Accuracy: 0.9830204778156997
Precision: 0.9834655226776502
Recall: 0.9842635402149537
F1 Score: 0.9838093032454159


### Naive Bayes

In [78]:
nb.fit(X_train_3, y_train_3)

In [79]:
yhat_3 = nb.predict(X_test_3)

In [80]:
acc = accuracy_score(y_test_3, yhat_3)
print("Accuracy:", acc)
prec = precision_score(y_test_3, yhat_3, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_3, yhat_3, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_3, yhat_3, average='macro')
print("F1 Score:", f1)

Accuracy: 0.8013936291240046
Precision: 0.7988170601914327
Recall: 0.7741959859876202
F1 Score: 0.7719430060175942


### MLP

In [81]:
mlp.fit(X_train_3, y_train_3)

In [82]:
yhat_3 = mlp.predict(X_test_3)

In [83]:
acc = accuracy_score(y_test_3, yhat_3)
print("Accuracy:", acc)
prec = precision_score(y_test_3, yhat_3, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_3, yhat_3, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_3, yhat_3, average='macro')
print("F1 Score:", f1)

Accuracy: 0.5699943117178612
Precision: 0.6831482301296871
Recall: 0.5903802476874142
F1 Score: 0.5486258820695584


## SVM-SMOTE

In [84]:
from imblearn.over_sampling import SVMSMOTE 
svm_smote = SVMSMOTE(sampling_strategy='minority')

In [85]:
x_resample_4, y_resample_4 = svm_smote.fit_resample(x, y)
pd.DataFrame(x_resample_4).to_csv('D://datasets/sky_server/processed/x_resample_4.csv', index = False)
pd.DataFrame(y_resample_4).to_csv('D://datasets/sky_server/processed/y_resample_4.csv', index = False)

In [86]:
X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(x_resample_4, y_resample_4, test_size=0.25, random_state=1)

### KNN

In [87]:
knn.fit(X_train_4, y_train_4)

In [88]:
yhat_4 = knn.predict(X_test_4)

In [89]:
acc = accuracy_score(y_test_4, yhat_4)
print("Accuracy:", acc)
prec = precision_score(y_test_4, yhat_4, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_4, yhat_4, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_4, yhat_4, average='macro')
print("F1 Score:", f1)

Accuracy: 0.8822810011376564
Precision: 0.8832492396827059
Recall: 0.8707510414666317
F1 Score: 0.8743265830463182


### Logistic Regression

In [90]:
lr.fit(X_train_4, y_train_4)

In [91]:
yhat_4 = lr.predict(X_test_4)

In [92]:
acc = accuracy_score(y_test_4, yhat_4)
print("Accuracy:", acc)
prec = precision_score(y_test_4, yhat_4, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_4, yhat_4, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_4, yhat_4, average='macro')
print("F1 Score:", f1)

Accuracy: 0.7757110352673493
Precision: 0.7659406607284606
Recall: 0.746793801025542
F1 Score: 0.7422004101026696


### ADABoost

In [93]:
adb.fit(X_train_4, y_train_4)

In [94]:
yhat_4 = adb.predict(X_test_4)

In [95]:
acc = accuracy_score(y_test_4, yhat_4)
print("Accuracy:", acc)
prec = precision_score(y_test_4, yhat_4, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_4, yhat_4, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_4, yhat_4, average='macro')
print("F1 Score:", f1)

Accuracy: 0.9825369738339021
Precision: 0.9830874237165516
Recall: 0.983724417232406
F1 Score: 0.9833305701319617


### Naive Bayes

In [96]:
nb.fit(X_train_4, y_train_4)

In [97]:
yhat_4 = nb.predict(X_test_4)

In [98]:
acc = accuracy_score(y_test_4, yhat_4)
print("Accuracy:", acc)
prec = precision_score(y_test_4, yhat_4, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_4, yhat_4, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_4, yhat_4, average='macro')
print("F1 Score:", f1)

Accuracy: 0.807764505119454
Precision: 0.804714285746209
Recall: 0.7806718167788468
F1 Score: 0.7786875036498238


### MLP

In [99]:
mlp.fit(X_train_4, y_train_4)

In [100]:
yhat_4 = mlp.predict(X_test_4)

In [101]:
acc = accuracy_score(y_test_4, yhat_4)
print("Accuracy:", acc)
prec = precision_score(y_test_4, yhat_4, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_4, yhat_4, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_4, yhat_4, average='macro')
print("F1 Score:", f1)

Accuracy: 0.819254835039818
Precision: 0.8169674916638355
Recall: 0.7938560302502097
F1 Score: 0.7931695863659537


## Cluster Centroid

In [102]:
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(sampling_strategy='majority', voting='hard')

In [103]:
x_resample_5, y_resample_5 = cc.fit_resample(x, y)
pd.DataFrame(x_resample_5).to_csv('D://datasets/sky_server/processed/x_resample_5.csv', index = False)
pd.DataFrame(y_resample_5).to_csv('D://datasets/sky_server/processed/y_resample_5.csv', index = False)

In [104]:
X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(x_resample_5, y_resample_5, test_size=0.25, random_state=1)

### KNN

In [105]:
knn.fit(X_train_5, y_train_5)

In [106]:
yhat_5 = knn.predict(X_test_5)

In [107]:
acc = accuracy_score(y_test_5, yhat_5)
print("Accuracy:", acc)
prec = precision_score(y_test_5, yhat_5, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_5, yhat_5, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_5, yhat_5, average='macro')
print("F1 Score:", f1)

Accuracy: 0.7424528301886792
Precision: 0.6789388731045505
Recall: 0.6248547504098921
F1 Score: 0.6455001580667238


### Logistic Regression

In [108]:
lr.fit(X_train_5, y_train_5)

In [109]:
yhat_5 = lr.predict(X_test_5)

In [110]:
acc = accuracy_score(y_test_5, yhat_5)
print("Accuracy:", acc)
prec = precision_score(y_test_5, yhat_5, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_5, yhat_5, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_5, yhat_5, average='macro')
print("F1 Score:", f1)

Accuracy: 0.8487870619946092
Precision: 0.8357566267586264
Recall: 0.774172422938137
F1 Score: 0.800659442936119


### ADABoost

In [111]:
adb.fit(X_train_5, y_train_5)

In [112]:
yhat_5 = adb.predict(X_test_5)

In [113]:
acc = accuracy_score(y_test_5, yhat_5)
print("Accuracy:", acc)
prec = precision_score(y_test_5, yhat_5, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_5, yhat_5, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_5, yhat_5, average='macro')
print("F1 Score:", f1)

Accuracy: 0.8235175202156334
Precision: 0.5005557099870132
Recall: 0.6662883087400681
F1 Score: 0.5570114146262708


### Naive Bayes

In [114]:
nb.fit(X_train_5, y_train_5)

In [115]:
yhat_5 = nb.predict(X_test_5)

In [116]:
acc = accuracy_score(y_test_5, yhat_5)
print("Accuracy:", acc)
prec = precision_score(y_test_5, yhat_5, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_5, yhat_5, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_5, yhat_5, average='macro')
print("F1 Score:", f1)

Accuracy: 0.7842991913746631
Precision: 0.7194358265859621
Recall: 0.7543993544103236
F1 Score: 0.7340185787296902


### MLP

In [117]:
mlp.fit(X_train_5, y_train_5)

In [118]:
yhat_5 = mlp.predict(X_test_5)

In [119]:
acc = accuracy_score(y_test_5, yhat_5)
print("Accuracy:", acc)
prec = precision_score(y_test_5, yhat_5, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_5, yhat_5, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_5, yhat_5, average='macro')
print("F1 Score:", f1)

Accuracy: 0.6538409703504043
Precision: 0.6978516038820081
Recall: 0.465690650192343
F1 Score: 0.4342827873408503


## EditedNearestNeighbours

In [120]:
from imblearn.under_sampling import EditedNearestNeighbours
enn = EditedNearestNeighbours(sampling_strategy = 'majority') 

In [121]:
x_resample_6, y_resample_6 = enn.fit_resample(x, y)
pd.DataFrame(x_resample_6).to_csv('D://datasets/sky_server/processed/x_resample_6.csv', index = False)
pd.DataFrame(y_resample_6).to_csv('D://datasets/sky_server/processed/y_resample_6.csv', index = False)

In [122]:
X_train_6, X_test_6, y_train_6, y_test_6 = train_test_split(x_resample_6, y_resample_6, test_size=0.25, random_state=1)

### KNN

In [123]:
knn.fit(X_train_6, y_train_6)

In [124]:
yhat_6 = knn.predict(X_test_6)

In [125]:
acc = accuracy_score(y_test_6, yhat_6)
print("Accuracy:", acc)
prec = precision_score(y_test_6, yhat_6, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_6, yhat_6, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_6, yhat_6, average='macro')
print("F1 Score:", f1)

Accuracy: 0.8480359147025813
Precision: 0.8207446418265004
Recall: 0.7522770751268765
F1 Score: 0.7727923947230408


### Logistic Regression

In [126]:
lr.fit(X_train_6, y_train_6)

In [127]:
yhat_6 = lr.predict(X_test_6)

In [128]:
acc = accuracy_score(y_test_6, yhat_6)
print("Accuracy:", acc)
prec = precision_score(y_test_6, yhat_6, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_6, yhat_6, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_6, yhat_6, average='macro')
print("F1 Score:", f1)

Accuracy: 0.6762289562289562
Precision: 0.45186185864658296
Recall: 0.5114146552436981
F1 Score: 0.47876352083688983


### ADABoost

In [129]:
adb.fit(X_train_6, y_train_6)

In [130]:
yhat_6 = adb.predict(X_test_6)

In [131]:
acc = accuracy_score(y_test_6, yhat_6)
print("Accuracy:", acc)
prec = precision_score(y_test_6, yhat_6, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_6, yhat_6, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_6, yhat_6, average='macro')
print("F1 Score:", f1)

Accuracy: 0.8782491582491583
Precision: 0.5944415994400436
Recall: 0.6650285828903821
F1 Score: 0.6257483219618348


### Naive Bayes

In [132]:
nb.fit(X_train_6, y_train_6)

In [133]:
yhat_6 = nb.predict(X_test_6)

In [134]:
acc = accuracy_score(y_test_6, yhat_6)
print("Accuracy:", acc)
prec = precision_score(y_test_6, yhat_6, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_6, yhat_6, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_6, yhat_6, average='macro')
print("F1 Score:", f1)

Accuracy: 0.7881032547699215
Precision: 0.7950034420296821
Recall: 0.8154557552538623
F1 Score: 0.7941391327220743


### MLP

In [135]:
mlp.fit(X_train_6, y_train_6)

In [136]:
yhat_6 = mlp.predict(X_test_6)

In [137]:
acc = accuracy_score(y_test_6, yhat_6)
print("Accuracy:", acc)
prec = precision_score(y_test_6, yhat_6, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_6, yhat_6, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_6, yhat_6, average='macro')
print("F1 Score:", f1)

Accuracy: 0.5673625140291807
Precision: 0.8032421337935234
Recall: 0.6363721383747647
F1 Score: 0.5746352078056338


## Near Miss

In [138]:
from imblearn.under_sampling import NearMiss
nm = NearMiss(sampling_strategy='majority')

In [139]:
x_resample_7, y_resample_7 = nm.fit_resample(x, y)
pd.DataFrame(x_resample_7).to_csv('D://datasets/sky_server/processed/x_resample_7.csv', index = False)
pd.DataFrame(y_resample_7).to_csv('D://datasets/sky_server/processed/y_resample_7.csv', index = False)

In [140]:
X_train_7, X_test_7, y_train_7, y_test_7 = train_test_split(x_resample_7, y_resample_7, test_size=0.25, random_state=1)

### KNN

In [141]:
knn.fit(X_train_7, y_train_7)

In [142]:
yhat_7 = knn.predict(X_test_7)

In [143]:
acc = accuracy_score(y_test_7, yhat_7)
print("Accuracy:", acc)
prec = precision_score(y_test_7, yhat_7, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_7, yhat_7, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_7, yhat_7, average='macro')
print("F1 Score:", f1)

Accuracy: 0.8181266846361186
Precision: 0.7563408848393288
Recall: 0.7621649149466734
F1 Score: 0.7491374517865727


### Logistic Regression

In [144]:
lr.fit(X_train_7, y_train_7)

In [145]:
yhat_7 = lr.predict(X_test_7)

In [146]:
acc = accuracy_score(y_test_7, yhat_7)
print("Accuracy:", acc)
prec = precision_score(y_test_7, yhat_7, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_7, yhat_7, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_7, yhat_7, average='macro')
print("F1 Score:", f1)

Accuracy: 0.8151617250673855
Precision: 0.7834525131347704
Recall: 0.7255231686716197
F1 Score: 0.7477229537468596


### ADABoost

In [147]:
adb.fit(X_train_7, y_train_7)

In [148]:
yhat_7 = adb.predict(X_test_7)

In [149]:
acc = accuracy_score(y_test_7, yhat_7)
print("Accuracy:", acc)
prec = precision_score(y_test_7, yhat_7, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_7, yhat_7, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_7, yhat_7, average='macro')
print("F1 Score:", f1)

Accuracy: 0.8236522911051213
Precision: 0.5007197535862487
Recall: 0.6665405473578004
F1 Score: 0.5565256902860738


### Naive Bayes

In [150]:
nb.fit(X_train_7, y_train_7)

In [151]:
yhat_7 = nb.predict(X_test_7)

In [152]:
acc = accuracy_score(y_test_7, yhat_7)
print("Accuracy:", acc)
prec = precision_score(y_test_7, yhat_7, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_7, yhat_7, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_7, yhat_7, average='macro')
print("F1 Score:", f1)

Accuracy: 0.815566037735849
Precision: 0.7674269672042975
Recall: 0.8558257240054701
F1 Score: 0.7951745120610956


### MLP

In [153]:
mlp.fit(X_train_7, y_train_7)

In [154]:
yhat_7 = mlp.predict(X_test_7)

In [155]:
acc = accuracy_score(y_test_7, yhat_7)
print("Accuracy:", acc)
prec = precision_score(y_test_7, yhat_7, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_7, yhat_7, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_7, yhat_7, average='macro')
print("F1 Score:", f1)

Accuracy: 0.4267520215633423
Precision: 0.4094999362892206
Recall: 0.4609817911019056
F1 Score: 0.31300036406012444


## NeighbourhoodCleaningRule

In [156]:
from imblearn.under_sampling import NeighbourhoodCleaningRule
ncr = NeighbourhoodCleaningRule(sampling_strategy='majority')

In [157]:
x_resample_8, y_resample_8 = ncr.fit_resample(x, y)
pd.DataFrame(x_resample_8).to_csv('D://datasets/sky_server/processed/x_resample_8.csv', index = False)
pd.DataFrame(y_resample_8).to_csv('D://datasets/sky_server/processed/y_resample_8.csv', index = False)

In [158]:
X_train_8, X_test_8, y_train_8, y_test_8 = train_test_split(x_resample_8, y_resample_8, test_size=0.25, random_state=1)

### KNN

In [159]:
knn.fit(X_train_8, y_train_8)

In [160]:
yhat_8 = knn.predict(X_test_8)

In [161]:
acc = accuracy_score(y_test_8, yhat_8)
print("Accuracy:", acc)
prec = precision_score(y_test_8, yhat_8, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_8, yhat_8, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_8, yhat_8, average='macro')
print("F1 Score:", f1)

Accuracy: 0.8492298822172802
Precision: 0.824566638738672
Recall: 0.7364786990971993
F1 Score: 0.7633121598162068


### Logistic Regression

In [162]:
lr.fit(X_train_8, y_train_8)

In [163]:
yhat_8 = lr.predict(X_test_8)

In [164]:
acc = accuracy_score(y_test_8, yhat_8)
print("Accuracy:", acc)
prec = precision_score(y_test_8, yhat_8, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_8, yhat_8, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_8, yhat_8, average='macro')
print("F1 Score:", f1)

Accuracy: 0.7869203525245038
Precision: 0.7949695294479492
Recall: 0.7683249178884527
F1 Score: 0.7784657547764646


### ADABoost

In [165]:
adb.fit(X_train_8, y_train_8)

In [166]:
yhat_8 = adb.predict(X_test_8)

In [167]:
acc = accuracy_score(y_test_8, yhat_8)
print("Accuracy:", acc)
prec = precision_score(y_test_8, yhat_8, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_8, yhat_8, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_8, yhat_8, average='macro')
print("F1 Score:", f1)

Accuracy: 0.9852977514208056
Precision: 0.9781841496881573
Recall: 0.9719909463503985
F1 Score: 0.9750312762533332


### Naive Bayes

In [168]:
nb.fit(X_train_8, y_train_8)

In [169]:
yhat_8 = nb.predict(X_test_8)

In [170]:
acc = accuracy_score(y_test_8, yhat_8)
print("Accuracy:", acc)
prec = precision_score(y_test_8, yhat_8, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_8, yhat_8, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_8, yhat_8, average='macro')
print("F1 Score:", f1)

Accuracy: 0.7848200312989045
Precision: 0.7927607844068967
Recall: 0.8050003145749649
F1 Score: 0.7878155935091439


### MLP

In [171]:
mlp.fit(X_train_8, y_train_8)

In [172]:
yhat_8 = mlp.predict(X_test_8)

In [173]:
acc = accuracy_score(y_test_8, yhat_8)
print("Accuracy:", acc)
prec = precision_score(y_test_8, yhat_8, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_8, yhat_8, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_8, yhat_8, average='macro')
print("F1 Score:", f1)

Accuracy: 0.6284078741454575
Precision: 0.715540943607643
Recall: 0.6573358715538876
F1 Score: 0.5852797506344155


## SMOTE ENN

In [174]:
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(sampling_strategy='auto')

In [175]:
x_resample_9, y_resample_9 = smote_enn.fit_resample(x, y)
pd.DataFrame(x_resample_9).to_csv('D://datasets/sky_server/processed/x_resample_9.csv', index = False)
pd.DataFrame(y_resample_9).to_csv('D://datasets/sky_server/processed/y_resample_9.csv', index = False)

In [176]:
X_train_9, X_test_9, y_train_9, y_test_9 = train_test_split(x_resample_9, y_resample_9, test_size=0.25, random_state=1)

### KNN

In [177]:
knn.fit(X_train_9, y_train_9)

In [178]:
yhat_9 = knn.predict(X_test_9)

In [179]:
acc = accuracy_score(y_test_9, yhat_9)
print("Accuracy:", acc)
prec = precision_score(y_test_9, yhat_9, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_9, yhat_9, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_9, yhat_9, average='macro')
print("F1 Score:", f1)

Accuracy: 0.9643922751565626
Precision: 0.9647648455739785
Recall: 0.9643701832745207
F1 Score: 0.964500725972754


### Logistic Regression

In [180]:
lr.fit(X_train_9, y_train_9)

In [181]:
yhat_9 = lr.predict(X_test_9)

In [182]:
acc = accuracy_score(y_test_9, yhat_9)
print("Accuracy:", acc)
prec = precision_score(y_test_9, yhat_9, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_9, yhat_9, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_9, yhat_9, average='macro')
print("F1 Score:", f1)

Accuracy: 0.8816026754975573
Precision: 0.8782323178059098
Recall: 0.8777947365433668
F1 Score: 0.8778544109646313


### ADABoost

In [183]:
adb.fit(X_train_9, y_train_9)

In [184]:
yhat_9 = adb.predict(X_test_9)

In [185]:
acc = accuracy_score(y_test_9, yhat_9)
print("Accuracy:", acc)
prec = precision_score(y_test_9, yhat_9, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_9, yhat_9, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_9, yhat_9, average='macro')
print("F1 Score:", f1)

Accuracy: 0.9904587035640513
Precision: 0.9902389240306513
Recall: 0.9908983006206832
F1 Score: 0.9905363608044566


### Naive Bayes

In [186]:
nb.fit(X_train_9, y_train_9)

In [187]:
yhat_9 = nb.predict(X_test_9)

In [188]:
acc = accuracy_score(y_test_9, yhat_9)
print("Accuracy:", acc)
prec = precision_score(y_test_9, yhat_9, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_9, yhat_9, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_9, yhat_9, average='macro')
print("F1 Score:", f1)

Accuracy: 0.8590117708777337
Precision: 0.8600759038463885
Recall: 0.8533195886468024
F1 Score: 0.8507842214599647


### MLP

In [189]:
mlp.fit(X_train_9, y_train_9)

In [190]:
yhat_9 = mlp.predict(X_test_9)

In [191]:
acc = accuracy_score(y_test_9, yhat_9)
print("Accuracy:", acc)
prec = precision_score(y_test_9, yhat_9, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_9, yhat_9, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_9, yhat_9, average='macro')
print("F1 Score:", f1)

Accuracy: 0.707072363028296
Precision: 0.8394000019907034
Recall: 0.6988239062847107
F1 Score: 0.661314555161073


## SMOTE Tomek

In [192]:
from imblearn.combine import SMOTETomek
smote_tomek = SMOTETomek(sampling_strategy='auto')

In [193]:
x_resample_10, y_resample_10= smote_tomek.fit_resample(x, y)
pd.DataFrame(x_resample_10).to_csv('D://datasets/sky_server/processed/x_resample_10.csv', index = False)
pd.DataFrame(y_resample_10).to_csv('D://datasets/sky_server/processed/y_resample_10.csv', index = False)

In [194]:
X_train_10, X_test_10, y_train_10, y_test_10 = train_test_split(x_resample_10, y_resample_10, test_size=0.25, random_state=1)

### KNN

In [195]:
knn.fit(X_train_10, y_train_10)

In [196]:
yhat_10 = knn.predict(X_test_10)

In [197]:
acc = accuracy_score(y_test_10, yhat_10)
print("Accuracy:", acc)
prec = precision_score(y_test_10, yhat_10, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_10, yhat_10, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_10, yhat_10, average='macro')
print("F1 Score:", f1)

Accuracy: 0.8860400385397709
Precision: 0.8865050969644105
Recall: 0.885385096392512
F1 Score: 0.8851875592107397


### Logistic Regression

In [198]:
lr.fit(X_train_10, y_train_10)

In [199]:
yhat_10 = lr.predict(X_test_10)

In [200]:
acc = accuracy_score(y_test_10, yhat_10)
print("Accuracy:", acc)
prec = precision_score(y_test_10, yhat_10, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_10, yhat_10, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_10, yhat_10, average='macro')
print("F1 Score:", f1)

Accuracy: 0.8615244620490311
Precision: 0.8641145757961701
Recall: 0.8608641553990314
F1 Score: 0.8620648494014981


### ADABoost

In [201]:
adb.fit(X_train_10, y_train_10)

In [202]:
yhat_10 = adb.predict(X_test_10)

In [203]:
acc = accuracy_score(y_test_10, yhat_10)
print("Accuracy:", acc)
prec = precision_score(y_test_10, yhat_10, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_10, yhat_10, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_10, yhat_10, average='macro')
print("F1 Score:", f1)

Accuracy: 0.9840755807729366
Precision: 0.9842479563781019
Recall: 0.9842716051918653
F1 Score: 0.9841915000227536


### Naive Bayes

In [204]:
nb.fit(X_train_10, y_train_10)

In [205]:
yhat_10 = nb.predict(X_test_10)

In [206]:
acc = accuracy_score(y_test_10, yhat_10)
print("Accuracy:", acc)
prec = precision_score(y_test_10, yhat_10, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_10, yhat_10, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_10, yhat_10, average='macro')
print("F1 Score:", f1)

Accuracy: 0.8018948720693716
Precision: 0.8097005879369344
Recall: 0.7991219962259409
F1 Score: 0.7921134748312125


### MLP

In [207]:
mlp.fit(X_train_10, y_train_10)

In [208]:
yhat_10 = mlp.predict(X_test_10)

In [209]:
acc = accuracy_score(y_test_10, yhat_10)
print("Accuracy:", acc)
prec = precision_score(y_test_10, yhat_10, average='macro')
print("Precision:", prec)
rec = recall_score(y_test_10, yhat_10, average='macro')
print("Recall:", rec)
f1 = f1_score(y_test_10, yhat_10, average='macro')
print("F1 Score:", f1)

Accuracy: 0.7204260785783106
Precision: 0.8266480618393223
Recall: 0.7158711500776241
F1 Score: 0.6897213458613107
