In [14]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder,StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


In [15]:

def remove_minus_9999(df, features):
    return df[~(df[features] == -9999).any(axis=1)].copy()

def impute_u_by_class(df, target_col='class'):
    means = df.groupby(target_col)['u'].mean()
    for cls, mean_val in means.items():
        mask = (df[target_col] == cls) & (df['u'].isnull())
        df.loc[mask, 'u'] = mean_val
    return df

def remove_outliers_iqr(df, features, threshold=2.5):
    for col in features:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - threshold * IQR
        upper = Q3 + threshold * IQR
        df = df[(df[col] >= lower) & (df[col] <= upper)]
    return df.copy()

def feature_engineer(df):
    df = df.copy()
    df['u_g'] = df['u'] - df['g']
    df['g_r'] = df['g'] - df['r']
    df['r_i'] = df['r'] - df['i']
    df['i_z'] = df['i'] - df['z']
    return df

def drop_columns(df, cols):
    return df.drop(columns=cols, errors='ignore')

DROP_COLS = ['rerun_ID', 'obj_ID', 'spec_obj_ID']
FILTER_COLS = ['u', 'g', 'r', 'i', 'z']

final_df = pd.read_csv('train.csv')

le = LabelEncoder()
final_df['class'] = le.fit_transform(final_df['class'])

final_df = remove_minus_9999(final_df, FILTER_COLS)
final_df = impute_u_by_class(final_df, target_col='class')
final_df = remove_outliers_iqr(final_df, features=FILTER_COLS + ['MJD', 'redshift'])

# Split data
X = final_df.drop(columns='class')
y = final_df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# ------------ Preprocessing pipeline ------------
scale_transform = ColumnTransformer([
    ('scale', StandardScaler(), make_column_selector(dtype_include='number'))
])

preprocess_pipe = Pipeline([
    ('feature_eng', FunctionTransformer(feature_engineer)),
    ('drop_cols', FunctionTransformer(lambda df: drop_columns(df, DROP_COLS))),
    ('scale', scale_transform),
    ('pca', PCA(n_components=0.95))  # Optional, remove if not needed
])

# ------------ Kernel-based models ------------
svc_rbf = Pipeline([
    ('preprocessing', preprocess_pipe),
    ('classifier', SVC(kernel='rbf', C=10, gamma='scale', probability=True))
])



# ------------ Add KNN (not kernel-based, but useful baseline) ------------
knn = Pipeline([
    ('preprocessing', preprocess_pipe),
    ('classifier', KNeighborsClassifier(n_neighbors=5))
])

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

stacking_clf = StackingClassifier(
    estimators=[
        ('svc', svc_rbf),
        ('knn', knn)
    ],
    final_estimator=LogisticRegression(),
    cv=5,
    n_jobs=-1
)

stacking_clf.fit(X_train, y_train)
y_pred = stacking_clf.predict(X_test)
print("Stacking Ensemble Report:\n")
print(classification_report(y_test, y_pred, target_names=le.classes_))


Stacking Ensemble Report:

              precision    recall  f1-score   support

      GALAXY       0.97      0.98      0.97      9515
         QSO       0.96      0.89      0.92      2395
        STAR       0.97      0.99      0.98      3449

    accuracy                           0.97     15359
   macro avg       0.96      0.95      0.96     15359
weighted avg       0.97      0.97      0.97     15359



In [16]:
df_test = pd.read_csv('test.csv')
test_prediction =grid_search.predict(df_test)
y_test_kaggle = pd.DataFrame(test_prediction, columns=["class"])
y_test_kaggle.index.name = "ID"
y_test_kaggle[['class']].to_csv("kaggle1.csv")

NotFittedError: This GridSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.