In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from scipy.stats import zscore

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder,StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


In [8]:

def remove_minus_9999(df, features):
    return df[~(df[features] == -9999).any(axis=1)].copy()

def impute_u_by_class(df, target_col='class'):
    means = df.groupby(target_col)['u'].mean()
    for cls, mean_val in means.items():
        mask = (df[target_col] == cls) & (df['u'].isnull())
        df.loc[mask, 'u'] = mean_val
    return df

def remove_outliers_iqr(df, features, threshold=2.5):
    for col in features:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - threshold * IQR
        upper = Q3 + threshold * IQR
        df = df[(df[col] >= lower) & (df[col] <= upper)]
    return df.copy()

def feature_engineer(df):
    df = df.copy()
    df['u_g'] = df['u'] - df['g']
    df['g_r'] = df['g'] - df['r']
    df['r_i'] = df['r'] - df['i']
    df['i_z'] = df['i'] - df['z']
    return df

def drop_columns(df, cols):
    return df.drop(columns=cols, errors='ignore')

DROP_COLS = ['rerun_ID', 'obj_ID', 'spec_obj_ID']
FILTER_COLS = ['u', 'g', 'r', 'i', 'z']



In [9]:
final_df = pd.read_csv('train.csv')

le = LabelEncoder()
final_df['class'] = le.fit_transform(final_df['class'])

final_df = remove_minus_9999(final_df, FILTER_COLS)
final_df = impute_u_by_class(final_df, target_col='class')
final_df = remove_outliers_iqr(final_df, features=FILTER_COLS + ['MJD', 'redshift'])

In [10]:
# Split data
X = final_df.drop(columns='class')
y = final_df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [11]:
scale_transform = ColumnTransformer([
    ('scale', StandardScaler(), make_column_selector(dtype_include='number'))
])

preprocess_pipe = Pipeline([
    ('feature_eng', FunctionTransformer(feature_engineer)),
    ('drop_cols', FunctionTransformer(lambda df: drop_columns(df, DROP_COLS))),
    ('scale', scale_transform)
])


In [None]:
# === Pipeline 1: PCA + SVC ===
pipe_svc = Pipeline([
    ('preprocessing', preprocess_pipe),
    ('pca', PCA(n_components=0.95)),
    ('svc', SVC(kernel='rbf', probability=True, random_state=42))
])

params_svc = {
    'svc__C': [1, 5, 10],
    'svc__gamma': ['scale', 'auto']
}

grid_svc = GridSearchCV(pipe_svc, params_svc, cv=5, scoring='f1_macro', n_jobs=-1, verbose=1)
grid_svc.fit(X_train, y_train)
print("Best PCA+SVC Score:", grid_svc.best_score_)
print("Best PCA+SVC Params:", grid_svc.best_params_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best PCA+SVC Score: 0.9587121804674691
Best PCA+SVC Params: {'svc__C': 10, 'svc__gamma': 'scale'}


This gets 0.959 on Kaggle.

In [None]:
# === Pipeline 2: Feature Selection + Logistic Regression ===
# Pipeline with regularized Logistic Regression
pipe_lr = Pipeline([
    ('preprocessing', preprocess_pipe),
    ('lr', LogisticRegression(max_iter=1000, solver='saga', random_state=42)),

])

# Grid: regularization strength + penalty type
params_lr = {
    'lr__C': [0.01, 0.1, 1.0, 10],
    'lr__penalty': ['l1', 'l2']
}

# GridSearch with macro F1 scoring
grid_lr = GridSearchCV(pipe_lr, param_grid=params_lr, cv=5, scoring='f1_macro', n_jobs=-1)
grid_lr.fit(X_train, y_train)

# Results
print("Best Logistic Regression F1 Macro:", grid_lr.best_score_)
print("Best Parameters:", grid_lr.best_params_)

Best Logistic Regression F1 Macro: 0.947074130539957
Best Parameters: {'lr__C': 10, 'lr__penalty': 'l1'}




Score 0.955

In [None]:
# === Base classifiers (with tuned parameters)
logreg = LogisticRegression(max_iter=1000, solver='saga', C=10, penalty='l1', random_state=42)
dt = DecisionTreeClassifier(max_depth=8, min_samples_split=4, min_samples_leaf=2, random_state=42)
rf = RandomForestClassifier(n_estimators=150, max_depth=16, min_samples_split=2, random_state=42, n_jobs=-1)
svm = SVC(kernel='rbf', C=10, gamma='scale', probability=True, random_state=42)

# === Voting Classifier (hard voting)
vote_clf = VotingClassifier(
    estimators=[('lr', logreg), ('dt', dt), ('rf', rf), ('svm', svm)],
    voting='hard',
    n_jobs=-1
)

# === Pipeline
pipeline_vote = Pipeline([
    ('pre', preprocess_pipe),
    ('model', vote_clf)
])

# === Fit and evaluate
pipeline_vote.fit(X, y)
y_pred = pipeline_vote.predict(X)
f1 = f1_score(y, y_pred, average='macro')
print("VotingClassifier F1 Macro (on full data):", f1)

VotingClassifier Best F1 Macro: 0.9671512267609101
Best Parameters: {'model__voting': 'hard'}


This gets 0.974 on kaggle.

In [34]:
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold

# === Define Base Estimators ===
logreg = LogisticRegression(max_iter=1000, solver='saga', C=10, penalty='l1', random_state=42)
dt = DecisionTreeClassifier(max_depth=8, min_samples_split=4, min_samples_leaf=2, random_state=42)
rf = RandomForestClassifier(n_estimators=150, max_depth=16, min_samples_split=2, random_state=42, n_jobs=-1)
svm = SVC(kernel='rbf', C=10, gamma='scale', probability=True, random_state=42)


# === Define Stacking Classifier ===
stack = StackingClassifier(
    estimators=[
        ('lr', logreg),
        ('dt', dt),
        ('rf', rf),
        ('svm', svm)
    ],
    final_estimator=LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42),
    passthrough=False,
    cv=3,
    n_jobs=-1
)

# === Pipeline with Preprocessing + Model ===
pipeline = Pipeline([
    ('pre', preprocess_pipe),  
    ('model', stack)
])

pipeline.fit(X,y)
y_pred = pipeline.predict(X_test)
f1 = f1_score(y_test, y_pred, average='macro')
print("VotingClassifier F1 Macro (on full data):", f1)

VotingClassifier F1 Macro (on full data): 0.9908211896764384


score 0.977

In [33]:
# === Pipelines with Regularized Models
pipe_lr = Pipeline([
    ('pre', preprocess_pipe),
    ('clf', LogisticRegression(max_iter=1000, solver='saga', C=10, penalty='l1', random_state=42))
])

pipe_svm = Pipeline([
    ('pre', preprocess_pipe),
    ('clf', SVC(kernel='rbf', C=10, gamma='scale', probability=True, random_state=42))
])

# === Soft Voting Classifier
vote = VotingClassifier(
    estimators=[('lr', pipe_lr), ('svm', pipe_svm)],
    voting='soft',
    n_jobs=-1
)

# === Fit and Evaluate
vote.fit(X, y)
y_pred = vote.predict(X)
f1 = f1_score(y, y_pred, average='macro')
print("VotingClassifier F1 Macro (on full data):", f1)

VotingClassifier F1 Macro (on full data): 0.9649428341669468


score 0.967


In [32]:
from sklearn.ensemble import GradientBoostingClassifier
# === Gradient Boosting Pipeline
gb_pipe = Pipeline([
    ('pre', preprocess_pipe),
    ('gb', GradientBoostingClassifier(random_state=42))
])

param_grid = {
    'gb__n_estimators': [100, 200],
    'gb__max_depth': [3, 5]
}

grid_gb = GridSearchCV(gb_pipe, param_grid=param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
grid_gb.fit(X_train, y_train)

print("Best F1 Macro:", grid_gb.best_score_)
print("Best Params:", grid_gb.best_params_)

Best F1 Macro: 0.97159078833285
Best Params: {'gb__max_depth': 5, 'gb__n_estimators': 200}


score 0.975

In [42]:
df_test = pd.read_csv('test.csv')
test_prediction = pipeline.predict(df_test)
y_test_kaggle = pd.DataFrame(test_prediction, columns=["class"])
y_test_kaggle.index.name = "ID"
y_test_kaggle[['class']].to_csv("kaggle1.csv")