<a href="https://colab.research.google.com/github/RafaaAli/Beyondinfinity/blob/main/K2_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Install the notebook's dependencies with the %pip magic so the packages land
# in the environment of the running kernel rather than whichever `pip`
# executable happens to be first on the shell PATH.
%pip install pandas numpy scikit-learn xgboost lightgbm imbalanced-learn matplotlib seaborn requests



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import lightgbm as lgb
import joblib

In [None]:
# Load the K2 cumulative dataset exported from the NASA Exoplanet Archive.
# Lines beginning with '#' are treated as comments, and rows that cannot be
# parsed are skipped instead of raising.
df = pd.read_csv(
    '/content/k2pandc_2025.10.03_23.01.39.csv',
    on_bad_lines='skip',
    comment='#',
)

In [None]:

# Report the dimensions of the loaded table, framed by two 80-character rules.
rule = "=" * 80
print(
    rule,
    f"Dataset shape: {df.shape}",
    f"Columns: {df.shape[1]}, Rows: {df.shape[0]}",
    rule,
    sep="\n",
)

Dataset shape: (4758, 129)
Columns: 129, Rows: 4758


In [None]:
# Locate the disposition (label) column: the first candidate name that is
# actually present in the loaded table wins.
possible_targets = ['k2_disposition', 'disposition', 'k2c_disp', 'k2_disp',
                    'koi_disposition', 'pl_disposition']
target_col = next((col for col in possible_targets if col in df.columns), None)

if not target_col:
    raise ValueError(f"Target column not found. Available columns: {df.columns.tolist()}")

print(f"\nTarget: {target_col}")
print(df[target_col].value_counts())

# REFUTED rows are removed so that only the remaining dispositions are modelled.
df = df[df[target_col] != 'REFUTED']
print(f"\nAfter dropping REFUTED:")
print(df[target_col].value_counts())

# Define feature groups
# NOTE(review): this mapping is not referenced anywhere else in this cell, and
# the k2_* names do not appear in the numeric feature list printed below --
# confirm whether it is still needed.
feature_groups = {
    'transit': ['k2_period', 'k2_duration', 'k2_depth', 'k2_rprs'],
    'orbital': ['k2_sma', 'k2_incl', 'k2_eccen', 'k2_impact'],
    'planetary': ['k2_prad', 'k2_teq', 'k2_insol'],
    'stellar': ['k2_srad', 'k2_steff', 'k2_slogg', 'k2_smet', 'k2_kepmag', 'k2_smass'],
    'quality': ['k2_sage', 'ra', 'dec']
}

# Downsample the CONFIRMED class to at most 2000 rows. The sample size is
# capped at the number of rows actually available: DataFrame.sample raises
# ValueError when n exceeds the population (without replacement), which would
# break this cell on a table with fewer CONFIRMED rows.
max_confirmed = 2000
confirmed_pool = df[df[target_col] == 'CONFIRMED']
n_confirmed = min(max_confirmed, len(confirmed_pool))
confirmed_df = confirmed_pool.sample(n=n_confirmed, random_state=42)
other_df = df[df[target_col] != 'CONFIRMED']

# Combine
# NOTE: `df` is rebound here, so later cells see the downsampled table and the
# originally loaded frame is no longer reachable under this name.
df = pd.concat([confirmed_df, other_df], ignore_index=True)

print(f"\nAfter downsampling CONFIRMED to {n_confirmed}:")
print(df[target_col].value_counts())

# Exclude non-feature columns
exclude_cols = [target_col, 'rowid', 'pl_name', 'hostname', 'epic_name',
                'sy_snum', 'sy_pnum', 'disc_year', 'disc_facility']

# Select ALL numeric columns as features
# NOTE(review): this picks up archive flag columns such as default_flag and
# pl_controv_flag (see the printed list); confirm they do not leak the
# disposition label into the features.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
available_features = [col for col in numeric_cols if col not in exclude_cols]

print(f"\n{'=' * 80}")
print(f"USING {len(available_features)} NUMERIC FEATURES:")
print(f"{'=' * 80}")
for i, feat in enumerate(available_features, 1):
    print(f"{i:2}. {feat}")





Target: disposition
disposition
CONFIRMED         2000
CANDIDATE         1374
FALSE POSITIVE     292
REFUTED             43
Name: count, dtype: int64

After dropping REFUTED:
disposition
CONFIRMED         2000
CANDIDATE         1374
FALSE POSITIVE     292
Name: count, dtype: int64

After downsampling CONFIRMED to 2000:
disposition
CONFIRMED         2000
CANDIDATE         1374
FALSE POSITIVE     292
Name: count, dtype: int64

USING 88 NUMERIC FEATURES:
 1. default_flag
 2. sy_mnum
 3. rv_flag
 4. pul_flag
 5. ptv_flag
 6. tran_flag
 7. ast_flag
 8. obm_flag
 9. micro_flag
10. etv_flag
11. ima_flag
12. dkin_flag
13. pl_controv_flag
14. pl_orbper
15. pl_orbsmax
16. pl_rade
17. pl_radj
18. pl_masse
19. pl_massj
20. pl_msinie
21. pl_bmasse
22. pl_bmassj
23. pl_dens
24. pl_orbeccen
25. pl_insol
26. pl_eqt
27. pl_orbincl
28. pl_tranmid
29. ttv_flag
30. pl_imppar
31. pl_trandep
32. pl_trandur
33. pl_ratdor
34. pl_ratror
35. pl_occdep
36. pl_orbtper
37. pl_orblper
38. pl_rvamp
39. pl_projobliq

In [None]:
# Build the modelling table: the target column followed by every selected
# numeric feature. `.copy()` detaches it from `df`.
df_model = df[[target_col] + available_features].copy()

# Remove rows with missing target
df_model = df_model.dropna(subset=[target_col])

# Check missing values (percentage of null entries per column)
missing_pct = (df_model.isnull().sum() / len(df_model)) * 100
print(f"\nMissing value percentages:")
print(missing_pct[missing_pct > 0].sort_values(ascending=False))

# Drop columns with >50% missing values (the target itself is never dropped)
high_missing = missing_pct[missing_pct > 50].index.tolist()
if target_col in high_missing:
    high_missing.remove(target_col)
if high_missing:
    df_model = df_model.drop(columns=high_missing)
    print(f"\nDropped {len(high_missing)} columns with >50% missing values")

# Separate features and target
X = df_model.drop(columns=[target_col])
y = df_model[target_col]

print(f"\nFinal feature count: {X.shape[1]}")
print(f"Final dataset size: {X.shape[0]} rows")

# Impute remaining missing values with the per-column median.
# This is done as a single whole-frame assignment: the previous
# `X[col].fillna(..., inplace=True)` form is a chained in-place update on a
# column selection, which emits a FutureWarning on recent pandas and does not
# modify `X` at all once Copy-on-Write is in effect.
# NOTE(review): the medians are computed over all rows, i.e. before the
# train/test split performed in a later cell, so test rows influence the
# imputed values.
X = X.fillna(X.median())



Missing value percentages:
sy_icmag           100.000000
pl_occdep           99.972722
pl_trueobliq        99.809056
pl_orbtper          99.427169
pl_projobliq        99.372613
                      ...    
pul_flag             0.681942
pl_controv_flag      0.681942
tran_flag            0.681942
ast_flag             0.681942
obm_flag             0.681942
Length: 87, dtype: float64

Dropped 31 columns with >50% missing values

Final feature count: 57
Final dataset size: 3666 rows


In [None]:
# Encode target
# Map the string disposition labels in `y` to integer codes. The fitted
# encoder `le` is kept so that codes can be translated back to label names;
# `le.classes_` lists the labels (printed below).
le = LabelEncoder()
y_encoded = le.fit_transform(y)
print(f"\nTarget classes: {le.classes_}")


Target classes: ['CANDIDATE' 'CONFIRMED' 'FALSE POSITIVE']


In [None]:
# ============================================================================
# 6. TRAIN-TEST SPLIT
# ============================================================================
# Hold out 25% of the rows for evaluation. Stratifying on the encoded labels
# keeps the class proportions the same in both partitions, and the fixed seed
# makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    stratify=y_encoded,
    test_size=0.25,
    random_state=42,
)

print(f"\nTrain size: {X_train.shape[0]}, Test size: {X_test.shape[0]}")



Train size: 2749, Test size: 917


In [None]:
# ============================================================================
# 7. FEATURE SCALING
# ============================================================================
# The scaler's statistics are learned from the training rows only and then
# applied unchanged to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ============================================================================
# 8. CLASS IMBALANCE CHECK
# ============================================================================
# Ratio of the largest to the smallest class count in the training labels.
# Only the ratio is computed here; SMOTE is not invoked in this cell.
class_counts = np.bincount(y_train)
imbalance_ratio = class_counts.max() / class_counts.min()


np.float64(6.8493150684931505)

In [None]:
def evaluate_model(name, model, X_test, y_test, y_pred):
    """Print accuracy, weighted precision/recall/F1 and a per-class report.

    Parameters
    ----------
    name : str
        Heading printed above the metrics.
    model, X_test
        Accepted for call-site compatibility; not used by this function.
    y_test : array-like
        True integer-encoded labels.
    y_pred : array-like
        Predicted integer-encoded labels.

    Notes
    -----
    Class names come from the notebook-level label encoder ``le``. The
    per-class report is given the full set of encoded labels explicitly, so
    it still renders (instead of raising a length-mismatch ValueError) when a
    class is absent from both ``y_test`` and ``y_pred``.
    """
    class_names = le.classes_
    # Encoded labels are the positions of the names in `le.classes_`.
    class_ids = list(range(len(class_names)))

    print(f"\n{'=' * 80}")
    print(f"{name} PERFORMANCE")
    print(f"{'=' * 80}")
    print(f"Accuracy:  {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"Recall:    {recall_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"F1-Score:  {f1_score(y_test, y_pred, average='weighted'):.4f}")
    report = classification_report(
        y_test, y_pred, labels=class_ids, target_names=class_names
    )
    print(f"\n{report}")


In [None]:
# --- Random Forest ---
# Fit a class-weighted random forest on the scaled training data, report its
# test-set metrics, and persist the fitted estimator to disk.
print("\n[1/3] Training Random Forest...")
rf_model = RandomForestClassifier(
    # ensemble size and per-split feature sampling
    n_estimators=100,
    max_features='sqrt',
    # tree-growth limits
    max_depth=20,
    min_samples_leaf=5,
    min_samples_split=10,
    # weight classes inversely to their frequency
    class_weight='balanced',
    # out-of-bag scoring is enabled, but the score is not printed in this cell
    oob_score=True,
    n_jobs=-1,
    random_state=42,
).fit(X_train_scaled, y_train)

y_pred_rf = rf_model.predict(X_test_scaled)
evaluate_model("RANDOM FOREST", rf_model, X_test_scaled, y_test, y_pred_rf)

joblib.dump(rf_model, 'random_forest_model.pkl')
print("✓ Saved: random_forest_model.pkl")



[1/3] Training Random Forest...

RANDOM FOREST PERFORMANCE
Accuracy:  0.9084
Precision: 0.9073
Recall:    0.9084
F1-Score:  0.9077

                precision    recall  f1-score   support

     CANDIDATE       0.89      0.87      0.88       344
     CONFIRMED       0.94      0.96      0.95       500
FALSE POSITIVE       0.80      0.75      0.77        73

      accuracy                           0.91       917
     macro avg       0.87      0.86      0.87       917
  weighted avg       0.91      0.91      0.91       917

✓ Saved: random_forest_model.pkl


In [None]:
# --- XGBoost ---
# Fit a gradient-boosted tree classifier on the scaled training data, report
# its test-set metrics, and persist the fitted estimator to disk.
print("\n[2/3] Training XGBoost...")
xgb_model = xgb.XGBClassifier(
    # boosting schedule
    n_estimators=100,
    learning_rate=0.1,
    # tree shape
    max_depth=6,
    min_child_weight=3,
    gamma=0.1,
    # row / column subsampling per tree
    subsample=0.8,
    colsample_bytree=0.8,
    # L1 / L2 regularisation
    reg_alpha=0.1,
    reg_lambda=1,
    eval_metric='mlogloss',
    n_jobs=-1,
    random_state=42,
).fit(X_train_scaled, y_train)

y_pred_xgb = xgb_model.predict(X_test_scaled)
evaluate_model("XGBOOST", xgb_model, X_test_scaled, y_test, y_pred_xgb)

joblib.dump(xgb_model, 'xgboost_model.pkl')
print("✓ Saved: xgboost_model.pkl")



[2/3] Training XGBoost...

XGBOOST PERFORMANCE
Accuracy:  0.9248
Precision: 0.9250
Recall:    0.9248
F1-Score:  0.9232

                precision    recall  f1-score   support

     CANDIDATE       0.92      0.89      0.90       344
     CONFIRMED       0.93      0.98      0.95       500
FALSE POSITIVE       0.95      0.73      0.82        73

      accuracy                           0.92       917
     macro avg       0.93      0.86      0.89       917
  weighted avg       0.93      0.92      0.92       917

✓ Saved: xgboost_model.pkl


In [None]:
# --- LightGBM ---
# Fit a LightGBM classifier on the scaled training data, report its test-set
# metrics, and persist the fitted estimator to disk.
print("\n[3/3] Training LightGBM...")
lgb_model = lgb.LGBMClassifier(
    # boosting schedule
    n_estimators=100,
    learning_rate=0.1,
    # tree shape
    max_depth=6,
    num_leaves=31,
    min_child_samples=20,
    # row / column subsampling per tree
    subsample=0.8,
    colsample_bytree=0.8,
    # L1 / L2 regularisation
    reg_alpha=0.1,
    reg_lambda=1,
    n_jobs=-1,
    random_state=42,
    # silence training log output
    verbose=-1,
).fit(X_train_scaled, y_train)

y_pred_lgb = lgb_model.predict(X_test_scaled)
evaluate_model("LIGHTGBM", lgb_model, X_test_scaled, y_test, y_pred_lgb)

joblib.dump(lgb_model, 'lightgbm_model.pkl')
print("✓ Saved: lightgbm_model.pkl")


[3/3] Training LightGBM...

LIGHTGBM PERFORMANCE
Accuracy:  0.9280
Precision: 0.9281
Recall:    0.9280
F1-Score:  0.9266

                precision    recall  f1-score   support

     CANDIDATE       0.93      0.89      0.91       344
     CONFIRMED       0.93      0.98      0.96       500
FALSE POSITIVE       0.93      0.74      0.82        73

      accuracy                           0.93       917
     macro avg       0.93      0.87      0.90       917
  weighted avg       0.93      0.93      0.93       917

✓ Saved: lightgbm_model.pkl


In [None]:
# ============================================================================
# 10. SAVE PREPROCESSING OBJECTS
# ============================================================================
# Persist the fitted label encoder and scaler alongside the saved models.
joblib.dump(le, 'label_encoder.pkl')
joblib.dump(scaler, 'scaler.pkl')
print("\n Saved: scaler.pkl")
print(" Saved: label_encoder.pkl")

# Store the feature column names, in the order they appear in X, for later use.
joblib.dump(list(X.columns), 'feature_names.pkl')
print(" Saved: feature_names.pkl")



 Saved: scaler.pkl
 Saved: label_encoder.pkl
 Saved: feature_names.pkl
