In [1]:
!pip install xgboost



In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
import warnings

In [2]:
x_df = pd.read_excel("x.xlsx")
y_df = pd.read_excel("y.xlsx")

In [3]:
x_df.head()

Unnamed: 0,Atmospheric Density,Surface Temperature,Gravity,Water Content,Mineral Abundance,Orbital Period,Proximity to Star,Magnetic Field Strength,Radiation Levels,Atmospheric Composition Index
0,-1.459426,2.890268,0.148757,-0.804439,0.494875,0.04491,-0.438796,6,10,0.407941
1,-2.971646,-0.648251,-0.915859,0.255504,-0.537165,-2.072251,1.355523,8,7,1.876232
2,-3.306354,-0.316716,-0.431264,0.389815,-1.961216,-1.510182,0.538593,8,7,0.934055
3,-0.752712,-2.492542,-1.072433,-2.561734,1.158838,-1.262638,1.447444,14,8,0.726009
4,1.129258,-3.333453,-4.423914,-1.020409,0.71129,-0.606784,0.047264,13,12,-1.630504


In [4]:
x_df.isnull().sum()

Atmospheric Density              0
Surface Temperature              0
Gravity                          0
Water Content                    0
Mineral Abundance                0
Orbital Period                   0
Proximity to Star                0
Magnetic Field Strength          0
Radiation Levels                 0
Atmospheric Composition Index    0
dtype: int64

## Applying Various Model

In [5]:
# Extract features and target
X = x_df  # Feature set
y = y_df["Prediction"]  # Target variable

# Split into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)



In [6]:


# Define models with fixed Logistic Regression parameters
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),  # Increased max_iter
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boost": GradientBoostingClassifier(),
    "Adaboost": AdaBoostClassifier(),
     "Xgboost":XGBClassifier()
}

for name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred)
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted')
    model_train_precision = precision_score(y_train, y_train_pred, average='weighted')
    model_train_recall = recall_score(y_train, y_train_pred, average='weighted')

    # Fix roc_auc_score for multiclass
    if len(set(y_train)) > 2:
        model_train_rocauc_score = roc_auc_score(y_train, model.predict_proba(X_train), multi_class='ovr')
        model_test_rocauc_score = roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovr')
    else:
        model_train_rocauc_score = roc_auc_score(y_train, y_train_pred)
        model_test_rocauc_score = roc_auc_score(y_test, y_test_pred)

    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred)
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted')
    model_test_precision = precision_score(y_test, y_test_pred, average='weighted')
    model_test_recall = recall_score(y_test, y_test_pred, average='weighted')

    print(f"{name}")
    print("Model performance for Training set")
    print(f"- Accuracy: {model_train_accuracy:.4f}")
    print(f"- F1 score: {model_train_f1:.4f}")
    print(f"- Precision: {model_train_precision:.4f}")
    print(f"- Recall: {model_train_recall:.4f}")
    print(f"- Roc Auc Score: {model_train_rocauc_score:.4f}")
    print('-' * 40)
    
    print("Model performance for Test set")
    print(f"- Accuracy: {model_test_accuracy:.4f}")
    print(f"- F1 score: {model_test_f1:.4f}")
    print(f"- Precision: {model_test_precision:.4f}")
    print(f"- Recall: {model_test_recall:.4f}")
    print(f"- Roc Auc Score: {model_test_rocauc_score:.4f}")
    print("=" * 50, "\n")


Logistic Regression
Model performance for Training set
- Accuracy: 0.7980
- F1 score: 0.7973
- Precision: 0.7970
- Recall: 0.7980
- Roc Auc Score: 0.9759
----------------------------------------
Model performance for Test set
- Accuracy: 0.8035
- F1 score: 0.8028
- Precision: 0.8027
- Recall: 0.8035
- Roc Auc Score: 0.9769

Decision Tree
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------------
Model performance for Test set
- Accuracy: 0.8149
- F1 score: 0.8149
- Precision: 0.8153
- Recall: 0.8149
- Roc Auc Score: 0.8960

Random Forest
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------------
Model performance for Test set
- Accuracy: 0.9046
- F1 score: 0.9046
- Precision: 0.9048
- Recall: 0.9046
- Roc Auc Score: 0.9931

Gradient Boost
Model performance f

In [7]:
dt_classifier = DecisionTreeClassifier()

In [8]:
dt_classifier.fit(X_train, y_train) 

In [9]:
  # Make predictions
y_train_pred = dt_classifier.predict(X_train)
y_test_pred = dt_classifier.predict(X_test)

In [10]:
# Training set performance
model_train_accuracy = accuracy_score(y_train, y_train_pred)
model_train_f1 = f1_score(y_train, y_train_pred, average='weighted')
model_train_precision = precision_score(y_train, y_train_pred, average='weighted')
model_train_recall = recall_score(y_train, y_train_pred, average='weighted')

# Fix roc_auc_score for multiclass
if len(set(y_train)) > 2:
        model_train_rocauc_score = roc_auc_score(y_train, model.predict_proba(X_train), multi_class='ovr')
        model_test_rocauc_score = roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovr')
else:
        model_train_rocauc_score = roc_auc_score(y_train, y_train_pred)
        model_test_rocauc_score = roc_auc_score(y_test, y_test_pred)

# Test set performance
model_test_accuracy = accuracy_score(y_test, y_test_pred)
model_test_f1 = f1_score(y_test, y_test_pred, average='weighted')
model_test_precision = precision_score(y_test, y_test_pred, average='weighted')
model_test_recall = recall_score(y_test, y_test_pred, average='weighted')

print(f"{name}")
print("Model performance for Training set")
print(f"- Accuracy: {model_train_accuracy:.4f}")
print(f"- F1 score: {model_train_f1:.4f}")
print(f"- Precision: {model_train_precision:.4f}")
print(f"- Recall: {model_train_recall:.4f}")
print(f"- Roc Auc Score: {model_train_rocauc_score:.4f}")
print('-' * 40)
    
print("Model performance for Test set")
print(f"- Accuracy: {model_test_accuracy:.4f}")
print(f"- F1 score: {model_test_f1:.4f}")
print(f"- Precision: {model_test_precision:.4f}")
print(f"- Recall: {model_test_recall:.4f}")
print(f"- Roc Auc Score: {model_test_rocauc_score:.4f}")
print("=" * 50, "\n")

Xgboost
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------------
Model performance for Test set
- Accuracy: 0.8146
- F1 score: 0.8144
- Precision: 0.8145
- Recall: 0.8146
- Roc Auc Score: 0.9950



In [11]:

import pickle

# Save the trained Decision Tree model to a pickle file
model_path = "decision_tree_model.pkl"

with open(model_path, "wb") as model_file:
    pickle.dump(dt_classifier, model_file)

print(f"Model saved as {model_path}")


Model saved as decision_tree_model.pkl
