# In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler

# Load the dataset from the CSV file in the working directory.
data = pd.read_csv('Space_data1.csv')

# Missing RCS_SIZE values are kept as their own "Unknown" category instead of
# being dropped, so those rows remain available for training.
data["RCS_SIZE"] = data["RCS_SIZE"].fillna("Unknown")

# Remove rows where OBJECT_TYPE is 'TBA'.
# .copy() makes the filtered frame independent of the original one, so the
# column assignments performed later on `data` act on a real DataFrame rather
# than on a slice (avoids pandas' SettingWithCopyWarning / chained-assignment
# ambiguity).
data = data[data['OBJECT_TYPE'] != 'TBA'].copy()

# Integer-encode the categorical columns, keeping one fitted LabelEncoder per
# column so the mapping can be reused (the dict is pickled further down).
categorical_columns = ['OBJECT_TYPE', 'RCS_SIZE', 'SITE']
label_encoders = {name: LabelEncoder() for name in categorical_columns}

for name, encoder in label_encoders.items():
    # Fit on the column's current values and overwrite it with the codes.
    data[name] = encoder.fit_transform(data[name])

# Parse 'EPOCH' (format inferred by pandas) and 'LAUNCH_DATE' (year only).
epoch_parsed = pd.to_datetime(data['EPOCH'])
launch_parsed = pd.to_datetime(data['LAUNCH_DATE'], format='%Y')

# Replace the two datetime columns with integer Unix timestamps (seconds since
# 1970). The integer view of the parsed values is divided by 10**9, i.e. this
# assumes pandas stores them with nanosecond resolution -- TODO confirm for
# the pandas version in use.
data = data.assign(
    EPOCH_TIMESTAMP=epoch_parsed.astype('int64') // 10**9,
    LAUNCH_DATE_TIMESTAMP=launch_parsed.astype('int64') // 10**9,
).drop(columns=['EPOCH', 'LAUNCH_DATE'])

# Feature subset used for training (per the original note: chosen by mutual
# information).
selected_features = [
    'SITE',
    'INCLINATION',
    'LAUNCH_DATE_TIMESTAMP',
    'SEMIMAJOR_AXIS',
    'MEAN_MOTION',
    'PERIOD',
    'APOAPSIS',
    'PERIAPSIS',
    'ECCENTRICITY',
    'RCS_SIZE',
]

# Fail fast, naming the first selected feature that the DataFrame lacks.
for feature in selected_features:
    if feature in data.columns:
        continue
    raise ValueError(f"Feature '{feature}' is not in the dataset.")

# Scaler for the numerical columns; it is fitted later and pickled at the end.
scaler = StandardScaler()

# Apply scaling to numerical features
numerical_columns = [
    'INCLINATION', 'LAUNCH_DATE_TIMESTAMP', 'SEMIMAJOR_AXIS', 'MEAN_MOTION',
    'PERIOD', 'APOAPSIS', 'PERIAPSIS', 'ECCENTRICITY'
]

# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split, so test-set statistics influence the scaling. This is left
# unchanged so that `data` and the pickled scaler stay the same as before;
# consider fitting on the training rows only.
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Set 'OBJECT_TYPE' as the target variable and selected features as the feature set
X = data[selected_features]  # Features
y = data['OBJECT_TYPE']  # Target variable

# Split BEFORE balancing. Random oversampling duplicates minority-class rows;
# if it ran before the split, copies of the same row would land in both the
# training and the test set and the evaluation would be inflated by leakage.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Balance the classes of the training portion only; the test set keeps the
# original class distribution.
ros = RandomOverSampler(random_state=42)
X_bal, y_bal = ros.fit_resample(X_train, y_train)
X_train, y_train = X_bal, y_bal

# Define and train the Random Forest model. A fixed seed makes the run
# reproducible, consistent with the other randomised steps above.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the untouched test set and report per-class metrics.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


# Persist the trained model, the per-column label encoders and the fitted
# scaler, each to its own pickle file in the working directory.
artifacts = (
    ('random_forest_model.pkl', model),
    ('label_encoders.pkl', label_encoders),
    ('scaler.pkl', scaler),
)

for filename, artifact in artifacts:
    with open(filename, 'wb') as sink:
        pickle.dump(artifact, sink)

