In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from scipy.sparse import save_npz
import os

# Step 1: Load the Excel data.
# The path is relative to the current working directory, so a missing file
# usually means the script was started from the wrong place.
try:
    df = pd.read_excel('../datathon-project/data/Satellite_data.xlsx')
except FileNotFoundError:
    print("Error: The file 'Satellite_data.xlsx' was not found.")
    print("Please make sure you are in the correct directory.")
    # Nothing below can run without the data, so stop here. SystemExit is
    # raised directly (instead of calling the interactive-only `exit()`
    # helper injected by the `site` module) and carries a non-zero status so
    # callers can tell that the run failed.
    raise SystemExit(1)

# Step 2: Build a single text field from the two TLE lines.
# Missing lines are replaced by an empty string first so the concatenation
# below never meets a NaN.
df['TLE_line_1'] = df['TLE_LINE_1'].fillna(value='')
df['TLE_line_2'] = df['TLE_LINE_2'].fillna(value='')
# Line 1, one separating space, then line 2.
df['TLE_data'] = df['TLE_line_1'].add(' ').add(df['TLE_line_2'])

# Step 3: Turn the combined TLE text into TF-IDF features.
# The vocabulary is capped at 10000 terms; the result of fit_transform is
# kept in `tle_features_sparse` for stacking with the other feature groups.
vectorizer = TfidfVectorizer(
    max_features=10000,
)
tle_corpus = df['TLE_data']
tle_features_sparse = vectorizer.fit_transform(tle_corpus)

# Step 4: One-hot encode the categorical columns.
# `sparse=True` asks pandas to back the indicator columns with sparse arrays;
# the result is still a DataFrame.
categorical_features = ['country_code', 'object_type']
df_categorical = df[categorical_features]
df_categorical_dummies = pd.get_dummies(
    df_categorical,
    columns=categorical_features,
    sparse=True,
)

# Step 5: Wrap the numerical columns in a SciPy CSR matrix so they can be
# stacked next to the other sparse feature groups.
# NOTE(review): unlike the TLE columns, no missing-value handling is applied
# here - confirm that NaN in these columns is acceptable downstream.
from scipy.sparse import csr_matrix

numerical_features = ['period', 'inclination']
df_numerical = df[numerical_features]
numerical_features_sparse = csr_matrix(df_numerical.to_numpy())

# Step 6: Horizontally stack all feature groups into a single sparse matrix.
# Column order: TF-IDF terms, one-hot categorical indicators, numerical values.
# The one-hot frame is a pandas DataFrame with sparse columns, not a SciPy
# matrix. It is converted explicitly through the `.sparse` accessor so it
# stays sparse; handing the DataFrame itself to hstack leaves the conversion
# to SciPy, which may materialise it as a dense array first.
categorical_features_sparse = df_categorical_dummies.sparse.to_coo()
all_features_matrix = hstack([
    tle_features_sparse,
    categorical_features_sparse,
    numerical_features_sparse,
])

# Step 7: Persist the stacked feature matrix as a SciPy .npz file.
# Saving is best-effort: any failure is reported on stdout rather than
# aborting the script.
output_path = '../datathon-project/data/preprocessed_satellite_data.npz'
try:
    save_npz(output_path, all_features_matrix)
except Exception as save_error:
    print(f"Error saving file: {save_error}")
else:
    print(f"Pre-processed data saved successfully to '{output_path}'")