In [13]:
import numpy as np
from sklearn.metrics.pairwise import cosine_distances
import plotly.graph_objects as go
import pandas as pd

In [14]:
# Read the exported CSV part-file into a DataFrame.
# The path is relative to the notebook's working directory.
file_path = "Resources/part-00000-3d57ee90-8dc9-4f89-97e6-768aa0ffce3c-c000.csv"  # Update with your actual file path
df = pd.read_csv(filepath_or_buffer=file_path)

In [15]:
# Min-max scale each feature column to [0, 1] before computing cosine similarity.
features = ['Star_Temperature_K', 'Star_Radius_Solar', 'Star_Mass_Solar']
X = df[features].values

feature_min = X.min(axis=0)
feature_range = X.max(axis=0) - feature_min
# A constant column has zero range; dividing by it would turn the whole column
# into NaN (0/0). Substitute 1 so such a column scales to all zeros instead.
feature_range = np.where(feature_range == 0, 1, feature_range)

# NOTE(review): missing values are not handled here. min()/max() propagate NaN,
# so a single NaN in a feature makes that entire column NaN -- confirm the CSV
# has no gaps in these three columns.
X_normalized = (X - feature_min) / feature_range

In [16]:
def rank_by_similarity(X_normalized, target_index=0, names=None):
    """Rank every row of ``X_normalized`` by cosine similarity to one target row.

    Parameters
    ----------
    X_normalized : ndarray of shape (n_samples, n_features)
        Feature matrix, one row per sample.
    target_index : int, default 0
        Positional index of the row that all rows are compared against.
    names : sequence or pandas.Series, optional
        One label per row of ``X_normalized``, in the same order. When omitted,
        the notebook-level ``df['Host_Star']`` column is used.

    Returns
    -------
    list of (name, similarity) tuples
        Ordered from most to least similar, where similarity is
        ``1 - cosine distance``. The target row itself is part of the ranking.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of rows, since the
        labels are matched to rows purely by position.
    """
    if names is None:
        names = df['Host_Star']
    names = pd.Series(names)

    n_rows = X_normalized.shape[0]
    if len(names) != n_rows:
        raise ValueError(
            f"names has {len(names)} entries but X_normalized has {n_rows} rows"
        )

    # cosine_distances expects 2-D inputs, so the target row is reshaped to (1, n_features).
    target_row = X_normalized[target_index].reshape(1, -1)
    distances = cosine_distances(target_row, X_normalized)[0]

    # Ascending distance == descending similarity. A stable sort keeps tied rows
    # in their original order so the ranking is reproducible.
    order = np.argsort(distances, kind='stable')
    similarities = 1 - distances[order]

    return list(zip(names.iloc[order], similarities))


In [17]:

def plot_similarity_ranking(rankings, top_n=None):
    """Show an interactive Plotly bar chart of similarity scores.

    Parameters
    ----------
    rankings : list of (name, similarity) tuples
        Rows to plot, in the order they should appear on the x axis.
    top_n : int, optional
        When given, only the first ``top_n`` rows of ``rankings`` are plotted.
        By default every row is plotted.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``. It is intentionally not
        returned, so calling this as the last expression of a cell does not
        display the chart twice.
    """
    # Convert rankings to DataFrame for easier plotting with Plotly
    rankings_df = pd.DataFrame(rankings, columns=['Host_Star', 'Similarity_Score'])
    if top_n is not None:
        rankings_df = rankings_df.head(top_n)

    # Format once so bar labels and hover text show the same 3-decimal value
    # instead of raw full-precision floats on the bars.
    score_labels = [f"{score:.3f}" for score in rankings_df['Similarity_Score']]

    # Interactive Plot with Plotly
    fig = go.Figure(data=[
        go.Bar(
            x=rankings_df['Host_Star'],
            y=rankings_df['Similarity_Score'],
            text=score_labels,
            textposition='auto',
            hoverinfo='text+x',
            hovertext=[f"Similarity: {label}" for label in score_labels]
        )
    ])

    # Update layout for better readability
    fig.update_layout(
        title="Star Similarity Ranking",
        xaxis_title="Stars",
        yaxis_title="Similarity Score",
        xaxis_tickangle=-45,
        height=600
    )

    # Show the plot
    fig.show()

In [18]:
# Example usage: rank every star against the first row (index 0) of the
# normalized feature matrix, then draw the ranking as a bar chart.
rankings = rank_by_similarity(X_normalized, target_index=0)
plot_similarity_ranking(rankings)