# In [26]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler

def preprocess_new_data(file_path, scaler=None):
    """Load a customer CSV and return the scaled clustering features.

    The 'Genre' column (if present) is encoded as Male -> 1, Female -> 0
    (values outside that mapping become NaN), and the 'CustomerID' column
    (if present) is dropped. Only 'Annual Income (k$)' and
    'Spending Score (1-100)' are scaled and returned as features.

    Args:
        file_path: Path (or other source accepted by ``pandas.read_csv``)
            of the CSV file to load.
        scaler: Optional pre-fitted scaler exposing ``transform``. Pass the
            scaler that was fitted on the training data so that new rows
            are scaled consistently with what the saved model saw. When
            omitted, a fresh ``StandardScaler`` is fitted on the new data
            itself (the original behaviour).

    Returns:
        A ``(scaled_data, df)`` tuple: the scaled feature matrix as
        returned by the scaler, and the preprocessed DataFrame.

    Raises:
        KeyError: If a required feature column is missing from the file.
    """
    # Load new data
    df = pd.read_csv(file_path)
    print("New data loaded.")

    # Example preprocessing
    if 'Genre' in df.columns:
        df['Genre'] = df['Genre'].map({'Male': 1, 'Female': 0})
        print("Genre column encoded.")

    if 'CustomerID' in df.columns:
        df = df.drop(columns='CustomerID')
        print("CustomerID column dropped.")

    # Select top features, failing early with a readable message if the
    # input file does not contain them.
    selected_features = ['Annual Income (k$)', 'Spending Score (1-100)']
    missing = [col for col in selected_features if col not in df.columns]
    if missing:
        raise KeyError(
            "Missing required feature column(s): " + ", ".join(missing)
        )
    data = df[selected_features]

    # Scale the data. Re-fitting on the prediction data shifts the feature
    # space relative to training, so prefer a caller-supplied fitted scaler.
    if scaler is None:
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(data)
    else:
        scaled_data = scaler.transform(data)
    print("Data scaled successfully.")

    return scaled_data, df

def predict_clusters(model_name, scaled_data, original_df):
    """Label rows of ``original_df`` using a saved clustering model.

    The model is loaded from ``'{model_name}_best_model.pkl'`` and the
    resulting labels are written in place to the column
    ``'{model_name}_Cluster'`` of ``original_df``.

    Args:
        model_name: One of 'KMeans', 'Agglomerative' or 'DBSCAN'.
        scaled_data: Scaled feature matrix, one row per row of
            ``original_df``.
        original_df: DataFrame that receives the label column.

    Returns:
        None. Failures are reported by printing a message; in that case
        no label column is added.
    """
    # Validate the name before touching the filesystem, so an unsupported
    # model is reported as such rather than as a file-loading error.
    refit_models = ('Agglomerative', 'DBSCAN')
    if model_name != 'KMeans' and model_name not in refit_models:
        print(f"Unsupported model: {model_name}")
        return

    try:
        # Load the model.
        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code; only load model files from a trusted source.
        model = joblib.load(f'{model_name}_best_model.pkl')
        print("MODEL::::",model)
        print(f"{model_name} model loaded successfully.")

        # Perform predictions
        if model_name == 'KMeans':
            # Assigns rows to the centroids learned at training time.
            cluster_labels = model.predict(scaled_data)
        else:
            # These estimators offer no predict(); fit_predict re-clusters
            # the new data, so labels are not tied to the training clusters.
            cluster_labels = model.fit_predict(scaled_data)

        # Add cluster labels to the original dataframe
        original_df[f'{model_name}_Cluster'] = cluster_labels
        print(f"Clusters predicted using {model_name} and labels added to the data.")
    except Exception as e:
        # Best-effort boundary: report the problem instead of raising.
        print(f"Error during prediction: {e}")

def save_results(df, output_path):
    """Write ``df`` to ``output_path`` as CSV and print a confirmation.

    The DataFrame index is not written to the file.
    """
    df.to_csv(output_path, index=False)
    confirmation = f"Results saved to {output_path}"
    print(confirmation)

def main(model_name='KMeans', new_data_path='Mall_Customers_prediction.csv',
         output_path=None):
    """Run the preprocess -> predict -> save pipeline for one model.

    Args:
        model_name: Name of the saved model to use ('KMeans', 'DBSCAN' or
            'Agglomerative').
        new_data_path: CSV file containing the rows to cluster.
        output_path: Destination CSV. Defaults to
            ``'{model_name}_predicted_clusters.csv'``.
    """
    if output_path is None:
        output_path = f'{model_name}_predicted_clusters.csv'

    scaled_data, original_df = preprocess_new_data(new_data_path)
    predict_clusters(model_name, scaled_data, original_df)

    # predict_clusters reports failures by printing rather than raising, so
    # the presence of its label column is the signal that it succeeded.
    # Without this check a file lacking predictions would be written and
    # announced as results.
    if f'{model_name}_Cluster' not in original_df.columns:
        print(f"No cluster labels were produced for {model_name}; results not saved.")
        return

    save_results(original_df, output_path)

if __name__ == "__main__":
    # Runs the pipeline with main()'s default model ('KMeans'). To use a
    # different model, call main(model_name='DBSCAN') or
    # main(model_name='Agglomerative') instead.
    main()


# Output:
# New data loaded.
# Genre column encoded.
# CustomerID column dropped.
# Data scaled successfully.
# MODEL:::: KMeans(n_clusters=5, random_state=42)
# KMeans model loaded successfully.
# Clusters predicted using KMeans and labels added to the data.
# Results saved to KMeans_predicted_clusters.csv
