# In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB

# Clusters and their corresponding counties
# Cluster id -> counties in that cluster. recommend_plans looks up a county's
# cluster id (via get_cluster) to choose a budget range. List order matters:
# the flattened county list feeds the seeded random draw further down.
# NOTE(review): 'San Luis' presumably means San Luis Obispo County -- confirm.
clusters = {
    0: [
        'San Francisco', 'Santa Clara', 'Santa Cruz', 'San Mateo', 'Alameda',
    ],
    1: [
        'Contra Costa', 'Fresno', 'Kern', 'San Joaquin', 'Ventura',
    ],
    2: [
        'Los Angeles',
    ],
    3: [
        'Riverside', 'Sacramento', 'San Bernardino',
    ],
    4: [
        'Orange', 'San Diego',
    ],
    5: [
        'Butte', 'El Dorado', 'Humboldt', 'Imperial', 'Kings', 'Lake',
        'Madera', 'Mendocino', 'Merced', 'Napa', 'Nevada', 'San Benito',
        'Shasta', 'Sutter', 'Tehama', 'Yolo', 'Yuba',
    ],
    6: [
        'Marin', 'Monterey', 'Placer', 'San Luis', 'Santa Barbara', 'Solano',
        'Sonoma', 'Stanislaus', 'Tulare',
    ],
}

# Plan details per provider.
# Every provider sells the same four tiers; only (budget, speed) differ, so
# the catalogue is written as a compact table and expanded below.
_PLAN_TIERS = ('basic1', 'basic2', 'premium1', 'premium2')

# provider -> (budget, speed) for each tier, in _PLAN_TIERS order
_PROVIDER_OFFERS = {
    'Verizon': [(30, 100), (40, 200), (50, 400), (90, 500)],
    'AT&T': [(40, 150), (50, 200), (85, 400), (90, 500)],
    'Xfinity': [(30, 150), (40, 250), (60, 500), (80, 600)],
    'Viasat': [(30, 100), (40, 150), (60, 400), (90, 500)],
    'HughesNet': [(30, 100), (40, 150), (60, 300), (90, 400)],
}

# provider -> list of plan dicts with keys 'name', 'budget' and 'speed';
# plan names are '<provider>_<tier>'.
plan_details = {
    provider: [
        {'name': f'{provider}_{tier}', 'budget': budget, 'speed': speed}
        for tier, (budget, speed) in zip(_PLAN_TIERS, offers)
    ]
    for provider, offers in _PROVIDER_OFFERS.items()
}

# Dummy data: seed NumPy's global RNG so every run draws the same sample
np.random.seed(42)
age_brackets = ['18-24', '25-44', '45-64', '65+']
occupation_categories = ['student', 'it_employee', 'home', 'other']
device_categories = ['1-4', '5-10', '11-16']

# Build the synthetic customer table. Each column is an independent random
# pick from its category list; the columns are drawn in the order written
# (age, occupation, county, devices, provider), and that order is part of
# the seeded result, so do not reorder them.
_n_rows = 500
_county_pool = [name for members in clusters.values() for name in members]
_provider_pool = list(plan_details)
data = pd.DataFrame({
    'age': np.random.choice(age_brackets, _n_rows),
    'occupation': np.random.choice(occupation_categories, _n_rows),
    'county': np.random.choice(_county_pool, _n_rows),
    'devices': np.random.choice(device_categories, _n_rows),
    'provider': np.random.choice(_provider_pool, _n_rows),
})

# Encode the categorical columns as integers, in place.
# One fitted LabelEncoder is kept per column in label_encoders so that
# recommend_plans can encode user input and decode provider labels later.
label_encoders = {}
for col in ['age', 'occupation', 'county', 'devices', 'provider']:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Features are the four user attributes; the target is the encoded provider
X = data[['age', 'occupation', 'county', 'devices']]
y = data['provider']

# Hold out 20% of the rows as a test set (fixed random_state for repeatability).
# X_test / y_test are not read again in the code shown here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features: the scaler is fitted on the training rows only and
# then applied unchanged to the test rows (and later to user input).
# NOTE(review): the features are label-encoded categories, so scaling and a
# Gaussian likelihood treat arbitrary integer codes as continuous values --
# confirm this is intended.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the Gaussian Naive Bayes model on the scaled training data
model = GaussianNB()
model.fit(X_train, y_train)


def get_cluster(county):
    """Return the id of the first cluster whose county list contains *county*.

    Returns None when the county is not listed under any cluster.
    """
    matching_ids = (
        cluster_id
        for cluster_id, members in clusters.items()
        if county in members
    )
    return next(matching_ids, None)

# Convert an age to its corresponding age bracket
def age_to_bracket(age):
    """Map an age to the bracket label used as a model category.

    Args:
        age: The age as an int or as numeric text (for example the raw
            string returned by ``input()``).

    Returns:
        One of '18-24', '25-44', '45-64' or '65+'. Returns None when the age
        is below 18 or cannot be interpreted as a whole number.
    """
    try:
        years = int(age)
    except (TypeError, ValueError, OverflowError):
        # Unparseable input (e.g. "abc", "", None) is reported through the
        # same "invalid age" signal as an under-age value instead of crashing.
        return None

    if years < 18:
        return None  # Invalid age

    # Inclusive upper bound of each bracket, checked in ascending order
    for upper_bound, label in ((24, '18-24'), (44, '25-44'), (64, '45-64')):
        if years <= upper_bound:
            return label
    return '65+'

# Map the devices input to one of the known device categories
def map_devices_to_category(devices):
    """Translate a device count or range label into a device category.

    Args:
        devices: Either a whole number of devices (int or digit text such as
            '4') or one of the labels in ``device_categories`` (e.g. '1-4').
            Surrounding whitespace is ignored.

    Returns:
        '1-4' for counts up to 4, '5-10' for counts up to 10, and '11-16'
        for anything larger; a recognised range label is returned unchanged.

    Raises:
        ValueError: If the input is neither a whole number nor a known label.
    """
    text = str(devices).strip()

    # isdecimal() (unlike isdigit()) only accepts characters int() can parse,
    # so inputs such as '²' fall through to the ValueError below. Digit-only
    # text can never equal a range label, so the numeric case is checked first.
    if text.isdecimal():
        count = int(text)
        if count <= 4:
            return '1-4'
        if count <= 10:
            return '5-10'
        return '11-16'

    if text in device_categories:
        return text

    raise ValueError("Invalid devices input. Please enter a valid number (e.g., '4') or a range (e.g., '1-4').")

# Budget window (inclusive low, high) used to filter plans for a cluster id
def _budget_range_for_cluster(cluster):
    """Return the inclusive (low, high) plan budget range for a cluster id."""
    if cluster == 2:  # High budget for cluster 2
        return (80, 90)
    if cluster in (3, 4):  # Medium budget for clusters 3 and 4
        return (50, 60)
    if cluster in (5, 6):  # Low budget for clusters 5 and 6
        return (30, 40)
    return (40, 50)  # Default budget range (clusters 0, 1 or unknown)


# Recommendation system using Gaussian Naive Bayes
def recommend_plans(age, devices, occupation, county):
    """Recommend up to three plans for a user, most probable provider first.

    For every provider, the fastest plan whose budget falls inside the budget
    range of the user's county cluster is selected and tagged with the model's
    predicted probability for that provider.

    Args:
        age: The user's age as an int or numeric text.
        devices: Number of devices (e.g. '4') or a range label (e.g. '1-4').
        occupation: One of the occupation categories the model was trained
            on; matched case-insensitively after trimming whitespace.
        county: County name; trimmed and title-cased before lookup.

    Returns:
        On success, a list of at most three dicts with keys 'name', 'budget',
        'speed', 'provider' and 'probability' (a percentage rounded to two
        decimals), sorted by descending probability. On invalid input, a
        string holding the error message.
    """
    # Convert the numerical age to the appropriate bracket; non-numeric text
    # is reported as invalid rather than raising.
    try:
        age_bracket = age_to_bracket(age)
    except (TypeError, ValueError):
        age_bracket = None
    if age_bracket is None:
        return "Invalid age. Must be 18 or older."

    county = county.strip().title()

    # Check that the county exists in the training data
    if county not in label_encoders['county'].classes_:
        return f"County '{county}' not recognized. Please check the spelling or enter a valid county."

    # Map devices input to a known category
    try:
        devices_mapped = map_devices_to_category(devices)
    except ValueError as e:
        return str(e)

    # The encoder raises on unseen labels, so validate the occupation first
    occupation = occupation.strip().lower()
    known_occupations = label_encoders['occupation'].classes_
    if occupation not in known_occupations:
        return (f"Occupation '{occupation}' not recognized. "
                f"Valid options: {', '.join(known_occupations)}.")

    # Encode the inputs with the encoders fitted on the training data.
    # Column names and order match the frame the scaler was fitted on.
    input_data = pd.DataFrame({
        'age': [label_encoders['age'].transform([age_bracket])[0]],
        'occupation': [label_encoders['occupation'].transform([occupation])[0]],
        'county': [label_encoders['county'].transform([county])[0]],
        'devices': [label_encoders['devices'].transform([devices_mapped])[0]],
    })
    input_data = scaler.transform(input_data)

    # predict_proba columns follow model.classes_, so decode provider names
    # from that array instead of assuming the classes are 0..n-1.
    provider_probs = model.predict_proba(input_data)[0]
    providers = label_encoders['provider'].inverse_transform(model.classes_)

    # The budget range depends only on the county, so compute it once
    budget_low, budget_high = _budget_range_for_cluster(get_cluster(county))

    recommended_plans = []
    for provider, prob in zip(providers, provider_probs):
        # Plans from this provider that fit the budget range
        plans = [p for p in plan_details[provider]
                 if budget_low <= p['budget'] <= budget_high]
        if not plans:
            continue
        best_plan = max(plans, key=lambda p: p['speed'])  # Highest speed wins
        # Build a new dict so the shared plan_details entries are not mutated
        recommended_plans.append({
            **best_plan,
            'provider': provider,
            'probability': round(prob * 100, 2),  # Probability as a percentage
        })

    # Most likely providers first, so the slice really is the top three
    recommended_plans.sort(key=lambda plan: plan['probability'], reverse=True)
    return recommended_plans[:3]

# Interactive front end for the recommendation system
def run_recommendation_system():
    """Prompt for the user's details and print the recommended plans.

    Reads age, number of devices, occupation and county from standard input
    (in that order), passes them to recommend_plans, and prints either the
    returned error message or one line per recommended plan.
    """
    # Prompts are evaluated top to bottom, matching recommend_plans' argument order
    answers = [
        input("Enter your age: "),
        input("Enter number of devices: "),
        input("Enter your occupation: "),
        input("Enter your county: "),
    ]
    outcome = recommend_plans(*answers)

    # A string result is an error message rather than a list of plans
    if isinstance(outcome, str):
        print(outcome)
        return

    for plan in outcome:
        summary = (f"Provider: {plan['provider']}, Plan name: {plan['name']}, "
                   f"Plan budget: ${plan['budget']}, Plan speed: {plan['speed']} Mbps, "
                   f"Probability: {plan['probability']}%")
        print(summary)

# Run the recommendation system only when this file is executed directly
# (script or notebook cell), so importing it does not block on input().
if __name__ == "__main__":
    run_recommendation_system()
