# In [None]:
import numpy as np
import random
import matplotlib.pyplot as plt

# Define the Network Segmentation Environment
class NetworkEnvironment:
    """Toy simulation of a network split into ``n_segments`` segments.

    Every segment carries a random integer traffic volume in [1, 100) and a
    random integer security level in [1, 10).  The observation exposed to an
    agent is the traffic vector followed by the security vector.
    """

    def __init__(self, n_segments=5):
        # How many segments the simulated network has.
        self.n_segments = n_segments
        # Drawing the initial values is exactly what reset() does
        # (traffic first, then security), so reuse it.
        self.reset()

    def get_state(self):
        """Return the observation: traffic volumes followed by security levels."""
        parts = (self.traffic_volumes, self.security_levels)
        return np.concatenate(parts)

    def reset(self):
        """Re-draw random traffic and security values and return the new state."""
        count = self.n_segments
        self.traffic_volumes = np.random.randint(1, 100, count)
        self.security_levels = np.random.randint(1, 10, count)
        return self.get_state()

    def step(self, action):
        """Apply a segmentation strategy and return ``(new_state, reward)``.

        ``action`` is an iterable of segment indices to isolate, e.g.
        ``[0, 1, 2]`` isolates segments 0, 1 and 2.  Each listed segment has
        its traffic volume lowered by 10, floored at 0; an index that appears
        more than once is lowered once per occurrence.
        """
        volumes = self.traffic_volumes
        for segment in action:
            lowered = volumes[segment] - 10
            # Traffic can never go negative.
            volumes[segment] = lowered if lowered > 0 else 0

        # Reward is total traffic minus total security level.
        # NOTE(review): isolating segments lowers traffic and therefore lowers
        # this reward -- confirm that this sign is what was intended.
        total_traffic = np.sum(self.traffic_volumes)
        total_security = np.sum(self.security_levels)
        reward = total_traffic - total_security

        return self.get_state(), reward

# Q-Learning Agent
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    The Q-table has shape ``(n_states, n_actions)``.  A state may be given
    either as a single integer row index or as a vector observation (list or
    array); vectors are collapsed to one row index, so that every lookup and
    update touches exactly one row of the table.

    Args:
        n_actions: Number of discrete actions (Q-table columns).
        n_states: Number of Q-table rows.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Probability of picking a random action instead of the
            greedy one.
    """

    def __init__(self, n_actions, n_states, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount factor
        self.epsilon = epsilon  # exploration-exploitation tradeoff
        self.q_table = np.zeros((n_states, n_actions))

    def _state_index(self, state):
        """Map *state* to a single Q-table row index.

        Scalar states are used as the row index directly.  Vector states are
        hashed into ``[0, n_states)``.  Indexing the table with the raw vector
        would instead select one row per element, which made ``argmax`` return
        flattened indices outside the action range.  Distinct vectors may
        share a row; that is a deliberate coarse state aggregation.
        """
        if np.ndim(state) == 0:
            return int(state)
        # tolist() yields plain Python numbers, whose hashes are stable
        # across interpreter runs (no hash randomization for numbers).
        key = tuple(np.asarray(state).ravel().tolist())
        return hash(key) % self.q_table.shape[0]

    def choose_action(self, state):
        """Choose an action index in ``[0, n_actions)`` epsilon-greedily."""
        q_row = self.q_table[self._state_index(state)]
        if random.uniform(0, 1) < self.epsilon:
            # Exploration: uniform over every action, not over state elements.
            return random.randint(0, self.q_table.shape[1] - 1)
        # Exploitation: action with the highest Q-value for this state.
        return int(np.argmax(q_row))

    def learn(self, state, action, reward, new_state):
        """Update the Q-table with the one-step Q-learning rule.

        ``Q[s, a] += alpha * (reward + gamma * max(Q[s']) - Q[s, a])``
        """
        row = self._state_index(state)
        next_row = self._state_index(new_state)
        best_future_q = np.max(self.q_table[next_row])
        current_q = self.q_table[row, action]
        self.q_table[row, action] = current_q + self.alpha * (
            reward + self.gamma * best_future_q - current_q
        )

def _encode_state(observation, n_states):
    """Collapse an observation vector into one Q-table row index in [0, n_states)."""
    # tolist() yields plain Python numbers, whose hashes are stable across runs.
    key = tuple(np.asarray(observation).ravel().tolist())
    return hash(key) % n_states


# Training the Q-learning agent to optimize network segmentation
def train_network_segmentation(n_episodes=1000):
    """Train a Q-learning agent on the segmentation environment and plot rewards.

    Each episode resets the environment and runs 100 steps.  An action is an
    integer bitmask over the segments: bit ``i`` set means segment ``i`` is
    isolated on that step.

    Args:
        n_episodes: Number of training episodes to run.

    Returns:
        The list of total rewards, one entry per episode.
    """
    n_segments = 5
    n_states = 100  # Simplified for a small state-action space
    max_steps = 100  # Max steps per episode

    env = NetworkEnvironment(n_segments=n_segments)
    # One action per subset of segments, so the bitmask decoding below can
    # never produce a segment index the environment does not have.
    agent = QLearningAgent(n_actions=2 ** n_segments, n_states=n_states)

    rewards = []
    for _ in range(n_episodes):
        # The agent's table needs a single row index per state; handing it the
        # raw observation vector would index several rows at once.
        state = _encode_state(env.reset(), n_states)
        total_reward = 0
        for _ in range(max_steps):
            action = agent.choose_action(state)
            # Convert the action bitmask to the list of segments to isolate.
            action_segments = [i for i in range(n_segments) if action & (1 << i)]
            observation, reward = env.step(action_segments)
            new_state = _encode_state(observation, n_states)
            agent.learn(state, action, reward, new_state)
            total_reward += reward
            state = new_state
        rewards.append(total_reward)

    # Plot rewards to see the learning process
    plt.plot(range(n_episodes), rewards)
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.title('Reinforcement Learning for Network Segmentation Optimization')
    plt.show()

    return rewards

# Script entry point: train the agent and display the reward curve
if __name__ == "__main__":
    episode_count = 1000
    train_network_segmentation(n_episodes=episode_count)