# Encoding Methods Comparative Analysis

> Add blockquote



## Objective
Analyze and compare different encoding techniques for categorical variables.

In [46]:
import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

## 1. Base Encoder Class
Implement your encoding methods in this class

In [48]:
class EncodingAnalyzer:

    def __init__(self):

        """Initialize your encoding tracker here"""
        # TODO: Add necessary initialization
        self.label_encoder = LabelEncoder()

    def basic_ascii_encode(self, text: str) -> int:
        """Implement basic ASCII encoding"""
        # TODO: Implement method
        ascii_sum = 0
        for char in text:
          ascii_sum +=  ord(char)


        return ascii_sum

    def weighted_ascii_encode(self, text: str) -> int:
        """Implement weighted ASCII encoding"""
        # TODO: Implement method

        weighted_ascii_sum = 0

        for i, char in enumerate(text):
            weighted_ascii_sum += (i + 1) * ord(char)
        return weighted_ascii_sum


    def hash_based_encode(self, text: str) -> int:
        """Implement your custom hash encoding"""
        # TODO: Implement method
        hash_value = hash(text) % 10000
        return hash_value


    def label_encode(self, text: str) -> int:
        """Implement sklearn label encoding"""
        # TODO: Implement method
        label_encoded = self.label_encoder.fit_transform([text])
        return label_encoded[0]



## 2. Performance Analysis Tools
Implement your analysis methods here

In [49]:
class PerformanceAnalyzer:
    def measure_time(self, func, data):
        """Measure encoding time"""
        # TODO: Implement timing
        start_time = time()
        func(data)
        end_time = time()
        return end_time - start_time

    def measure_memory(self, encoded_data):
        """Measure memory usage"""
        # TODO: Implement memory measurement
        memory_usage = encoded_data.nbytes
        return memory_usage



    def check_collisions(self, encoded_data, original_data):
        """Find encoding collisions"""
        # TODO: Implement collision detection
        unique_encoded = np.unique(encoded_data)
        unique_original = np.unique(original_data)
        collision_count = len(unique_encoded) - len(unique_original)
        return collision_count


## 3. Test Scenarios
Test your implementations with these cases

In [50]:
test_scenarios = {
    'basic': ['cat', 'act', 'tac'],
    'length': ['short', 'very_long_string_test'],
    'special': ['hello!', 'hello?', 'hello.'],
    'case': ['Test', 'test', 'TEST'],
    'numeric': ['user1', '1user', 'user_1']
}

# TODO: Run your tests here

encoding_analyzer = EncodingAnalyzer()
performance_analyzer = PerformanceAnalyzer()


for scenario, texts in test_scenarios.items():
    print(f"Scenario: {scenario}")

    for text in texts:
        print(f"\nTesting text: {text}")

        # Basic ASCII Encode
        ascii_encoded = encoding_analyzer.basic_ascii_encode(text)
        print(f"Basic ASCII Encoding: {ascii_encoded}")

        # Weighted ASCII Encode
        weighted_ascii_encoded =encoding_analyzer.weighted_ascii_encode(text)
        print(f"Weighted ASCII Encoding: {weighted_ascii_encoded}")

        # Hash-based Encode
        hash_encoded = encoding_analyzer.hash_based_encode(text)
        print(f"Hash-based Encoding: {hash_encoded}")

        # Label Encode
        label_encoded = encoding_analyzer.label_encode(text)
        print(f"Label Encoding: {label_encoded}")

        """Measure encoding time"""
        time_taken_basic = performance_analyzer.measure_time(encoding_analyzer.basic_ascii_encode, text)
        time_taken_weighted = performance_analyzer.measure_time(encoding_analyzer.weighted_ascii_encode, text)
        time_taken_hash = performance_analyzer.measure_time(encoding_analyzer.hash_based_encode, text)
        time_taken_label = performance_analyzer.measure_time(encoding_analyzer.label_encode, text)

        print(f"Time taken for Basic ASCII Encoding: {time_taken_basic:.6f} seconds")
        print(f"Time taken for Weighted ASCII Encoding: {time_taken_weighted:.6f} seconds")
        print(f"Time taken for Hash-based Encoding: {time_taken_hash:.6f} seconds")
        print(f"Time taken for Label Encoding: {time_taken_label:.6f} seconds")

        """Measure memory usage"""
        encoded_data = np.array([ascii_encoded, weighted_ascii_encoded, hash_encoded, label_encoded])
        memory_usage = performance_analyzer.measure_memory(encoded_data)
        print(f"Memory Usage (in bytes): {memory_usage}")

        """Find encoding collisions"""
        original_data = [text]
        collisions = performance_analyzer.check_collisions(encoded_data, original_data)
        print(f"Number of collisions: {collisions}\n")


Scenario: basic

Testing text: cat
Basic ASCII Encoding: 312
Weighted ASCII Encoding: 641
Hash-based Encoding: 6943
Label Encoding: 0
Time taken for Basic ASCII Encoding: 0.000005 seconds
Time taken for Weighted ASCII Encoding: 0.000005 seconds
Time taken for Hash-based Encoding: 0.000003 seconds
Time taken for Label Encoding: 0.000203 seconds
Memory Usage (in bytes): 32
Number of collisions: 3


Testing text: act
Basic ASCII Encoding: 312
Weighted ASCII Encoding: 643
Hash-based Encoding: 5409
Label Encoding: 0
Time taken for Basic ASCII Encoding: 0.000005 seconds
Time taken for Weighted ASCII Encoding: 0.000004 seconds
Time taken for Hash-based Encoding: 0.000002 seconds
Time taken for Label Encoding: 0.000145 seconds
Memory Usage (in bytes): 32
Number of collisions: 3


Testing text: tac
Basic ASCII Encoding: 312
Weighted ASCII Encoding: 607
Hash-based Encoding: 5914
Label Encoding: 0
Time taken for Basic ASCII Encoding: 0.000004 seconds
Time taken for Weighted ASCII Encoding: 0.0000

## 4. Results Visualization
Create your performance comparison visualizations

In [None]:
def plot_results(results):
    """Create performance comparison plots"""
    # TODO: Implement visualization





## 5. Analysis Template

Complete this analysis for each encoding method:

1. Implementation Complexity:
   - Lines of code:
   - Time to implement:
   - Key challenges:

2. Performance Metrics:
   - Average encoding time:
   - Memory footprint:
   - Collision rate:

3. Advantages:
   - [Your findings]

4. Disadvantages:
   - [Your findings]

5. Edge Cases:
   - [Your findings]

6. Real-world Applicability:
   - [Your analysis]