In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Q1: What is data encoding? How is it useful in data science?

# Q1 answer, stored as Markdown. Assembled one source line per output line;
# the empty first and last entries give the leading and trailing newline.
data_encoding_explanation = "\n".join((
    "",
    "**Data Encoding**: Data encoding is the process of converting categorical data into numerical formats. This is essential for machine learning algorithms that require numerical input for processing.",
    "",
    "*Why It's Useful:*",
    "1. **Compatibility**: Machine learning algorithms typically work with numerical data.",
    "2. **Performance**: Encoded data can improve the performance and accuracy of models.",
    "3. **Data Analysis**: Enables statistical analysis and mathematical operations on categorical features.",
    "",
))

# Q2: What is nominal encoding? Provide an example of how you would use it in a real-world scenario.

# Q2 answer, stored as Markdown and assembled line by line (empty first and
# last entries give the leading and trailing newline).
# NOTE(review): the real-world scenario uses 'Low'/'Medium'/'High', which is
# an ordered scale — confirm that an unordered (nominal) example was not
# intended here.
nominal_encoding_explanation = "\n".join((
    "",
    "**Nominal Encoding**: Nominal encoding, also known as label encoding, assigns a unique integer to each category in a categorical feature. It is useful for representing categories as numerical values.",
    "",
    "*Example:*",
    "Consider a dataset with a categorical feature 'Color' with values ['Red', 'Green', 'Blue'].",
    "",
    "*Nominal Encoding:*",
    "- Red: 0",
    "- Green: 1",
    "- Blue: 2",
    "",
    "*Real-World Scenario:*",
    "In a dataset of customer feedback, if 'Customer Satisfaction Level' is categorical with values ['Low', 'Medium', 'High'], nominal encoding can be used to assign integers to these categories for analysis.",
    "",
))

# Q3: In what situations is nominal encoding preferred over one-hot encoding? Provide a practical example.

# Q3 answer, stored as Markdown and assembled line by line (empty first and
# last entries give the leading and trailing newline).
# NOTE(review): the text recommends "nominal" encoding for categories with a
# natural order; "nominal" usually denotes unordered categories — confirm
# the intended terminology.
nominal_vs_onehot_explanation = "\n".join((
    "",
    "**Nominal Encoding vs. One-Hot Encoding:**",
    "- **Nominal Encoding** is preferred when there is a natural order or ranking in the categories, or when the number of categories is very high.",
    "- **One-Hot Encoding** is preferred when there is no ordinal relationship and the number of categories is manageable.",
    "",
    "*Example:*",
    "In a dataset with a feature 'Education Level' ['High School', 'Bachelor', 'Master', 'PhD']:",
    "- **Nominal Encoding**: Assign integers based on a ranking (if applicable).",
    "- **One-Hot Encoding**: Create binary columns for each category.",
    "",
    "If the dataset has hundreds of unique categories, nominal encoding might be more efficient to avoid a large number of binary columns.",
    "",
))

# Q4: Suppose you have a dataset containing categorical data with 5 unique values. Which encoding technique would you use to transform this data into a format suitable for machine learning algorithms? Explain why you made this choice.

# Q4 answer, stored as Markdown and assembled line by line (empty first and
# last entries give the leading and trailing newline).
encoding_technique_explanation = "\n".join((
    "",
    "**Encoding Technique Choice:**",
    "- **One-Hot Encoding** is generally preferred for a dataset with a moderate number of unique categorical values, as it avoids the potential ordinal implication of nominal encoding.",
    "",
    "*Reason:*",
    "- One-Hot Encoding will create 5 new binary columns, each representing one of the 5 unique values. This avoids introducing a numerical relationship that does not exist among the categorical values.",
    "",
    "*Implementation:*",
    "1. Use OneHotEncoder to transform categorical data into a binary matrix.",
    "",
))

# Q5: In a machine learning project, you have a dataset with 1000 rows and 5 columns. Two of the columns are categorical, and the remaining three columns are numerical. If you were to use nominal encoding to transform the categorical data, how many new columns would be created? Show your calculations.

# Q5 worked example. The question does not give category counts, so the two
# values below are illustrative placeholders.
num_unique_values_cat1 = 3  # Example number of unique values in the first categorical column
num_unique_values_cat2 = 4  # Example number of unique values in the second categorical column

# Total distinct categories across both categorical columns.
# NOTE(review): this sum is the number of columns produced when every
# category gets its own column. An encoding that maps each category to a
# single integer would keep one column per feature — confirm which
# definition of "nominal encoding" the question intends.
num_new_columns_nominal = num_unique_values_cat1 + num_unique_values_cat2

# Print explicitly: a bare expression is only echoed when it is the last
# statement of a notebook cell, and more statements follow this one.
print(num_new_columns_nominal)

# Q6: You are working with a dataset containing information about different types of animals, including their species, habitat, and diet. Which encoding technique would you use to transform the categorical data into a format suitable for machine learning algorithms? Justify your answer.

# Q6 answer, stored as Markdown and assembled line by line (empty first and
# last entries give the leading and trailing newline).
animal_data_encoding_explanation = "\n".join((
    "",
    "**Encoding Technique Choice:**",
    "- **One-Hot Encoding** is suitable for features like 'Species', 'Habitat', and 'Diet', as these features are categorical without inherent ordinal relationships.",
    "",
    "*Justification:*",
    "- One-Hot Encoding will create binary columns for each unique category in these features, ensuring that the machine learning model does not assume any ordinal relationship among categories.",
    "",
))

# Q7: You are working on a project that involves predicting customer churn for a telecommunications company. You have a dataset with 5 features, including the customer's gender, age, contract type, monthly charges, and tenure. Which encoding technique(s) would you use to transform the categorical data into numerical data? Provide a step-by-step explanation of how you would implement the encoding.

customer_churn_encoding_explanation = """
**Encoding Technique Choice:**
1. **Gender**: Use **Nominal Encoding** (Label Encoding) since it has only two categories.
2. **Contract Type**: Use **One-Hot Encoding** as it may have more than two categories without an inherent order.

*Implementation Steps:*
1. **Label Encoding for Gender**:
   - Convert 'Male' to 0 and 'Female' to 1.
2. **One-Hot Encoding for Contract Type**:
   - Create binary columns for each unique contract type (e.g., 'Monthly', 'Annual', 'Two-Year').

*Python Implementation:*
```python
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Sample data
data = pd.DataFrame({
    'Gender': ['Male', 'Female', 'Female', 'Male'],
    'Contract Type': ['Monthly', 'Annual', 'Monthly', 'Two-Year']
})

# Label Encoding for Gender
label_encoder = LabelEncoder()
data['Gender_encoded'] = label_encoder.fit_transform(data['Gender'])

# One-Hot Encoding for Contract Type
one_hot_encoder = OneHotEncoder(sparse=False)
contract_encoded = one_hot_encoder.fit_transform(data[['Contract Type']])
contract_encoded_df = pd.DataFrame(contract_encoded, columns=one_hot_encoder.get_feature_names_out(['Contract Type']))

# Concatenate encoded features
data_encoded = pd.concat([data, contract_encoded_df], axis=1).drop(['Gender', 'Contract Type'], axis=1)
data_encoded
