# In [None]:  (notebook cell prompt kept as a comment so the file parses as Python)
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt

# Q1: What is the difference between Ordinal Encoding and Label Encoding? Provide an example of when you might choose one over the other.

# Markdown answer kept as a string. The closing caveat is included because an
# integer code is still a number to the model: recommending Label Encoding for
# unordered features without that warning would contradict the usual reason
# for choosing One-Hot Encoding (avoiding an implied order).
ordinal_vs_label_encoding_explanation = """
**Ordinal Encoding**: This encoding method assigns integers to categories based on their order. It is used when there is a meaningful order among categories.

**Label Encoding**: This method assigns integers to categories without assuming any order. It is used when there is no inherent ordinal relationship among categories.

*Example:*
- **Ordinal Encoding**: For a feature 'Education Level' with values ['High School', 'Bachelor', 'Master', 'PhD'], the encoding could be [0, 1, 2, 3].
- **Label Encoding**: For a feature 'Color' with values ['Red', 'Green', 'Blue'], the encoding could be [0, 1, 2] with no implied order.

*Choosing One Over the Other:*
- Use **Ordinal Encoding** for features with a natural order (e.g., education levels).
- Use **Label Encoding** for features without an inherent order (e.g., colors).
- *Caveat:* the integers produced by Label Encoding are still numbers, so a model that treats the feature as numeric (e.g., linear regression) can read a spurious order into them; for such models prefer **One-Hot Encoding** for unordered input features. scikit-learn's LabelEncoder numbers the classes in sorted order and is documented for encoding target labels.
"""

# Q2: Explain how Target Guided Ordinal Encoding works and provide an example of when you might use it in a machine learning project.

# Markdown answer assembled one text line per tuple entry; the empty first and
# last entries give the string its leading and trailing newline.
target_guided_ordinal_encoding_explanation = "\n".join((
    "",
    "**Target Guided Ordinal Encoding**: This method involves encoding categorical values "
    "based on the target variable's mean or median for each category. It creates an ordinal "
    "feature where the order reflects the target's relationship with the categories.",
    "",
    "*Example:*",
    "If you have a dataset predicting house prices with a categorical feature 'Neighborhood' "
    "and you want to encode it based on the average house price in each neighborhood, you would:",
    "1. Calculate the average price for each neighborhood.",
    "2. Assign ranks to neighborhoods based on these averages.",
    "",
    "*When to Use:*",
    "- When you have a categorical feature with a potential relationship to the target variable "
    "and want to leverage this relationship to improve model performance.",
    "",
))

# Q3: Define covariance and explain why it is important in statistical analysis. How is covariance calculated?

# Markdown answer kept as a string. The mean of Y is written with a bar (Ȳ),
# matching X̄; a hat (Y-hat) conventionally denotes a predicted value, not a mean.
covariance_explanation = """
**Covariance**: Covariance is a measure of the relationship between two variables. It indicates how changes in one variable are associated with changes in another.

**Importance:**
- It helps in understanding the direction of the linear relationship between variables.
- Positive covariance indicates that variables move in the same direction, while negative covariance indicates that they move in opposite directions.

**Calculation:**
Covariance between two variables X and Y is calculated as:
Cov(X, Y) = Σ[(X_i - X̄) * (Y_i - Ȳ)] / (n - 1)
where X_i and Y_i are individual data points, X̄ and Ȳ are the means of X and Y, and n is the number of data points.
"""

# Q4: For a dataset with the following categorical variables: Color (red, green, blue), Size (small, medium, large), and Material (wood, metal, plastic), perform label encoding using Python's scikit-learn library. Show your code and explain the output.

data = pd.DataFrame({
    'Color': ['red', 'green', 'blue', 'green', 'red'],
    'Size': ['small', 'medium', 'large', 'medium', 'small'],
    'Material': ['wood', 'metal', 'plastic', 'metal', 'wood'],
})

# One fitted encoder per column is kept so every mapping can be inspected or
# reversed later with label_encoders[column].inverse_transform(...).
label_encoders = {}
encoded_data = pd.DataFrame()

for column in data.columns:
    le = LabelEncoder()
    encoded_data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Print explicitly: a bare expression is only rendered when it is the last
# statement of a notebook cell and is silently discarded otherwise.
print(encoded_data)
for column, le in label_encoders.items():
    # The code assigned to a label is its position in the encoder's classes_.
    print(column, {label: code for code, label in enumerate(le.classes_.tolist())})

# Explanation of the output:
# LabelEncoder numbers the distinct labels of a column in sorted order, so
#   Color:    blue=0, green=1, red=2      -> [2, 1, 0, 1, 2]
#   Size:     large=0, medium=1, small=2  -> [2, 1, 0, 1, 2]
#   Material: metal=0, plastic=1, wood=2  -> [2, 0, 1, 0, 2]
# The codes are identifiers only. For Size the alphabetical numbering runs
# large < medium < small, i.e. it does not follow the natural size order, so
# the integers must not be interpreted as magnitudes.

# Q5: Calculate the covariance matrix for the following variables in a dataset: Age, Income, and Education level. Interpret the results.

# Sample data
np.random.seed(0)  # fixed seed so the synthetic sample is the same on every run
data_cov = pd.DataFrame({
    'Age': np.random.randint(20, 60, 100),
    'Income': np.random.randint(20000, 80000, 100),
    'Education': np.random.randint(1, 5, 100),  # Example: 1 = High School, 2 = Bachelor, 3 = Master, 4 = PhD
})

# DataFrame.cov() returns the pairwise sample covariance (normalised by n - 1).
cov_matrix = data_cov.cov()

# Print explicitly: a bare expression is only rendered when it is the last
# statement of a notebook cell and is silently discarded otherwise.
print(cov_matrix)

# Interpretation:
# - Diagonal entries are the variances of Age, Income and Education.
# - Off-diagonal entries are covariances: a positive value means the two
#   variables tend to move together, a negative value that they move in
#   opposite directions.
# - Covariance is expressed in the product of the two variables' units, so the
#   Income entries are far larger than the others purely because of scale; use
#   data_cov.corr() to compare the strength of relationships.
# - The three columns here are drawn independently at random, so any non-zero
#   off-diagonal value reflects sampling noise rather than a real relationship.

# Q6: You are working on a machine learning project with a dataset containing several categorical variables, including "Gender" (Male/Female), "Education Level" (High School/Bachelor's/Master's/PhD), and "Employment Status" (Unemployed/Part-Time/Full-Time). Which encoding method would you use for each variable, and why?

# Markdown answer assembled one text line per tuple entry; the empty first and
# last entries give the string its leading and trailing newline.
encoding_method_explanation = "\n".join((
    "",
    "**Encoding Methods for Categorical Variables:**",
    "1. **Gender**: Use **Label Encoding** as it has only two categories (Male/Female) "
    "and is inherently binary.",
    "2. **Education Level**: Use **Ordinal Encoding** as there is a meaningful order among "
    "the categories (e.g., High School < Bachelor's < Master's < PhD).",
    "3. **Employment Status**: Use **One-Hot Encoding** as there is no inherent order "
    "and multiple categories need to be represented.",
    "",
    "*Justification:*",
    "- **Label Encoding** for binary categories simplifies encoding.",
    "- **Ordinal Encoding** captures the ordinal relationship.",
    "- **One-Hot Encoding** avoids implying any ordinal relationship among multiple categories.",
    "",
))

# Q7: You are analyzing a dataset with two continuous variables, "Temperature" and "Humidity", and two categorical variables, "Weather Condition" (Sunny/Cloudy/Rainy) and "Wind Direction" (North/South/East/West). Calculate the covariance between each pair of variables and interpret the results.

# Sample data
data_weather = pd.DataFrame({
    'Temperature': np.random.uniform(15, 35, 100),
    'Humidity': np.random.uniform(30, 80, 100),
    'Weather Condition': np.random.choice(['Sunny', 'Cloudy', 'Rainy'], 100),
    'Wind Direction': np.random.choice(['North', 'South', 'East', 'West'], 100),
})

# Encode categorical variables.
# Each column gets its own encoder: refitting one LabelEncoder on the second
# column would overwrite the class mapping learned for the first, making the
# 'Weather Condition' codes impossible to translate back to their labels.
weather_le = LabelEncoder()
wind_le = LabelEncoder()
data_weather['Weather Condition'] = weather_le.fit_transform(data_weather['Weather Condition'])
data_weather['Wind Direction'] = wind_le.fit_transform(data_weather['Wind Direction'])

# Calculate covariance matrix (pairwise sample covariance of all four columns)
cov_weather_matrix = data_weather.cov()

# Print explicitly so the result is also shown when this is run as a script
# or is not the last statement of a notebook cell.
print(cov_weather_matrix)

# Interpretation:
# - Diagonal entries are variances; off-diagonal entries are covariances whose
#   sign gives the direction of the linear co-movement.
# - Temperature vs. Humidity is the only pair of genuinely numeric variables,
#   so it is the only covariance with a direct physical reading.
# - 'Weather Condition' and 'Wind Direction' are nominal. Their integer codes
#   are just the sorted-label positions assigned by LabelEncoder, so any
#   covariance involving them changes if the labels are numbered differently
#   and should not be interpreted as a strength or direction of association.
#   Comparing group means (e.g. data.groupby(category)[value].mean()) or
#   one-hot encoding the categories is the sounder way to relate them to the
#   continuous variables.
# - All four columns are drawn independently at random here, so the
#   off-diagonal values reflect sampling noise only.
