Question 1: Understanding the Dataset 
<br>
Description: Load a dataset and understand its basic properties, including data types, dimensions, and the first few rows.

In [None]:
# Write your code from here
import pandas as pd
import seaborn as sns

# Load the iris example dataset through seaborn.
df = sns.load_dataset("iris")

# First rows, (rows, columns) dimensions and per-column dtypes.
print(df.head())
print(df.shape)
print(df.dtypes)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Summary statistics of the numeric columns.
print(df.describe())


Question 2: Checking for Missing Values
<br>
Description: Identify missing values in the dataset.

In [None]:
# Write your code from here
# Count the missing entries in every column once, then reuse that result
# for both the per-column report and the dataset-wide total.
missing_per_column = df.isnull().sum()
print(missing_per_column)
print(missing_per_column.sum())


Question 3: Descriptive Statistics
<br>
Description: Calculate descriptive statistics for numerical columns.

In [None]:
# Write your code from here
# describe() summarises the numeric columns (count, mean, std, min,
# quartiles, max); keep the table in a variable before printing it.
summary_statistics = df.describe()
print(summary_statistics)


Question 4: Handling Outliers
<br>
Description: Identify outliers in numerical columns using box plots.

In [None]:
# Write your code from here
import seaborn as sns
import matplotlib.pyplot as plt

# Draw one horizontal box plot per numeric column; points drawn beyond the
# whiskers are the candidate outliers.
numeric_columns = df.select_dtypes(include='number').columns
for col in numeric_columns:
    plt.figure(figsize=(6, 4))
    column_values = df[col]
    sns.boxplot(x=column_values)
    plt.title(f'Box plot of {col}')
    plt.show()



Question 5: Categorical Data Analysis
<br>
Description: Explore the counts of categorical variables.

In [None]:
# Write your code from here
# Restrict the frame to its object/category columns and report how often
# each distinct value occurs, one column at a time.
categorical_part = df.select_dtypes(include=['object', 'category'])
for name, values in categorical_part.items():
    print(f"Value counts for {name}:")
    print(values.value_counts())
    print()


Question 6: Data Transformation
<br>
Description: Transform a categorical column into numerical using Label Encoding.

In [None]:
# Write your code from here
from sklearn.preprocessing import LabelEncoder

# Learn the species -> integer mapping first, then apply it; the encoder is
# kept so the integer codes can be mapped back to labels later.
label_encoder = LabelEncoder()
label_encoder.fit(df['species'])
df['species_encoded'] = label_encoder.transform(df['species'])

# Show the original label next to its encoded value.
preview_columns = ['species', 'species_encoded']
print(df[preview_columns].head())


Question 7: Visualizing Data Distributions
<br>
Description: Plot histograms for numerical columns to understand distributions.

In [None]:
# Write your code from here
import matplotlib.pyplot as plt

# One histogram per numeric column, all drawn on a single figure.
histogram_options = {'bins': 15, 'figsize': (10, 8)}
df.hist(**histogram_options)

# Space the subplots so titles and tick labels do not overlap.
plt.tight_layout()
plt.show()


Question 8: Correlation Analysis
<br>
Description: Calculate and visualize the correlation matrix for numerical features.

In [None]:
# Write your code from here
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Work on the numeric columns of the DataFrame that is already loaded (`df`)
# instead of replacing it with random data, so the analysis describes the
# dataset under study and later cells keep seeing the same `df`.
numeric_df = df.select_dtypes(include='number')

# Calculate the pairwise correlation matrix (pandas defaults to Pearson)
correlation_matrix = numeric_df.corr()

# Print the correlation matrix
print("Correlation Matrix:\n", correlation_matrix)

# Visualize the correlation matrix using a heatmap; the colour scale is
# pinned to the full [-1, 1] range so colours are comparable between runs.
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Numerical Features')
plt.show()


Question 9: Feature Engineering
<br>
Description: Create a new feature by combining or transforming existing features.

In [None]:
# Write your code from here
import pandas as pd
import numpy as np

# Columns the engineered features below are derived from.
required_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# Reuse the DataFrame that is already loaded when it provides those columns.
# Only when no such DataFrame exists is a small sample created, so the real
# dataset is never silently replaced by five hard-coded rows.
if 'df' not in globals() or not set(required_columns).issubset(df.columns):
    data = {'sepal_length': [5.1, 4.9, 4.7, 4.6, 5.0],
            'sepal_width': [3.5, 3.0, 3.2, 3.1, 3.6],
            'petal_length': [1.4, 1.4, 1.3, 1.5, 1.4],
            'petal_width': [0.2, 0.2, 0.2, 0.2, 0.2]}
    df = pd.DataFrame(data)

# Create a ratio of petal length to petal width
# NOTE: a petal_width of 0 would give inf/NaN here rather than raise.
df['petal_ratio'] = df['petal_length'] / df['petal_width']
print(df[['petal_length', 'petal_width', 'petal_ratio']].head())

# Create the square of sepal length
df['sepal_length_squared'] = df['sepal_length'] ** 2
print(df[['sepal_length', 'sepal_length_squared']].head())

# You could also create interaction terms by multiplying features
df['sepal_petal_interaction'] = df['sepal_length'] * df['petal_length']
print(df[['sepal_length', 'petal_length', 'sepal_petal_interaction']].head())


Question 10: Advanced Outlier Detection
<br>
Description: Use the Z-score method to identify and handle outliers.

In [None]:
# Write your code from here
from scipy.stats import zscore

# Standardise every numeric column with scipy's zscore.
numeric_part = df.select_dtypes(include='number')
z_scores = numeric_part.apply(zscore)

# A value counts as an outlier when its z-score is more than 3 away from 0,
# in either direction.
outliers = z_scores.abs() > 3

# Rows containing at least one outlying value; computed once and reused.
outlier_rows = outliers.any(axis=1)
print(df[outlier_rows])

# Handle the outliers by dropping every flagged row.
df_no_outliers = df[~outlier_rows]
print(df_no_outliers.shape)
