In [None]:
#  Explain the different types of data (qualitative and quantitative) and provide examples of each. Discuss nominal, ordinal, interval, and ratio scales.

Qualitative Data (Categorical): This data represents categories or labels that cannot be measured numerically.
Nominal Data: Categories without any inherent order (e.g., Gender, Nationality).
Ordinal Data: Categories with a meaningful order but no defined intervals (e.g., Education levels: High School, Bachelor's, Master's).
Quantitative Data (Numerical): Data that can be measured and expressed numerically.
Discrete Data: Can only take specific values (e.g., number of children).
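Continuous Data: Can take any value within a given range (e.g., height, weight).
Interval Scale: Numeric data with equal spacing between values but no true zero, so differences are meaningful while ratios are not (e.g., temperature in Celsius, calendar years).
Ratio Scale: Numeric data with equal spacing and a true zero, so both differences and ratios are meaningful (e.g., height, weight, income).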
Code:
python

import pandas as pd

# Example data: one nominal (categorical) column and one quantitative column.
data = pd.DataFrame({
    'Category': ['Red', 'Blue', 'Green', 'Red', 'Green'],
    'Age': [23, 45, 34, 25, 28]
})

# Nominal data (Category): labels cannot be averaged, so the mode is used.
# Series.mode() returns a Series containing every tied value; in this sample
# both 'Green' and 'Red' occur twice, so two categories come back.
nominal_data = data['Category'].mode()

# Flatten the modes to plain text for display. Interpolating the Series itself
# into the f-string would print its index and dtype footer, not just the names.
nominal_label = ", ".join(nominal_data)

# Quantitative data (Age): arithmetic such as the mean is meaningful here.
quantitative_data = data['Age'].mean()

print(f"Nominal Data (Most Frequent Category): {nominal_label}")
print(f"Mean of Quantitative Data (Age): {quantitative_data}")

In [None]:
#  What are the measures of central tendency, and when should you use each? Discuss the mean, median, and mode with examples and situations where each is appropriate.

 Measures of Central Tendency: Mean, Median, and Mode

Mean: The sum of all values divided by the number of values.
Use the mean when the data is symmetrically distributed with no outliers.
Median: The middle value in a sorted dataset.
Use the median when the data is skewed or has outliers.
Mode: The most frequent value in the dataset.
Use the mode for categorical data or identifying the most common value.
Code:
python

import numpy as np
import pandas as pd

# Small sample in which 6 is the single most frequent value.
data = [1, 2, 2, 3, 4, 5, 6, 6, 6, 7]

# Mode: Series.mode() yields every tied value, so the result is a Series;
# element 0 is the first mode.
mode_value = pd.Series(data).mode()

# Median: midpoint of the sorted values.
median_value = np.median(data)

# Mean: arithmetic average of all values.
mean_value = np.mean(data)

summary = f"Mean: {mean_value}, Median: {median_value}, Mode: {mode_value[0]}"
print(summary)

In [None]:
# Explain the concept of dispersion. How do variance and standard deviation measure the spread of data?

Dispersion: Variance and Standard Deviation

Variance: Measures the average squared deviation of each data point from the mean.
Formula: $\text{Variance} = \frac{1}{n}\sum_{i=1}^{n}(X_i - \mu)^2$

Standard Deviation: The square root of variance, providing a more interpretable measure of spread.
Formula: $\text{Standard Deviation} = \sqrt{\text{Variance}}$


# Spread of `data` (expected to be the list left in scope by the preceding
# cell). NumPy's defaults (ddof=0) divide by n, i.e. the population form of
# both statistics.
std_deviation_value = np.std(data)
variance_value = np.var(data)

report = f"Variance: {variance_value}, Standard Deviation: {std_deviation_value}"
print(report)

In [None]:
# What is a box plot, and what can it tell you about the distribution of data?

 Box Plot: Distribution of Data

A box plot visually represents the distribution of a dataset. It shows:

Median (central line inside the box).
Interquartile range (IQR) (the length of the box).
Outliers (points outside the "whiskers").

import matplotlib.pyplot as plt

# The integers 1 through 10: evenly spaced, so no point falls outside the whiskers.
data = list(range(1, 11))

# Draw the box plot on the current pyplot figure, title it, and render it.
plt.boxplot(data)
plt.title("Box Plot")
plt.show()

In [None]:
#  Discuss the role of random sampling in making inferences about populations.

Random Sampling and Inferences

Random sampling is the process of selecting a sample from a population such that each individual has an equal chance of being selected. This allows us to make inferences about the population.


import numpy as np

# Fixed seed so the simulated population and the sample drawn from it are
# identical on every Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

# Simulating a population of 1000 integers in [1, 99] (the upper bound is exclusive)
population = rng.integers(1, 100, size=1000)

# Simple random sample of 100 from the population. replace=False samples
# without replacement, so the same individual cannot be selected twice
# (the default, replace=True, would allow repeats).
sample = rng.choice(population, size=100, replace=False)

# The sample mean is the estimate; the population mean is the quantity it
# estimates, printed alongside so the quality of the inference is visible.
sample_mean = np.mean(sample)
population_mean = np.mean(population)

print(f"Sample Mean: {sample_mean}")
print(f"Population Mean: {population_mean}")

In [None]:
# Explain the concept of skewness and its types. How does skewness affect the interpretation of data?

 Skewness and Its Types

Positive Skew (Right Skew): The right tail of the distribution is longer than the left.
Negative Skew (Left Skew): The left tail is longer than the right.
Skewness affects the interpretation because the mean may be heavily influenced by extreme values, making it different from the median.


from scipy.stats import skew

# Small sample whose single large value (10) stretches the right-hand tail.
data_skewed = [1, 2, 2, 3, 10]

# scipy's skew() with its default arguments; a result above zero means the
# sample is right-skewed.
skewness_value = skew(data_skewed)

message = f"Skewness: {skewness_value}"
print(message)

In [None]:
#  What is the interquartile range (IQR), and how is it used to detect outliers?

Interquartile Range (IQR) and Outliers

The IQR is the range between the first (Q1) and third (Q3) quartiles: $IQR = Q_3 - Q_1$. Outliers are defined as values below $Q_1 - 1.5 \times IQR$ or above $Q_3 + 1.5 \times IQR$.


# Dataset owned by this cell, so the result does not depend on which earlier
# cell last assigned a variable called `data`. The value 50 is deliberately
# extreme so that the outlier rule has something to detect.
iqr_data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 50]

# Calculate IQR: the spread of the middle 50% of the values
Q1 = np.percentile(iqr_data, 25)
Q3 = np.percentile(iqr_data, 75)
IQR = Q3 - Q1

# Detect outliers: anything more than 1.5 * IQR beyond either quartile
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = [x for x in iqr_data if x < lower_bound or x > upper_bound]

print(f"IQR: {IQR}, Outliers: {outliers}")

In [None]:
# Discuss the conditions under which the binomial distribution is used

 Binomial Distribution

The binomial distribution describes the probability of a certain number of successes in a fixed number of trials with two possible outcomes (success or failure). It's used when:

Each trial is independent.
The probability of success is constant across trials.

from scipy.stats import binom

# Ten trials, each succeeding with probability one half.
n, p = 10, 0.5

# P(X = 5): the binomial probability mass function evaluated at 5 successes.
successes = 5
probability = binom.pmf(successes, n, p)

print(f"Probability of 5 successes: {probability}")

In [None]:
# Explain the properties of the normal distribution and the empirical rule (68-95-99.7 rule).

Normal Distribution and Empirical Rule

The normal distribution is a symmetric, bell-shaped curve. The Empirical Rule (68-95-99.7 rule) states that:

68% of data lies within one standard deviation of the mean.
95% lies within two standard deviations.
99.7% lies within three standard deviations.

import matplotlib.pyplot as plt
import numpy as np

# Seeded generator so the histogram and the percentages below are reproducible.
normal_rng = np.random.default_rng(42)

# Generating 1000 draws from a normal distribution with mean 0 and std dev 1
data_normal = normal_rng.normal(loc=0, scale=1, size=1000)

# Plotting the histogram (density=True scales the bars to unit total area)
plt.hist(data_normal, bins=30, density=True)
plt.title("Normal Distribution")
plt.show()

# Checking the Empirical Rule against the sample's own mean and std dev
mean_value = np.mean(data_normal)
std_dev_value = np.std(data_normal)

# Distance of every point from the mean, measured in standard deviations.
z_scores = np.abs(data_normal - mean_value) / std_dev_value

# The mean of a boolean mask is the fraction of True entries, i.e. the share
# of points inside each band. All three bands of the 68-95-99.7 rule are checked.
within_one_std = np.mean(z_scores <= 1)
within_two_std = np.mean(z_scores <= 2)
within_three_std = np.mean(z_scores <= 3)

print(f"Percentage within 1 std dev: {within_one_std*100}%")
print(f"Percentage within 2 std dev: {within_two_std*100}%")
print(f"Percentage within 3 std dev: {within_three_std*100}%")

In [None]:
# Provide a real-life example of a Poisson process and calculate the probability for a specific event.

Poisson Process Example

The Poisson distribution models the number of events occurring in a fixed interval of time or space, assuming the events happen independently and at a constant rate.


from scipy.stats import poisson

# Rate parameter: an average of 3 events per interval.
lambda_value = 3
# Event count whose probability is wanted.
k = 2

# P(X = k) for a Poisson distribution with mean lambda_value.
probability = poisson.pmf(k, mu=lambda_value)

print(f"Probability of 2 events: {probability}")

In [None]:
# Explain what a random variable is and differentiate between discrete and continuous random variables.

Random Variables: Discrete vs. Continuous

Discrete Random Variable: Takes specific values (e.g., number of cars).
Continuous Random Variable: Takes any value within a given range (e.g., height).

# Discrete random variable (e.g., number of cars): only countable values occur
discrete_data = [0, 1, 2, 3, 4, 5]

# Continuous random variable (e.g., height): any value in [150, 190) can occur.
# A seeded generator keeps the simulated heights, and therefore the printed
# mean, identical on every run.
HEIGHT_SEED = 42
height_data = np.random.default_rng(HEIGHT_SEED).uniform(150, 190, 1000)

# Calculating mean for discrete and continuous
discrete_mean = np.mean(discrete_data)
height_mean = np.mean(height_data)

print(f"Discrete Mean: {discrete_mean}, Continuous Mean: {height_mean}")

In [None]:
# Provide an example dataset, calculate both covariance and correlation, and interpret the results.

Covariance and Correlation

Covariance: Measures how two variables change together.
Positive covariance indicates that both variables increase together.
Negative covariance indicates that one variable increases as the other decreases.
Correlation: Standardized measure of the strength and direction of the relationship between two variables.

# Two perfectly opposed series: Y drops by one each time X rises by one.
X = [1, 2, 3, 4, 5]
Y = [5, 4, 3, 2, 1]

# np.cov and np.corrcoef each return a 2x2 matrix for two inputs; the
# off-diagonal entry [0, 1] is the X-versus-Y value.
cov_matrix = np.cov(X, Y)
corr_matrix = np.corrcoef(X, Y)

# Negative covariance: the variables move in opposite directions.
covariance_value = cov_matrix[0, 1]
# Correlation of about -1: the opposing relationship is perfectly linear.
correlation_value = corr_matrix[0, 1]

print(f"Covariance: {covariance_value}, Correlation: {correlation_value}")