In [5]:
import numpy as np
import pandas as pd

np.random.seed(42)
n_rows = 100

data = {
    'age': np.random.randint(18, 50, size=n_rows),
    'salary': np.random.randint(5000, 500000, size=n_rows),
    'increment': np.random.choice([2, 4, 3, 5], size=n_rows)
}

df = pd.DataFrame(data)
print(df.head())


   age  salary  increment
0   24  443974          2
1   37  470625          5
2   46  207283          5
3   32  201769          4
4   28   42065          2


In [6]:
df.loc[np.random.choice(df.index, 10, replace=False), "salary"] = np.nan
df.loc[np.random.choice(df.index, 7, replace=False), "Age"] = np.nan

print("Synthetic Dataset with NaN values:\n")
print(df.head(20))

Synthetic Dataset with NaN values:

    age    salary  increment  Age
0    24  443974.0          2  NaN
1    37  470625.0          5  NaN
2    46  207283.0          5  NaN
3    32  201769.0          4  NaN
4    28   42065.0          2  NaN
5    25  228165.0          3  NaN
6    46  104299.0          3  NaN
7    38  430822.0          2  NaN
8    24   16534.0          3  NaN
9    43       NaN          3  NaN
10   36   45397.0          2  NaN
11   40  262750.0          5  NaN
12   28       NaN          2  NaN
13   28  268160.0          5  NaN
14   41  488005.0          3  NaN
15   38   60591.0          3  NaN
16   21  225884.0          3  NaN
17   25   28247.0          4  NaN
18   41   29300.0          5  NaN
19   20  472281.0          4  NaN


In [12]:
# Mean of income
mean_salary = df["salary"].mean(skipna=True)
# Median of income
median_salary = df["salary"].median(skipna=True)
# Age-weighted mean of income
# Formula: (Σ (Income * Age)) / (Σ Age)
age_weighted_mean = (df["salary"] * df["Age"]).sum(skipna=True) / df["Age"].sum(skipna=True)


  age_weighted_mean = (df["salary"] * df["Age"]).sum(skipna=True) / df["Age"].sum(skipna=True)


In [13]:
print("\nResults:")
print(f"Mean salary = {mean_salary:.2f}")
print(f"Median Income = {median_salary:.2f}")
print(f"Age-Weighted Mean salary = {age_weighted_mean:.2f}")


Results:
Mean salary = 243596.90
Median Income = 224926.50
Age-Weighted Mean salary = nan


:Q Explain when a weighted mean is preferable.

Ans.Use weighted mean when some data points should matter more than others.

Problem 2: Standardize income (z-score). Report how many incomes are outliers using rule |z|> 3. Handle NaNs correctly (do not drop entire rows unnecessarily).

In [14]:
# Z-score standardization
mean_salary = df["salary"].mean(skipna=True)
std_salary = df["salary"].std(skipna=True)
# Compute z-scores for Income
df["salary_z"] = (df["salary"] - mean_salary) / std_salary
# Identify outliers using |z| > 3
outliers = df[(df["salary_z"].abs() > 3)]

In [15]:
print("\nStandardized Income (z-scores):\n")
print(df[[ "salary", "salary_z"]].head(15))

print(f"\nNumber of outliers = {outliers.shape[0]}")
print("\nOutlier Rows:\n")
print(outliers)


Standardized Income (z-scores):

      salary  salary_z
0   443974.0  1.347829
1   470625.0  1.527096
2   207283.0 -0.244264
3   201769.0 -0.281354
4    42065.0 -1.355597
5   228165.0 -0.103802
6   104299.0 -0.936982
7   430822.0  1.259362
8    16534.0 -1.527330
9        NaN       NaN
10   45397.0 -1.333184
11  262750.0  0.128833
12       NaN       NaN
13  268160.0  0.165223
14  488005.0  1.644002

Number of outliers = 0

Outlier Rows:

Empty DataFrame
Columns: [age, salary, increment, Age, salary_z]
Index: []


Problem 3: Create age bins: [18-25), [25-35), [35-45), [45-60) and compute for each bin: ● count of observations, ● mean income, ● median score. Show result as a tidy DataFrame sorted by age bin.

In [17]:
bins = [18, 25, 35, 45, 60]
labels = ["18-25", "25-35", "35-45", "45-60"]

df["Age_Bin"] = pd.cut(df["Age"], bins=bins, labels=labels, right=False)
result = df.groupby("Age_Bin").agg(
    Count=("salary", "count"),
    Mean_Income=("salary", "mean"),
    Median_Income=("salary", "median")
).reset_index()
result = result.sort_values("Age_Bin").reset_index(drop=True)

  result = df.groupby("Age_Bin").agg(


In [18]:
print("\nResult by Age Bin:\n")
print(result)


Result by Age Bin:

  Age_Bin  Count  Mean_Income  Median_Income
0   18-25      0          NaN            NaN
1   25-35      0          NaN            NaN
2   35-45      0          NaN            NaN
3   45-60      0          NaN            NaN


Problem 4: Create an array it cannot be of 1 Dimension. And then showcase the operation for the following: ● Shape and Resize → shape, size, Transpose, Flatten ● Showcasing negative indexing and display error while doing slicing ● Arithmetic Operations → Broadcasting, Dot Product ● Linear Algebra → Determinant, Inverse

In [19]:
arr = np.array([[1, 2, 3],
                [4, 5, 6],
                [7, 8, 9]], dtype=float)
print("Original Array:\n", arr)

Original Array:
 [[1. 2. 3.]
 [4. 5. 6.]
 [7. 8. 9.]]


In [20]:
# Shape and Resize
print("\nShape of array:", arr.shape)
print("Size of array:", arr.size)
print("Transpose of array:\n", arr.T)
print("Flattened array:\n", arr.flatten())


Shape of array: (3, 3)
Size of array: 9
Transpose of array:
 [[1. 4. 7.]
 [2. 5. 8.]
 [3. 6. 9.]]
Flattened array:
 [1. 2. 3. 4. 5. 6. 7. 8. 9.]


In [21]:
# Negative Indexing
print("\nLast row using negative indexing:", arr[-1])
print("Last element using negative indexing:", arr[-1, -1])


Last row using negative indexing: [7. 8. 9.]
Last element using negative indexing: 9.0


In [22]:
# Error in slicing
try:
    print(arr[-5])
except IndexError as e:
    print("\nIndexError:", e)


IndexError: index -5 is out of bounds for axis 0 with size 3


In [23]:
# Arithmetic Operations
# Broadcasting (adding scalar)
print("\nBroadcasting (arr + 5):\n", arr + 5)

# Dot product (matrix multiplication)
dot_product = np.dot(arr, arr)
print("\nDot Product (arr x arr):\n", dot_product)

# Linear Algebra Operations
det = np.linalg.det(arr)
print("\nDeterminant:", det)

# Inverse (only if determinant != 0)
if det != 0:
    inv = np.linalg.inv(arr)
    print("Inverse:\n", inv)
else:
    print("Matrix is singular, inverse does not exist")


Broadcasting (arr + 5):
 [[ 6.  7.  8.]
 [ 9. 10. 11.]
 [12. 13. 14.]]

Dot Product (arr x arr):
 [[ 30.  36.  42.]
 [ 66.  81.  96.]
 [102. 126. 150.]]

Determinant: 0.0
Matrix is singular, inverse does not exist
