In [None]:
import numpy as np
import pandas as pd


np.random.seed(42)
n_rows = 100

data = {
    'age': np.random.randint(15, 60, size=n_rows),
    'salary': np.random.randint(10000, 40000, size=n_rows),
    'increment': np.random.choice([5, 6, 7, 8], size=n_rows)
}

df = pd.DataFrame(data)
print(df.head())


   age  salary  increment
0   53   10391          8
1   43   23986          7
2   29   39090          5
3   57   22666          6
4   22   15892          5


In [None]:
df.loc[np.random.choice(df.index, 5, replace=False), "salary"] = np.nan
df.loc[np.random.choice(df.index,8, replace=False), "Age"] = np.nan
print("Synthetic Dataset with NaN values:\n")
print(df.head(12))

Synthetic Dataset with NaN values:

    age   salary  increment  Age
0    53  10391.0          8  NaN
1    43  23986.0          7  NaN
2    29  39090.0          5  NaN
3    57  22666.0          6  NaN
4    22  15892.0          5  NaN
5    35  13561.0          5  NaN
6    53  36854.0          7  NaN
7    33  16184.0          6  NaN
8    37  29483.0          8  NaN
9    25      NaN          8  NaN
10   25  18392.0          7  NaN
11   38  23067.0          7  NaN


In [None]:
mean_salary = df["salary"].mean(skipna=True)   # default skipna=True
median_salary = df["salary"].median(skipna=True)
mask = df["salary"].notna() & df["Age"].notna()
age_weighted_mean = (
    (df.loc[mask, "salary"] * df.loc[mask, "Age"]).sum()
    / df.loc[mask, "Age"].sum()
)

  (df.loc[mask, "salary"] * df.loc[mask, "Age"]).sum()


In [None]:
print("\nResults:")
print(f"Mean salary = {mean_salary:.2f}")
print(f"Median Income = {median_salary:.2f}")
print(f"Age-Weighted Mean salary = {age_weighted_mean:.2f}")


Results:
Mean salary = 26330.69
Median Income = 26312.00
Age-Weighted Mean salary = nan


:Q Explain when weighted mean is preferable.
Ans:Use weighted mean when some data should matter more than others.


Problem 2: Standardize income (z-score). Report how many incomes are outliers using rule |z|
> 3. Handle NaNs correctly (do not drop entire rows unnecessarily).


In [None]:
# Z-score standardization
mean_salary = df["salary"].mean(skipna=True)
std_salary = df["salary"].std(skipna=True)
# Compute z-scores for Income
df["salary_z"] = (df["salary"] - mean_salary) / std_salary
# Identify outliers using |z| > 3
outliers = df[(df["salary_z"].abs() > 3)]

In [None]:
print("\nStandardized Income (z-scores):\n")
print(df[[ "salary", "salary_z"]].head(12))

print(f"\nNumber of outliers = {outliers.shape[0]}")
print("\nOutlier Rows:\n")
print(outliers)


Standardized Income (z-scores):

     salary  salary_z
0   10391.0 -1.912829
1   23986.0 -0.281373
2   39090.0  1.531169
3   22666.0 -0.439778
4   15892.0 -1.252686
5   13561.0 -1.532416
6   36854.0  1.262840
7   16184.0 -1.217645
8   29483.0  0.378290
9       NaN       NaN
10  18392.0 -0.952676
11  23067.0 -0.391657

Number of outliers = 0

Outlier Rows:

Empty DataFrame
Columns: [age, salary, increment, Age, salary_z]
Index: []


Problem 3: Create age bins: [18-25), [25-35), [35-45), [45-60) and compute for each bin:
● count of observations,
● mean income,
● median score.
Show result as a tidy DataFrame sorted by age bin.


In [None]:
bins = [15, 25, 35, 45, 60]
labels = ["15-25", "25-34", "34-45", "45-55"]

df["Age_Bin"] = pd.cut(df["Age"], bins=bins, labels=labels, right=False)
result = df.groupby("Age_Bin").agg(
    Count=("salary", "count"),
    Mean_Income=("salary", "mean"),
    Median_Income=("salary", "median")
).reset_index()
result = result.sort_values("Age_Bin").reset_index(drop=True)

  result = df.groupby("Age_Bin").agg(


In [None]:
print("\nResult by Age Bin:\n")
print(result)


Result by Age Bin:

  Age_Bin  Count  Mean_Income  Median_Income
0   15-25      0          NaN            NaN
1   25-34      0          NaN            NaN
2   34-45      0          NaN            NaN
3   45-55      0          NaN            NaN


Problem 4: Create an array it cannot be of 1 Dimension. And then showcase the operation for
the following:
● Shape and Resize → shape, size, Transpose, Flatten
● Showcasing negative indexing and display error while doing slicing
● Arithmetic Operations → Broadcasting, Dot Product
● Linear Algebra → Determinant, Inverse
                                                                                       

In [None]:
arr = np.array([[2, 6, 8],
                [6, 4, 2],
                [5, 7, 1]], dtype=float)
print("Original Array:\n", arr)

Original Array:
 [[2. 6. 8.]
 [6. 4. 2.]
 [5. 7. 1.]]


In [None]:
# Shape and Resize
print("\nShape of array:", arr.shape)
print("Size of array:", arr.size)
print("Transpose of array:\n", arr.T)
print("Flattened array:\n", arr.flatten())


Shape of array: (3, 3)
Size of array: 9
Transpose of array:
 [[2. 6. 5.]
 [6. 4. 7.]
 [8. 2. 1.]]
Flattened array:
 [2. 6. 8. 6. 4. 2. 5. 7. 1.]


In [None]:
# Negative Indexing
print("\nLast row using negative indexing:", arr[-1])
print("Last element using negative indexing:", arr[-1, -1])


Last row using negative indexing: [5. 7. 1.]
Last element using negative indexing: 1.0


In [None]:
# Error in slicing
try:
    print(arr[-7])
except IndexError as e:
    print("\nIndexError:", e)


IndexError: index -7 is out of bounds for axis 0 with size 3


In [None]:
# Arithmetic Operations
# Broadcasting (adding scalar)
print("\nBroadcasting (arr + 7):\n", arr + 7)

# Dot product (matrix multiplication)
dot_product = np.dot(arr, arr)
print("\nDot Product (arr x arr):\n", dot_product)

# Linear Algebra Operations
det = np.linalg.det(arr)
print("\nDeterminant:", det)

# Inverse (only if determinant != 0)
if det != 0:
    inv = np.linalg.inv(arr)
    print("Inverse:\n", inv)
else:
    print("Matrix is singular, inverse does not exist")


Broadcasting (arr + 7):
 [[ 9. 13. 15.]
 [13. 11.  9.]
 [12. 14.  8.]]

Dot Product (arr x arr):
 [[80. 92. 36.]
 [46. 66. 58.]
 [57. 65. 55.]]

Determinant: 180.0
Inverse:
 [[-0.05555556  0.27777778 -0.11111111]
 [ 0.02222222 -0.21111111  0.24444444]
 [ 0.12222222  0.08888889 -0.15555556]]
