In [6]:
import numpy as np
import pandas as pd
import numpy.linalg as la

# Seed NumPy's legacy global RNG so that np.random.* draws are reproducible
SEED = 42
np.random.seed(SEED)


In [7]:
# Sample size
n = 120

# Three synthetic columns drawn from NumPy's global RNG.
# The draw order (ages, incomes, scores) determines the values, so it must not change.
ages = np.random.randint(low=18, high=60, size=n).astype(float)  # integers 18–59, cast to float so NaN fits
incomes = np.random.lognormal(mean=10.5, sigma=0.8, size=n) / 1000  # lognormal draw divided by 1000
scores = np.random.normal(loc=70, scale=12, size=n)  # normal draw, loc 70, scale 12

# Knock out a fixed number of distinct positions per column.
# A separate seeded Generator picks the positions; the order of the three
# choice() calls (ages, incomes, scores) is part of what makes them reproducible.
rng = np.random.default_rng(42)
for _column, _n_missing in ((ages, 4), (incomes, 12), (scores, 8)):
    _column[rng.choice(n, _n_missing, replace=False)] = np.nan

# Assemble the frame and name its index "id"
df = pd.DataFrame({"age": ages, "income": incomes, "score": scores}).rename_axis("id")

df.head()


Unnamed: 0_level_0,age,income,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,56.0,120.132118,68.510411
1,46.0,47.597951,78.88409
2,32.0,539.197393,64.570453
3,25.0,17.390639,79.324598
4,38.0,26.399208,82.546854


In [8]:
# (a) and (b): mean and median of income; pandas reductions skip NaN by default
mean_income = df["income"].mean()
median_income = df["income"].median()

# (c) Income averaged with age as the weight, using only rows where
# both income and age are present
mask = df[["income", "age"]].notna().all(axis=1)
weighted_mean_income = np.average(
    df.loc[mask, "income"],
    weights=df.loc[mask, "age"],
)

for _label, _value in (
    ("Mean Income:", mean_income),
    ("Median Income:", median_income),
    ("Age-Weighted Mean Income:", weighted_mean_income),
):
    print(_label, _value)


Mean Income: 55.99891492478127
Median Income: 38.88200866264148
Age-Weighted Mean Income: 55.333566501511605


In [9]:
# Standardise income using its mean and population standard deviation (ddof=0).
# Rows with a missing income keep NaN in income_z.
mu = df["income"].mean()
sigma = df["income"].std(ddof=0)
df["income_z"] = df["income"].sub(mu).div(sigma)

# Flag rows lying more than 3 standard deviations from the mean
# (NaN z-scores compare False, so they are never flagged)
outliers = df["income_z"].abs().gt(3)
outlier_count = outliers.sum()

(outlier_count, df[outliers].head())


(np.int64(3),
       age      income      score  income_z
 id                                        
 2    32.0  539.197393  64.570453  7.375844
 81   42.0  276.877424  85.879166  3.371628
 112  52.0  323.688764  76.620844  4.086185)

In [10]:
# Age bands are half-open, [lower, upper): with right=False an age of 25
# lands in "[25-35)", not "[18-25)".
bins = [18, 25, 35, 45, 60]
labels = ["[18-25)", "[25-35)", "[35-45)", "[45-60)"]

df["age_bin"] = pd.cut(df["age"], bins=bins, labels=labels, right=False)

# Per-band summary. Rows whose age is NaN get no band and drop out of the groupby.
#
# observed=False is passed explicitly: age_bin is categorical, and relying on the
# default raises a FutureWarning in pandas 2.x because that default is changing.
# Keeping False means every band stays in the table even if it ends up empty.
#
# The built-in "count" / "mean" / "median" reducers already skip NaN, so no
# per-group lambdas are needed ("count" is the number of non-missing ages).
summary = (
    df.groupby("age_bin", observed=False)
    .agg(
        count=("age", "count"),
        mean_income=("income", "mean"),
        median_score=("score", "median"),
    )
    .reset_index()
)

summary


  summary = df.groupby("age_bin").agg(


Unnamed: 0,age_bin,count,mean_income,median_score
0,[18-25),21,52.284805,73.70953
1,[25-35),29,67.528696,71.036066
2,[35-45),30,52.750284,70.659219
3,[45-60),36,51.033718,71.542612


In [11]:
# Create a reproducible 3×3 matrix of integers 1–9, stored as floats.
# NOTE: this re-seeds NumPy's global RNG, so any later np.random.* draw
# continues from this seed.
np.random.seed(42)
A = np.random.randint(1, 10, size=(3, 3)).astype(float)

# Basic attributes and views
print("Matrix A:\n", A)
print("\nShape:", A.shape)
print("Size:", A.size)
print("Transpose:\n", A.T)
print("Flatten:\n", A.ravel())
print("\nNegative Indexing A[-1, -1]:", A[-1, -1])

# Demonstrate a slicing error: a slice step of 0 is rejected with ValueError.
# Only that exception is caught, so an unrelated failure would surface
# instead of being printed as a "Slicing Error".
try:
    A[:, ::0]   # invalid slice (step=0)
except ValueError as e:
    print("\nSlicing Error:", e)

# Broadcasting: the length-3 vector is added to every row of A
row_vec = np.array([1, 2, 3])
print("\nBroadcasting (A + row_vec):\n", A + row_vec)

# Matrix product, written with @ so the code matches the printed label
print("\nDot Product (A @ A.T):\n", A @ A.T)

# Determinant
detA = la.det(A)
print("\nDeterminant:", detA)

# Inverse: attempted only when the determinant is not numerically zero
if abs(detA) > 1e-9:
    print("\nInverse of A:\n", la.inv(A))
else:
    print("\nMatrix is singular; inverse cannot be computed.")


Matrix A:
 [[7. 4. 8.]
 [5. 7. 3.]
 [7. 8. 5.]]

Shape: (3, 3)
Size: 9
Transpose:
 [[7. 5. 7.]
 [4. 7. 8.]
 [8. 3. 5.]]
Flatten:
 [7. 4. 8. 5. 7. 3. 7. 8. 5.]

Negative Indexing A[-1, -1]: 5.0

Slicing Error: slice step cannot be zero

Broadcasting (A + row_vec):
 [[ 8.  6. 11.]
 [ 6.  9.  6.]
 [ 8. 10.  8.]]

Dot Product (A @ A.T):
 [[129.  87. 121.]
 [ 87.  83. 106.]
 [121. 106. 138.]]

Determinant: -11.00000000000003

Inverse of A:
 [[-1.         -4.          4.        ]
 [ 0.36363636  1.90909091 -1.72727273]
 [ 0.81818182  2.54545455 -2.63636364]]


In [12]:
# Preview the first ten rows of the frame
df.head(n=10)


Unnamed: 0_level_0,age,income,score,income_z,age_bin
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,56.0,120.132118,68.510411,0.978969,[45-60)
1,46.0,47.597951,78.88409,-0.128238,[45-60)
2,32.0,539.197393,64.570453,7.375844,[25-35)
3,25.0,17.390639,79.324598,-0.589341,[25-35)
4,38.0,26.399208,82.546854,-0.451828,[35-45)
5,56.0,34.589614,65.894299,-0.326805,[45-60)
6,36.0,11.672602,58.887441,-0.676625,[35-45)
7,40.0,83.616261,,0.421568,[35-45)
8,28.0,74.818914,78.521311,0.28728,[25-35)
9,28.0,36.871725,71.109746,-0.291969,[25-35)
