In [2]:
import numpy as np
import pandas as pd

np.random.seed(42)
n_rows = 100

data = {
    'age': np.random.randint(10,99 , size=n_rows),
    'salary': np.random.randint(1000, 200000, size=n_rows),
    'increment': np.random.choice([3, 4, 5, 6], size=n_rows)
}

df = pd.DataFrame(data)
print(df.head())


   age  salary  increment
0   61    3568          6
1   24  121151          6
2   81  194664          5
3   70  199635          5
4   30  134767          4


In [4]:
df.loc[np.random.choice(df.index, 2, replace=False), "salary"] = np.nan
df.loc[np.random.choice(df.index, 3, replace=False), "Age"] = np.nan

print("Synthetic Dataset with NaN values:\n")
print(df.head(15))

Synthetic Dataset with NaN values:

    age    salary  increment  Age
0    61    3568.0          6  NaN
1    24  121151.0          6  NaN
2    81  194664.0          5  NaN
3    70  199635.0          5  NaN
4    30  134767.0          4  NaN
5    92  125375.0          6  NaN
6    96  180262.0          6  NaN
7    84  137330.0          6  NaN
8    84   40504.0          3  NaN
9    97  165231.0          6  NaN
10   33   14986.0          3  NaN
11   12   62858.0          4  NaN
12   31  130312.0          3  NaN
13   62   13666.0          4  NaN
14   11   39660.0          6  NaN


In [3]:
# Mean of income
mean_salary = df["salary"].mean(skipna=True)
# Median of income
median_salary = df["salary"].median(skipna=True)
# Age-weighted mean of income
# Formula: (Σ (Income * Age)) / (Σ Age)
age_weighted_mean = (df["salary"] * df["Age"]).sum(skipna=True) / df["Age"].sum(skipna=True)


  age_weighted_mean = (df["salary"] * df["Age"]).sum(skipna=True) / df["Age"].sum(skipna=True)


In [4]:
print("\nResults:")
print(f"Mean salary = {mean_salary:.3f}")
print(f"Median Income = {median_salary:.3f}")
print(f"Age-Weighted Mean salary = {age_weighted_mean:.3f}")


Results:
Mean salary = 253933.793
Median Income = 245947.000
Age-Weighted Mean salary = nan


:Q Explain when weighted mean is preferable.
Ans:Use weighted mean when some data should matter more than others.


Problem 2: Standardize income (z-score). Report how many incomes are outliers using rule |z|
> 3. Handle NaNs correctly (do not drop entire rows unnecessarily).


In [5]:
# Z-score standardization
mean_salary = df["salary"].mean(skipna=True)
std_salary = df["salary"].std(skipna=True)
# Compute z-scores for Income
df["salary_z"] = (df["salary"] - mean_salary) / std_salary
# Identify outliers using |z| > 3
outliers = df[(df["salary_z"].abs() > 3)]

In [6]:
print("\nStandardized Income (z-scores):\n")
print(df[[ "salary", "salary_z"]].head(15))

print(f"\nNumber of outliers = {outliers.shape[0]}")
print("\nOutlier Rows:\n")
print(outliers)


Standardized Income (z-scores):

      salary  salary_z
0   492879.0  1.635701
1    45397.0 -1.427540
2   262750.0  0.060351
3   358531.0  0.716021
4   268160.0  0.097386
5   488005.0  1.602336
6    60591.0 -1.323529
7   225884.0 -0.192015
8    28247.0 -1.544940
9        NaN       NaN
10  472281.0  1.494697
11   87798.0 -1.137283
12       NaN       NaN
13  353951.0  0.684668
14  279329.0  0.173843

Number of outliers = 0

Outlier Rows:

Empty DataFrame
Columns: [age, salary, increment, Age, salary_z]
Index: []


Problem 3: Create age bins: [18-25), [25-35), [35-45), [45-60) and compute for each bin:
● count of observations,
● mean income,
● median score.
Show result as a tidy DataFrame sorted by age bin.


In [7]:
bins = [15, 25, 34, 45, 55]
labels = ["15-25", "25-34", "34-45", "45-55"]

df["Age_Bin"] = pd.cut(df["Age"], bins=bins, labels=labels, right=False)
result = df.groupby("Age_Bin").agg(
    Count=("salary", "count"),
    Mean_Income=("salary", "mean"),
    Median_Income=("salary", "median")
).reset_index()
result = result.sort_values("Age_Bin").reset_index(drop=True)

  result = df.groupby("Age_Bin").agg(


In [8]:
print("\nResult by Age Bin:\n")
print(result)


Result by Age Bin:

  Age_Bin  Count  Mean_Income  Median_Income
0   15-25      0          NaN            NaN
1   25-34      0          NaN            NaN
2   34-45      0          NaN            NaN
3   45-55      0          NaN            NaN


Problem 4: Create an array it cannot be of 1 Dimension. And then showcase the operation for
the following:
● Shape and Resize → shape, size, Transpose, Flatten
● Showcasing negative indexing and display error while doing slicing
● Arithmetic Operations → Broadcasting, Dot Product
● Linear Algebra → Determinant, Inverse
                                                                                       

In [9]:
arr = np.array([[1, 2, 3],
                [4, 5, 6],
                [7, 8, 9]], dtype=float)
print("Original Array:\n", arr)

Original Array:
 [[1. 2. 3.]
 [4. 5. 6.]
 [7. 8. 9.]]


In [10]:
# Shape and Resize
print("\nShape of array:", arr.shape)
print("Size of array:", arr.size)
print("Transpose of array:\n", arr.T)
print("Flattened array:\n", arr.flatten())


Shape of array: (3, 3)
Size of array: 9
Transpose of array:
 [[1. 4. 7.]
 [2. 5. 8.]
 [3. 6. 9.]]
Flattened array:
 [1. 2. 3. 4. 5. 6. 7. 8. 9.]


In [11]:
# Negative Indexing
print("\nLast row using negative indexing:", arr[-1])
print("Last element using negative indexing:", arr[-1, -1])


Last row using negative indexing: [7. 8. 9.]
Last element using negative indexing: 9.0


In [12]:
# Error in slicing
try:
    print(arr[-7])
except IndexError as e:
    print("\nIndexError:", e)


IndexError: index -7 is out of bounds for axis 0 with size 3


In [13]:
# Arithmetic Operations
# Broadcasting (adding scalar)
print("\nBroadcasting (arr + 5):\n", arr + 5)

# Dot product (matrix multiplication)
dot_product = np.dot(arr, arr)
print("\nDot Product (arr x arr):\n", dot_product)

# Linear Algebra Operations
det = np.linalg.det(arr)
print("\nDeterminant:", det)

# Inverse (only if determinant != 0)
if det != 0:
    inv = np.linalg.inv(arr)
    print("Inverse:\n", inv)
else:
    print("Matrix is singular, inverse does not exist")


Broadcasting (arr + 5):
 [[ 6.  7.  8.]
 [ 9. 10. 11.]
 [12. 13. 14.]]

Dot Product (arr x arr):
 [[ 30.  36.  42.]
 [ 66.  81.  96.]
 [102. 126. 150.]]

Determinant: 0.0
Matrix is singular, inverse does not exist
