In [44]:
import numpy as np

# Student scores dataset (rows = students, columns = subjects)
# Columns: [Math, Science, English, History, Art]
scores = np.array([
    [78, 85, 90, 66, 59],
    [92, 88, 95, 77, 72],
    [63, 70, 65, 80, 75],
    [88, 92, 94, 85, 83],
    [70, 68, 75, 60, 55],
    [82, 79, 88, 90, 91],
    [56, 60, 55, 50, 45],
    [90, 95, 98, 89, 84],
])


## Basic Array Practice

In [45]:
# Q 1. Print the shape, size, and data type of scores.
print(f"The shape of scores: {scores.shape}")
print(f"The size of scores data: {scores.size}")
print(f"The dataype of scores: {scores.dtype}")

The shape of scores: (8, 5)
The size of scores data: 40
The dataype of scores: int64


In [46]:
#Q 2. Extract all Math scores (first column).
print(f"All math scores are: \n{scores[:, 0]}")

All math scores are: 
[78 92 63 88 70 82 56 90]


In [47]:
# Q 3. Find the average English score.
avg = np.mean(scores[:,2])
print(f"The average English score is: {avg}")
print(f"The average English score is: {np.mean(scores[:,2])}")

The average English score is: 82.5
The average English score is: 82.5


In [48]:
# Q 4. Replace any score below 60 with np.nan
scores_float = scores.astype(float)
scores_float[scores_float < 60] = np.nan 
print(scores_float)


[[78. 85. 90. 66. nan]
 [92. 88. 95. 77. 72.]
 [63. 70. 65. 80. 75.]
 [88. 92. 94. 85. 83.]
 [70. 68. 75. 60. nan]
 [82. 79. 88. 90. 91.]
 [nan 60. nan nan nan]
 [90. 95. 98. 89. 84.]]


In [49]:
# Q 5 Count how many scores are NaN
print(f"The number of null value in data: {np.isnan(scores_float).sum()}")

The number of null value in data: 6


## Array Operations

In [50]:
# Q 6. Compute the mean, median, and standard deviation for each subject.
#  [Math, Science, English, History, Art]
math_mean = np.mean(scores[:,0])
math_median = np.median(scores[:,0])
math_SD = np.std(scores[:,0])
print(f"Maths subject's Mean: {math_mean:.2f} | Median: {math_median:.2f} | Standard Deviation: {math_SD:.2f}")

science_mean = np.mean(scores[:,1])
science_median = np.median(scores[:,1])
science_SD = np.std(scores[:,1])
print(f"Science subject's Mean: {science_mean:.2f} | Median: {science_median:.2f} | Standard Deviation: {science_SD:.2f}")

english_mean = np.mean(scores[:,2])
english_median = np.median(scores[:,2])
english_SD = np.std(scores[:,2])
print(f"English subject's Mean: {english_mean:.2f} | Median: {english_median:.2f} | Standard Deviation: {english_SD:.2f}")

history_mean = np.mean(scores[:,3])
history_median = np.median(scores[:,3])
history_SD = np.std(scores[:,3])
print(f"History subject's Mean: {history_mean:.2f} | Median: {history_median:.2f} | Standard Deviation: {history_SD:.2f}")

art_mean = np.mean(scores[:,4])
art_median = np.median(scores[:,4])
art_SD = np.std(scores[:,4])
print(f"Art subject's Mean: {art_mean:.2f} | Median: {art_median:.2f} | Standard Deviation: {art_SD:.2f}")

# ------------------- OR --------------------------------- 
print("-------------------------------------------------------------------------------------------")

subjects = ["Math", "Science", "English", "History", "Art"]
for i, subject in enumerate(subjects):
    print(f"{subject:8s} -> Mean: {np.mean(scores[:,i]):.2f} | Meadian: {np.median(scores[:,i]):.2f} | Standard Deviation: {np.std(scores[:,i]):.2f}")

Maths subject's Mean: 77.38 | Median: 80.00 | Standard Deviation: 12.38
Science subject's Mean: 79.62 | Median: 82.00 | Standard Deviation: 11.74
English subject's Mean: 82.50 | Median: 89.00 | Standard Deviation: 14.72
History subject's Mean: 74.62 | Median: 78.50 | Standard Deviation: 13.60
Art subject's Mean: 70.50 | Median: 73.50 | Standard Deviation: 15.02
-------------------------------------------------------------------------------------------
Math     -> Mean: 77.38 | Meadian: 80.00 | Standard Deviation: 12.38
Science  -> Mean: 79.62 | Meadian: 82.00 | Standard Deviation: 11.74
English  -> Mean: 82.50 | Meadian: 89.00 | Standard Deviation: 14.72
History  -> Mean: 74.62 | Meadian: 78.50 | Standard Deviation: 13.60
Art      -> Mean: 70.50 | Meadian: 73.50 | Standard Deviation: 15.02


In [51]:
# Q 7. Find the student with the highest total score.
students_total_score = np.sum(scores, axis=1)
print(f"The total score of each students are: {students_total_score}")
highest_score_student = np.max(students_total_score)
print(f"Highest total score of student is: {highest_score_student}")
top_student_index = np.argmax(students_total_score)
print(f"Highest total score is {highest_score_student} by student {top_student_index + 1}")

The total score of each students are: [378 424 353 442 328 430 266 456]
Highest total score of student is: 456
Highest total score is 456 by student 8


In [52]:
# Q 8. Normalize the dataset (scale all values between 0 and 1)
min_score = np.min(scores)
max_score = np.max(scores)

normalize_score = (scores - min_score) / (max_score - min_score)
print(f"The normalize data (scale all values between 0 and 1) are: \n{np.round(normalize_score,2)} ")

print("-------------------------------------------------------------------------------------------")

# More appropriate in data science ML preprocessing.
norm_col = (scores - scores.min(axis=0)) / (scores.max(axis=0) - scores.min(axis=0))
print(f"Normalization column wise: \n{np.round(norm_col,2)}")

The normalize data (scale all values between 0 and 1) are: 
[[0.62 0.75 0.85 0.4  0.26]
 [0.89 0.81 0.94 0.6  0.51]
 [0.34 0.47 0.38 0.66 0.57]
 [0.81 0.89 0.92 0.75 0.72]
 [0.47 0.43 0.57 0.28 0.19]
 [0.7  0.64 0.81 0.85 0.87]
 [0.21 0.28 0.19 0.09 0.  ]
 [0.85 0.94 1.   0.83 0.74]] 
-------------------------------------------------------------------------------------------
Normalization column wise: 
[[0.61 0.71 0.81 0.4  0.3 ]
 [1.   0.8  0.93 0.68 0.59]
 [0.19 0.29 0.23 0.75 0.65]
 [0.89 0.91 0.91 0.88 0.83]
 [0.39 0.23 0.47 0.25 0.22]
 [0.72 0.54 0.77 1.   1.  ]
 [0.   0.   0.   0.   0.  ]
 [0.94 1.   1.   0.98 0.85]]


In [53]:
#Q 9. Add a new “Total” column using np.sum(axis=1).
students_total_score = np.sum(scores, axis=1)
print(f"The total score of each students are: \n{students_total_score}")
reshape_score = students_total_score.reshape(-1, 1)
total_score_col = np.concatenate((scores, reshape_score),axis = 1)
print(f"After adding 'Total' column in scores: \n{total_score_col}")

The total score of each students are: 
[378 424 353 442 328 430 266 456]
After adding 'Total' column in scores: 
[[ 78  85  90  66  59 378]
 [ 92  88  95  77  72 424]
 [ 63  70  65  80  75 353]
 [ 88  92  94  85  83 442]
 [ 70  68  75  60  55 328]
 [ 82  79  88  90  91 430]
 [ 56  60  55  50  45 266]
 [ 90  95  98  89  84 456]]


## Boolean Indexing

In [54]:
#Q 10. Find students who scored above 85 in Science.
science_student = scores[:,1] > 85
score = scores[science_student,1]
indice = np.where(scores[:,1] > 85)[0]
print(f"Students who scored above 85 in Science: {scores[science_student,1]} at index {indice}")

print("------------------- OR --------------------------------------------")

print("Students (rows) who scored >85 in Science:\n", scores[scores[:,1] > 85])


Students who scored above 85 in Science: [88 92 95] at index [1 3 7]
------------------- OR --------------------------------------------
Students (rows) who scored >85 in Science:
 [[92 88 95 77 72]
 [88 92 94 85 83]
 [90 95 98 89 84]]


In [55]:
# Q 11. Find students whose average score > 80.
avg_score = np.mean(scores, axis = 1)
student_avg = np.where(avg_score > 80)[0]
print(f"Average score of students are: {np.round(avg_score,2)}")
print(f"Student who got average greater than 80 whose index are : {student_avg}")

Average score of students are: [75.6 84.8 70.6 88.4 65.6 86.  53.2 91.2]
Student who got average greater than 80 whose index are : [1 3 5 7]


In [56]:
# Q 12. Replace all missing (np.nan) values with the subject mean.
print(f"Data with missing values: \n{scores_float}")
avg_1 = np.nanmean(scores_float, axis = 0).round(2)
indice = np.where(np.isnan(scores_float)) # give index of nan values in data
scores_float[indice] = np.take(avg_1, indice[1]) # fill mean with corresponding columns of array
print(f"Data after filling missing values with subject mean: \n{scores_float}")

print("\n---------------------- gpt version ---------------------------")

col_means = np.nanmean(scores_float, axis=0)
nan_rows, nan_cols = np.where(np.isnan(scores_float))
scores_float[nan_rows, nan_cols] = col_means[nan_cols]
print(scores_float)

Data with missing values: 
[[78. 85. 90. 66. nan]
 [92. 88. 95. 77. 72.]
 [63. 70. 65. 80. 75.]
 [88. 92. 94. 85. 83.]
 [70. 68. 75. 60. nan]
 [82. 79. 88. 90. 91.]
 [nan 60. nan nan nan]
 [90. 95. 98. 89. 84.]]
Data after filling missing values with subject mean: 
[[78.   85.   90.   66.   81.  ]
 [92.   88.   95.   77.   72.  ]
 [63.   70.   65.   80.   75.  ]
 [88.   92.   94.   85.   83.  ]
 [70.   68.   75.   60.   81.  ]
 [82.   79.   88.   90.   91.  ]
 [80.43 60.   86.43 78.14 81.  ]
 [90.   95.   98.   89.   84.  ]]

---------------------- gpt version ---------------------------
[[78.   85.   90.   66.   81.  ]
 [92.   88.   95.   77.   72.  ]
 [63.   70.   65.   80.   75.  ]
 [88.   92.   94.   85.   83.  ]
 [70.   68.   75.   60.   81.  ]
 [82.   79.   88.   90.   91.  ]
 [80.43 60.   86.43 78.14 81.  ]
 [90.   95.   98.   89.   84.  ]]


## Slicing and Broadcasting

In [57]:
# Q 13. Increase all Art marks by 5 points using broadcasting.
scores[:,4] += 5
print(f"Data after increasing the art score by 5 points: \n{scores}")

Data after increasing the art score by 5 points: 
[[78 85 90 66 64]
 [92 88 95 77 77]
 [63 70 65 80 80]
 [88 92 94 85 88]
 [70 68 75 60 60]
 [82 79 88 90 96]
 [56 60 55 50 50]
 [90 95 98 89 89]]


In [60]:
# Q 14. Swap the columns: make English first and Math last.
# Columns: [Math, Science, English, History, Art
scores[:,[0,1,2,3,4]] = scores[:,[2,1,3,4,0]]
print(f"After swapping columns: \n{scores}")

print("---------------------OR------------------")
scores_swapped = scores[:, [2,1,3,4,0]]
print(scores_swapped)

After swapping columns: 
[[66 85 64 78 90]
 [77 88 77 92 95]
 [80 70 80 63 65]
 [85 92 88 88 94]
 [60 68 60 70 75]
 [90 79 96 82 88]
 [50 60 50 56 55]
 [89 95 89 90 98]]
---------------------OR------------------
[[64 85 78 90 66]
 [77 88 92 95 77]
 [80 70 63 65 80]
 [88 92 88 94 85]
 [60 68 70 75 60]
 [96 79 82 88 90]
 [50 60 56 55 50]
 [89 95 90 98 89]]


In [59]:
# Q 15. Create a subset of the first 3 students and last 2 subjects.
scores_subset = scores[:3, 3:]
print(f"Subset of the first 3 student and last 2 subject is: \n{scores_subset}")

Subset of the first 3 student and last 2 subject is: 
[[64 78]
 [77 92]
 [80 63]]
