In [1]:
import numpy as np
import pandas as pd

### Question1 - Pandas for Data Analysis

In [2]:
# a. Data Loading, Datatime Conversion and Feature Extraction
# Load the dataset from the GitHub link. Convert the Date column to datetime format, and create new columns for Year, Month, Day, and Day_of_Week.


url = 'https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/eletronic_sales.csv'
data = pd.read_csv(url)


data['Date'] = pd.to_datetime(data['Date'])
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day
data['Day_of_Week'] = data['Date'].dt.day_name()
data.head()

Unnamed: 0,Date,Branch,Sales Agent,Products,Units,Price,Year,Month,Day,Day_of_Week
0,2014-09-01,Woji,Chinedu,Apple,2,125.0,2014,9,1,Monday
1,2015-06-17,Woji,Emeka,Apple,5,125.0,2015,6,17,Wednesday
2,2015-09-10,Woji,Ibrahim,Lenovo,7,1.29,2015,9,10,Thursday
3,2015-11-17,Woji,Tolu,HP,11,4.99,2015,11,17,Tuesday
4,2015-10-31,Woji,Tonye,Lenovo,14,1.29,2015,10,31,Saturday


In [6]:
# b. Branch-Level Total Sales
# Calculate the total sales for each branch, where Total Sales = Units x Price. Return a new DataFrame showing Branch and Total_Sales.

total_sales = data.copy()
total_sales['Total_Sales'] = total_sales['Units'] * total_sales['Price']
branch_sales = total_sales.groupby('Branch')['Total_Sales'].sum().reset_index()
branch_sales

Unnamed: 0,Branch,Total_Sales
0,GRA,6002.09
1,Town,2486.72
2,Woji,11139.07


In [None]:
# c. Top Performing Sales Agent
# Determine the top-performing sales agent based on total sales across all brancehs. Display both the agent's name and their total sales amount.

agent_sales = total_sales.groupby('Sales Agent')['Total_Sales'].sum().reset_index()
top_agent = agent_sales.loc[agent_sales['Total_Sales'].idxmax()]
top_agent

Sales Agent      Emeka
Total_Sales    3109.44
Name: 3, dtype: object

In [10]:
# d. Introducing and Filling Missing Values
# Using NumPy, introduce missing values in the Price column for rows 5, 15, and 25. After that, fill the missing values using the median of the Price column.

# Introduce missing values in rows 5, 15, and 25.
data.loc[[5, 15, 25], 'Price'] = np.nan

print("DataFrame with missing values:")
print(data)

# Calculate the median of the Price column
price_median = data['Price'].median()

# Fill the missing values with the calculated median
data['Price'] = data['Price'].fillna(price_median)

print("\nDataFrame after filling missing values with the median:")
data.iloc[[5, 15, 25]]


DataFrame with missing values:
         Date Branch Sales Agent Products  Units   Price  Year  Month  Day  \
0  2014-09-01   Woji     Chinedu    Apple      2  125.00  2014      9    1   
1  2015-06-17   Woji       Emeka    Apple      5  125.00  2015      6   17   
2  2015-09-10   Woji     Ibrahim   Lenovo      7    1.29  2015      9   10   
3  2015-11-17   Woji        Tolu       HP     11    4.99  2015     11   17   
4  2015-10-31   Woji       Tonye   Lenovo     14    1.29  2015     10   31   
5  2014-02-26   Woji     Ibrahim   Compaq     27     NaN  2014      2   26   
6  2014-10-05   Woji      George       HP     28    8.99  2014     10    5   
7  2015-12-21   Woji       Tonye       HP     28    4.99  2015     12   21   
8  2014-02-09   Woji        Tolu   Lenovo     36    4.99  2014      2    9   
9  2015-08-07   Woji       Emeka     Dell     42   23.95  2015      8    7   
10 2015-01-15   Woji     Ibrahim       HP     46    8.99  2015      1   15   
11 2014-01-23   Woji       Emeka 

Unnamed: 0,Date,Branch,Sales Agent,Products,Units,Price,Year,Month,Day,Day_of_Week
5,2014-02-26,Woji,Ibrahim,Compaq,27,4.99,2014,2,26,Wednesday
15,2015-04-10,Woji,Tonye,Lenovo,66,4.99,2015,4,10,Friday
25,2014-11-08,GRA,Chioma,Compaq,15,4.99,2014,11,8,Saturday


In [11]:
# e. Productive-Level Summary

product_summary = data.groupby('Products').agg(
    Average_Price=('Price', 'mean'),
    Total_Units_Sold=('Units', 'sum')
).reset_index()
product_summary


Unnamed: 0,Products,Average_Price,Total_Units_Sold
0,Apple,175.0,10
1,Compaq,5.19,278
2,Dell,11.912857,395
3,HP,11.524,722
4,Lenovo,3.005385,716


### Question2 - NumPy for numeric computation

In [12]:
# a. Array Creation and Basic a. Array Creation and Basic Manipulation (6 marks)
# Create a NumPy array containing 20 random integers between 10 and 100. Then perform the following tasks:
# Reshape the array into a 4x5 matrix.
# Extract the first two rows and last three columns from the reshaped array.
# Create a NumPy array containing 20 random integers between 10 and 100

arr = np.random.randint(10, 101, size=20)

# Reshape the array into a 4x5 matrix
arr_reshaped = arr.reshape(4, 5)

# Extract the first two rows and last three columns
extracted = arr_reshaped[:2, -3:]

# Compute the mean and standard deviation of the entire array
mean_val = arr.mean()
std_val = arr.std()

print("Original array:\n", arr)
print("\nReshaped 4x5 matrix:\n", arr_reshaped)
print("\nFirst two rows, last three columns:\n", extracted)
print(f"\nMean: {mean_val}, Standard Deviation: {std_val}")


Original array:
 [54 57 50 67 92 34 89 82 47 53 59 87 43 41 68 83 83 38 81 60]

Reshaped 4x5 matrix:
 [[54 57 50 67 92]
 [34 89 82 47 53]
 [59 87 43 41 68]
 [83 83 38 81 60]]

First two rows, last three columns:
 [[50 67 92]
 [82 47 53]]

Mean: 63.4, Standard Deviation: 18.21098569545317


In [13]:
# b. Operations on 2D Arrays
# Simulate a 2D array representing students' scores in 5 subjects (10 students).
# Calculate the average score per student.
# Determine the highest and lowest score in the dataset.
scores = np.random.randint(40, 101, size=(10, 5))
avg_per_student = scores.mean(axis=1)
highest_score = scores.max()
lowest_score = scores.min()

print("Scores:\n", scores)
print("\nAverage score per student:\n", avg_per_student)
print(f"\nHighest score: {highest_score}, Lowest score: {lowest_score}")

Scores:
 [[ 83  48  57  63  83]
 [ 90  91  53  68  40]
 [ 73  47  67  57  83]
 [ 51  66  93  46  44]
 [ 83  85  78  89  74]
 [ 53  42  76  46  50]
 [ 91  48  49  74  66]
 [ 98  74  71  57  53]
 [ 86  87  53  75  92]
 [ 70  43  78  46 100]]

Average score per student:
 [66.8 68.4 65.4 60.  81.8 53.4 65.6 70.6 78.6 67.4]

Highest score: 100, Lowest score: 40


In [14]:
# c. Working with Arrays
# Create a 3D NumPy array with dimensions (3, 4, 2) filled with random integers between 1 and 20
arr3d = np.random.randint(1, 21, size=(3, 4, 2))

# Find the sum of elements across the second axis (axis=1)
sum_axis1 = arr3d.sum(axis=1)

# Compute the maximum value along each layer (axis=2)
max_per_layer = arr3d.max(axis=2)

# Flatten the entire 3D array into a 1D array
flattened = arr3d.flatten()

print("3D Array:\n", arr3d)
print("\nSum across the second axis:\n", sum_axis1)
print("\nMaximum value along each layer:\n", max_per_layer)
print("\nFlattened array:\n", flattened)

3D Array:
 [[[ 3 14]
  [17 10]
  [ 6 20]
  [20 20]]

 [[12 20]
  [19  3]
  [13 18]
  [16  4]]

 [[ 4 18]
  [15  5]
  [ 9 14]
  [13 13]]]

Sum across the second axis:
 [[46 64]
 [60 45]
 [41 50]]

Maximum value along each layer:
 [[14 17 20 20]
 [20 19 18 16]
 [18 15 14 13]]

Flattened array:
 [ 3 14 17 10  6 20 20 20 12 20 19  3 13 18 16  4  4 18 15  5  9 14 13 13]


### Question3 - Statistics for statistical analysis

**a. Measures of Center and Spread**

Given the dataset of $CO_2$ emissions (in metric tons per capita) from five countries: [25.4, 30.2, 22.5, 28.1, 35.0]
(a) Compute the mean, median, and mode.
(b) Determine the range and standard deviation.
(c) Comment briefly on the spread of the data.

### a. Measures of Center and Spread
We are given the dataset of $CO_2$ emissions (in metric tons per capita) from five countries:
data = [25.4, 30.2, 22.5, 28.1, 35.0]
#### (a) Compute the mean, median, and mode.
i.  *Mean (Average):*
    To find the mean, I'll sum all the values and divide by the number of values (which is 5).
    $$
    \text{Mean} (\bar{x}) = \frac{25.4 + 30.2 + 22.5 + 28.1 + 35.0}{5} = \frac{141.2}{5} = 28.24
    $$
ii.  *Median (Middle Value):*
    First, I need to sort the data in ascending order.
    Sorted data: [22.5, 25.4, 28.1, 30.2, 35.0]
    The median is the middle value. Since there are 5 values, the middle one is the 3rd value.
    $$
    \text{Median} = 28.1
    $$
iii.  *Mode (Most Frequent Value):*
    The mode is the number that appears most often in the dataset. Looking at our data, every value appears only once.
    $$
    \text{Mode} = \text{No mode}
    $$

**b. Hypothesis Testing**

Two samples of beef consumption (kg/person/year) are given:
-Argentina: [60, 62, 58, 63, 59]
-Bangladesh: [15, 12, 18, 14, 16]
Perform a two-sample t-test at a 5% significance level ($\alpha=0.05$) to determine whether there is a significant difference in mean beef consumption between the two countries.
- (a) State the null hypothesis ($H_0$) and the alternative hypothesis ($H_1$) clearly.
- (b) Compute the t-statistic and the p-value using your notebook.


We are given two samples of beef consumption (kg/person/year) and asked to perform a two-sample t-test at a 5% significance level ($\alpha=0.05$).
* *Argentina:* [60, 62, 58, 63, 59]
* *Bangladesh:* [15, 12, 18, 14, 16]
#### (a) State the null hypothesis ($H_0$) and the alternative hypothesis ($H_1$) clearly.
Let $\mu_A$ be the true mean beef consumption for Argentina and $\mu_B$ be the true mean beef consumption for Bangladesh.
* *Null Hypothesis ($H_0$):* There is no significant difference in the mean beef consumption between Argentina and Bangladesh.
    $$ H_0: \mu_A = \mu_B $$
* *Alternative Hypothesis ($H_1$):* There is a significant difference in the mean beef consumption between Argentina and Bangladesh.
    $$ H_1: \mu_A \neq \mu_B $$
#### (b) Compute the t-statistic and the p-value using your notebook.
The two-sample independent t-test is performed using the provided data for beef consumption in Argentina and Bangladesh.
* *Argentina:* [60, 62, 58, 63, 59]
* *Bangladesh:* [15, 12, 18, 14, 16]

After running this code in the notebook, the following results are obtained:
* *t-statistic:* ≈33.29
* *p-value:* ≈1.39×10−8 (which is 0.0000000139)

####  (c) State your conclusion based on the p-value.

With a p-value of $$ 1.39\times 10^{-8} $$, which is far below the 0.05 significance level, there is overwhelming statistical evidence to conclude that the beef consumption is significantly different between Argentina and Bangladesh. This observed difference is highly unlikely to have occurred by random chance.

#### Question4 - Linear Algebra

In [15]:
# We are analyzing the performance of 4 students in 3 subjects: Mathematics, English, and Science. The data is represented by the matrix $A$: $$A = \begin{bmatrix} 80 & 70 & 90 \ 60 & 85 & 75 \ 95 & 88 & 92 \ 70 & 60 & 65 \end{bmatrix}$$ Each row represents a student, and each column represents a subject (Math, English, Science).

# a. Total Scores per Student
# Compute the total score for each student by summing the elements in each row. Present your result as a $4 \times 1$ column vector.
A = np.array([[80, 70, 90],
                [60, 85, 75],
                [95, 88, 92],
                [70, 60, 65]])
total_scores = A.sum(axis=1).reshape(-1, 1)
total_scores

array([[240],
       [220],
       [275],
       [195]])

In [16]:
# b. Average Scores per Subject
# Compute the average score for each subject by calculating the mean of each column of matrix $A$. Present your result as a $1 \times 3$ row vector representing the averages for Math, English, and Science.

average_scores = A.mean(axis=0).reshape(1, -1)
average_scores

array([[76.25, 75.75, 80.5 ]])

In [17]:
# c. Weighted Final Grades
# The subjects have importance weights given by the vector $w = [0.5, 0.3, 0.2]$. Use matrix multiplication to compute each student's weighted final grade. The operation is $G = A w^T$. Show the resulting column vector $G$.

w = np.array([0.5, 0.3, 0.2])
G = A @ w.reshape(-1, 1)
G

array([[79. ],
       [70.5],
       [92.3],
       [66. ]])

In [18]:
# d. Applying subject Importance
# Suppose Mathematics is considered twice as important as English and Science.
# (a) Create a new matrix $A'$ by performing a scalar multiplication on the Math column of $A$ (multiply the first column by 2).
# (b) Recompute the total score for each student using this new matrix $A'$.
# (c) Compare the new totals to those from Question 6 and briefly discuss the changes.

# (a) Multiply the Math column (first column) by 2 to create A'
A_prime = A.copy()
A_prime[:, 0] = 2 * A_prime[:, 0]

# (b) Recompute the total score for each student using A'
total_scores_prime = A_prime.sum(axis=1).reshape(-1, 1)

print("Original total scores per student:\n", total_scores)
print("\nNew total scores per student with Math weighted double:\n", total_scores_prime)

# (c) Brief discussion:
# The new totals are higher for all students because the Math scores now contribute twice as much.
# Students with higher Math scores benefit more from this weighting.

Original total scores per student:
 [[240]
 [220]
 [275]
 [195]]

New total scores per student with Math weighted double:
 [[320]
 [280]
 [370]
 [265]]
