In [55]:
import numpy as np
import pandas as pd
from scipy import stats

## Question 1

In [56]:
"""
a. Data loading, datetime conversion and feature extraction
"""
# Read the CSV directly from its raw GitHub URL
url = 'https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/eletronic_sales.csv'
df = pd.read_csv(url)

# Parse 'Date' into a datetime column so the .dt accessor can be used below
df['Date'] = pd.to_datetime(df['Date'])

# Derive the calendar features from the parsed dates; assign() appends them
# in the order Year, Month, Day, Day_of_Week
sale_dates = df['Date'].dt
df = df.assign(
    Year=sale_dates.year,
    Month=sale_dates.month,
    Day=sale_dates.day,
    Day_of_Week=sale_dates.day_name(),
)

# Show the first 5 rows as a quick sanity check of the new columns
print(df.head())

        Date Branch Sales Agent Products  Units   Price  Year  Month  Day  \
0 2014-09-01   Woji     Chinedu    Apple      2  125.00  2014      9    1   
1 2015-06-17   Woji       Emeka    Apple      5  125.00  2015      6   17   
2 2015-09-10   Woji     Ibrahim   Lenovo      7    1.29  2015      9   10   
3 2015-11-17   Woji        Tolu       HP     11    4.99  2015     11   17   
4 2015-10-31   Woji       Tonye   Lenovo     14    1.29  2015     10   31   

  Day_of_Week  
0      Monday  
1   Wednesday  
2    Thursday  
3     Tuesday  
4    Saturday  


In [57]:
"""
b. Branch-level total sales
"""
# Revenue of each transaction = units sold x unit price
df['Total_Sales'] = df['Units'].mul(df['Price'])

# One row per branch holding the summed revenue; as_index=False keeps
# 'Branch' as a regular column instead of the index
branch_sales = df.groupby('Branch', as_index=False)['Total_Sales'].sum()

# Show the per-branch totals
print(branch_sales)


  Branch  Total_Sales
0    GRA      6002.09
1   Town      2486.72
2   Woji     11139.07


In [58]:
"""
c. Top performing sales agent
"""
# Total sales per agent, grouped on the 'Sales Agent' column
agent_sales = df['Total_Sales'].groupby(df['Sales Agent']).sum()

# Index label of the largest total, then look that total up by label
top_agent_name = agent_sales.idxmax()
top_agent_sales_amount = agent_sales.loc[top_agent_name]

print(f"Top Performing Sales Agent: {top_agent_name}")
print(f"Total Sales Amount: {top_agent_sales_amount}")

Top Performing Sales Agent: Emeka
Total Sales Amount: 3109.44


In [59]:
"""
d. Introducing and filling missing values
"""
# Row labels whose 'Price' is blanked out for this exercise
missing_rows = [5, 15, 25]

# Introduce missing values at the chosen rows.
# The same label-based .loc selector is used for writing and for display,
# so the rows shown are always the rows that were modified (mixing .loc
# and .iloc only agrees when the frame has a default RangeIndex).
df.loc[missing_rows, 'Price'] = np.nan
print("DataFrame with introduced missing values (rows 5, 15, 25):")
print(df.loc[missing_rows])

# median() skips NaN by default, so this is the median of the remaining prices
price_median = df['Price'].median()
print(f"\nMedian of the 'Price' column: {price_median}")

# Fill the gaps with the median via plain column assignment (no inplace=True)
# NOTE: 'Total_Sales' is not recomputed here, so for these three rows it
# still reflects the price that was in place before it was blanked out.
df['Price'] = df['Price'].fillna(price_median)
print("\nDataFrame after filling missing values:")
print(df.loc[missing_rows])

DataFrame with introduced missing values (rows 5, 15, 25):
         Date Branch Sales Agent Products  Units  Price  Year  Month  Day  \
5  2014-02-26   Woji     Ibrahim   Compaq     27    NaN  2014      2   26   
15 2015-04-10   Woji       Tonye   Lenovo     66    NaN  2015      4   10   
25 2014-11-08    GRA      Chioma   Compaq     15    NaN  2014     11    8   

   Day_of_Week  Total_Sales  
5    Wednesday       539.73  
15      Friday       131.34  
25    Saturday       299.85  

Median of the 'Price' column: 4.99

DataFrame after filling missing values:
         Date Branch Sales Agent Products  Units  Price  Year  Month  Day  \
5  2014-02-26   Woji     Ibrahim   Compaq     27   4.99  2014      2   26   
15 2015-04-10   Woji       Tonye   Lenovo     66   4.99  2015      4   10   
25 2014-11-08    GRA      Chioma   Compaq     15   4.99  2014     11    8   

   Day_of_Week  Total_Sales  
5    Wednesday       539.73  
15      Friday       131.34  
25    Saturday       299.85  


In [60]:
"""
e. Product-level summary
"""

# One row per value of 'Products': mean unit price and total units sold.
# as_index=False keeps 'Products' as a regular column in the result.
product_summary = df.groupby('Products', as_index=False).agg(
    Average_Price=('Price', 'mean'),
    Total_Units_Sold=('Units', 'sum'),
)

print(product_summary)

  Products  Average_Price  Total_Units_Sold
0    Apple     175.000000                10
1   Compaq       5.190000               278
2     Dell      11.912857               395
3       HP      11.524000               722
4   Lenovo       3.005385               716


## Question 2

In [61]:
"""
a. Array creation and basic manipulation
"""
# Seeded generator so that Restart & Run All reproduces the same "random" data
rng = np.random.default_rng(seed=42)

# 20 random integers between 10 and 100 (the upper bound 101 is exclusive)
original_array = rng.integers(10, 101, size=20)
print("Original 1D Array:\n", original_array)

# Reshape the array into a 4×5 matrix
reshaped_matrix = original_array.reshape(4, 5)
print("\nReshaped 4x5 Matrix:\n", reshaped_matrix)

# Extract the first two rows and the last three columns
sliced_array = reshaped_matrix[:2, -3:]
print("\nSlice (First 2 rows, Last 3 columns):\n", sliced_array)

# Mean and standard deviation of the whole original array.
# ndarray.std() uses NumPy's default ddof=0, i.e. the population formula.
array_mean = original_array.mean()
array_std = original_array.std()

print(f"\nMean of the array: {np.round(array_mean, 2)}")
print(f"Standard Deviation of the array: {np.round(array_std, 2)}")


Original 1D Array:
 [39 74 87 88 45 43 50 95 84 29 69 54 16 57 89 23 54 52 38 85]

Reshaped 4x5 Matrix:
 [[39 74 87 88 45]
 [43 50 95 84 29]
 [69 54 16 57 89]
 [23 54 52 38 85]]

Slice (First 2 rows, Last 3 columns):
 [[87 88 45]
 [95 84 29]]

Mean of the array: 58.55
Standard Deviation of the array: 23.41


In [62]:
"""
b. Operations on 2D arrays
"""
# Seeded generator so that Restart & Run All reproduces the same "random" data
rng = np.random.default_rng(seed=42)

# Simulate a 10x5 array of scores (10 students, 5 subjects).
# Scores range from 50 to 100 for realism (the upper bound 101 is exclusive).
student_scores = rng.integers(50, 101, size=(10, 5))
print("Student Scores Matrix (10 students x 5 subjects):\n", student_scores)

# Average score per student: axis=1 averages across the 5 subject columns
average_per_student = student_scores.mean(axis=1)
print("\nAverage score per student:\n", np.round(average_per_student, 2))

# Highest and lowest score over the entire matrix (no axis => all elements)
highest_score = student_scores.max()
lowest_score = student_scores.min()

print(f"\nHighest score in the dataset: {highest_score}")
print(f"Lowest score in the dataset: {lowest_score}")

Student Scores Matrix (10 students x 5 subjects):
 [[63 59 70 74 85]
 [53 59 94 57 97]
 [65 75 67 83 67]
 [60 66 65 81 69]
 [57 98 55 75 84]
 [85 95 74 60 93]
 [54 98 79 96 59]
 [90 59 57 57 98]
 [57 71 51 66 90]
 [56 86 86 83 79]]

Average score per student:
 [70.2 72.  71.4 68.2 73.8 81.4 77.2 72.2 67.  78. ]

Highest score in the dataset: 98
Lowest score in the dataset: 51


In [63]:
"""
c. Working with 3D arrays
"""
# Seeded generator so that Restart & Run All reproduces the same "random" data
rng = np.random.default_rng(seed=42)

# 3D array of shape (3, 4, 2) with random integers between 1 and 20
# (the upper bound 21 is exclusive)
array_3d = rng.integers(1, 21, size=(3, 4, 2))
print("Original 3D Array (3 layers, 4 rows, 2 columns):\n", array_3d)

# Sum across the second axis (axis=1): the 4 rows of each layer collapse,
# leaving a (3, 2) result
sum_across_axis1 = array_3d.sum(axis=1)
print("\nSum of elements across the second axis:\n", sum_across_axis1)

# Maximum value within each layer (collapsing axes 1 and 2 => shape (3,))
max_per_layer = np.max(array_3d, axis=(1, 2))
print("\nMaximum value in each layer:\n", max_per_layer)

# Flatten the entire 3D array into a 1D array (flatten() returns a copy)
flattened_array = array_3d.flatten()
print("\nFlattened 1D Array:\n", flattened_array)

Original 3D Array (3 layers, 4 rows, 2 columns):
 [[[12 16]
  [ 9  5]
  [20  4]
  [15  6]]

 [[15 17]
  [13 20]
  [ 9  9]
  [13 13]]

 [[15  9]
  [ 7  3]
  [15 10]
  [20 17]]]

Sum of elements across the second axis:
 [[56 31]
 [50 59]
 [57 39]]

Maximum value in each layer:
 [20 20 20]

Flattened 1D Array:
 [12 16  9  5 20  4 15  6 15 17 13 20  9  9 13 13 15  9  7  3 15 10 20 17]


## Question 3

### a. Measures of Center and Spread


`data = [25.4, 30.2, 22.5, 28.1, 35.0]`

#### (a) Compute the mean, median, and mode.

1.  **Mean:**
    $$
    \text{Mean} = \frac{\text{sum of all values}}{\text{number of values}} = \frac{\sum_{i=1}^{n} x_i}{n}
    $$
    $$
    \therefore \text{Mean} (\bar{x}) = \frac{25.4 + 30.2 + 22.5 + 28.1 + 35.0}{5} = \frac{141.2}{5} = 28.24
    $$

2.  **Median:**

    First, sort the data in ascending order.

    Sorted data: `[22.5, 25.4, 28.1, 30.2, 35.0]`

    The median is the middle value. Since there are 5 values, the middle one is the 3rd value.

    $$
    \therefore \text{Median} = 28.1
    $$

3.  **Mode:**

    The mode is the number that appears most often in the dataset. Looking at our data, every value appears only once.

    $$
    \therefore \text{Mode} = \text{No mode}
    $$

#### (b) Determine the range and standard deviation.

1.  **Range:**
    * Maximum value = 35.0
    * Minimum value = 22.5

    $$
    \therefore \text{Range} = 35.0 - 22.5 = 12.5
    $$

2.  **Standard Deviation:**
    $s = \sqrt{\frac{\sum(x_i - \bar{x})^2}{n-1}}$.

    We already know the mean ($\bar{x}$) is 28.24.

    | $x_i$ | $x_i - \bar{x}$ | $(x_i - \bar{x})^2$ |
    | :---: | :---: | :---: |
    | 25.4  | -2.84 | 8.0656  |
    | 30.2  | 1.96  | 3.8416  |
    | 22.5  | -5.74 | 32.9476 |
    | 28.1  | -0.14 | 0.0196  |
    | 35.0  | 6.76  | 45.6976 |
    | **Sum** | | **90.572** |

    Back to the formula:
    $$
    s = \sqrt{\frac{90.572}{5-1}} = \sqrt{\frac{90.572}{4}} = \sqrt{22.643} \approx 4.758
    $$

    $\therefore$ The standard deviation is $\approx$ **4.76**.


#### (c) Comment briefly on the spread of the data.

The **range** of the data is **12.5**, and the **standard deviation** is **4.76**.

This indicates a **moderate spread** in the CO2 emissions data. The data points are somewhat dispersed around the mean value of 28.24, but not extremely so.

### b. Hypothesis Testing

We are given two samples of beef consumption (kg/person/year) and asked to perform a two-sample t-test at a 5% significance level ($\alpha=0.05$).

* **Argentina:** `[60, 62, 58, 63, 59]`
* **Bangladesh:** `[15, 12, 18, 14, 16]`

#### (a) State the null hypothesis ($H_0$) and the alternative hypothesis ($H_1$) clearly.

Let $\mu_A$ be the true mean beef consumption for Argentina and $\mu_B$ be the true mean beef consumption for Bangladesh.

* **Null Hypothesis ($H_0$):** There is no significant difference in the mean beef consumption between Argentina and Bangladesh.
    $$ H_0: \mu_A = \mu_B $$

* **Alternative Hypothesis ($H_1$):** There is a significant difference in the mean beef consumption between Argentina and Bangladesh.
    $$ H_1: \mu_A \neq \mu_B $$

#### (b) Compute the t-statistic and the p-value using your notebook.

To compute this, I would use `scipy.stats`. I've written the code below:
```python
from scipy import stats

argentina_beef = [60, 62, 58, 63, 59]
bangladesh_beef = [15, 12, 18, 14, 16]

# Perform the two-sample t-test
t_statistic, p_value = stats.ttest_ind(argentina_beef, bangladesh_beef)

print(f"T-statistic: {t_statistic}")
print(f"P-value: {p_value}")
```
After running this code in the notebook, the following results are obtained:

**t-statistic:** $\approx 33.29$

**p-value:** $\approx 7.24 \times 10^{-10}$
  (which is 0.000000000724)

In [64]:
# Beef consumption samples for the two countries being compared
argentina_beef = [60, 62, 58, 63, 59]
bangladesh_beef = [15, 12, 18, 14, 16]

# Independent two-sample t-test with SciPy's default settings;
# the result object exposes the statistic and the p-value as attributes
ttest_result = stats.ttest_ind(argentina_beef, bangladesh_beef)
t_statistic = ttest_result.statistic
p_value = ttest_result.pvalue

print(f"T-statistic: {np.round(t_statistic, 2)}")
print(f"P-value: {p_value}")

T-statistic: 33.29
P-value: 7.237288101315669e-10


#### (c) State your conclusion based on the p-value.

1.  **Compare p-value to significance level ($\alpha$):**
    The next step is to compare our calculated p-value with the given significance level, which is $\alpha = 0.05$.

    $$
    7.24 \times 10^{-10} < 0.05
    $$

    Clearly, our p-value is extremely small and much less than the 0.05 threshold.

2.  **Conclusion:**
    Since the **p-value is less than 0.05**, we **reject the null hypothesis ($H_0$)**. This provides strong evidence that the observed difference in mean beef consumption between Argentina and Bangladesh is not due to random chance. Therefore, we can conclude that there is a **statistically significant difference** in beef consumption between the two countries.


### c. Correlation Analysis

**(a) Compute the Pearson correlation coefficient (r) between x and y.**

The following code block computes the Pearson correlation coefficient for the given data on consumption (x) and $CO_2$ emission (y).

In [65]:
# Paired observations: consumption (x) and CO2 emission (y)
consumption_x = np.array([10, 15, 20, 25, 30])
co2_emission_y = np.array([30, 45, 50, 70, 85])

# Stack the two variables as rows and build the 2x2 correlation matrix;
# the off-diagonal entry is the Pearson coefficient between x and y
paired_rows = np.vstack((consumption_x, co2_emission_y))
correlation_matrix = np.corrcoef(paired_rows)
r = correlation_matrix[0][1]

print(f"The Pearson correlation coefficient (r) is: {r:.4f}")

The Pearson correlation coefficient (r) is: 0.9872


**(b) Interpret the result (comment on the strength and direction of the relationship).**

The calculated Pearson correlation coefficient is **r ≈ 0.9872**.

This result indicates a **very strong, positive linear relationship** between consumption (x) and $CO_2$ emission (y).

* **Direction**: The relationship is **positive** because the coefficient is positive. This means that as consumption increases, $CO_2$ emissions also tend to increase.
* **Strength**: The relationship is **very strong** because the value is extremely close to +1.

**(c) Briefly explain what it means if $r \approx 0$.**

If the Pearson correlation coefficient **$r$ is approximately 0**, it signifies that there is **no linear relationship** between the two variables. This means that a change in one variable does not correspond to a predictable *linear* change in the other.

`It is crucial to note that this only measures the absence of a linear trend; a strong non-linear relationship could still exist between the variables.`

## Question 4

In [66]:
"""
a. Total scores per student
"""

# Performance matrix A: one row per student, one column per subject
A = np.array([
    [80, 70, 90],
    [60, 85, 75],
    [95, 88, 92],
    [70, 60, 65]
])

# Row sums; keepdims=True keeps the result as a 4x1 column vector
total_scores_per_student = A.sum(axis=1, keepdims=True)

print("Total Scores per Student (4x1 vector):\n", total_scores_per_student)

Total Scores per Student (4x1 vector):
 [[240]
 [220]
 [275]
 [195]]


In [67]:
"""
b. Average score per subject
"""

# Performance matrix A: one row per student, one column per subject
A = np.array([
    [80, 70, 90],
    [60, 85, 75],
    [95, 88, 92],
    [70, 60, 65]
])

# Column means; keepdims=True keeps the result as a 1x3 row vector
average_scores_per_subject = A.mean(axis=0, keepdims=True)

print("Average Score per Subject (1x3 vector):\n", average_scores_per_subject)

Average Score per Subject (1x3 vector):
 [[76.25 75.75 80.5 ]]


In [68]:
"""
c. Weighted final grades
"""
# Performance matrix A: one row per student, one column per subject
A = np.array([
    [80, 70, 90],
    [60, 85, 75],
    [95, 88, 92],
    [70, 60, 65]
])

# Importance weight of each subject column
w = np.array([0.5, 0.3, 0.2])

# Matrix-vector product gives one weighted grade per student;
# the added trailing axis turns the length-4 result into a 4x1 column
weighted_grades = A @ w
grades = weighted_grades[:, np.newaxis]
print("Weighted Final Grades:\n", grades)

Weighted Final Grades:
 [[79. ]
 [70.5]
 [92.3]
 [66. ]]


In [70]:
"""
d. Applying subject importance
"""
# Step a: build A' from A by scaling only the first (Math) column by 2.

# Performance matrix A: one row per student, one column per subject
A = np.array([
    [80, 70, 90],
    [60, 85, 75],
    [95, 88, 92],
    [70, 60, 65]
])

# Per-column multipliers broadcast over every row: x2 for the first column,
# x1 for the others. The product is a new array, so A itself is untouched.
column_scale = np.array([2, 1, 1])
A_prime = A * column_scale

print("New matrix A' with doubled Math scores:\n", A_prime)

# Step b: total score per student from A', kept as a 4x1 column vector
new_total_scores = A_prime.sum(axis=1, keepdims=True)

print("New Total Scores using A':\n", new_total_scores)

# Step c: the comparison with the earlier totals is written up in prose.

New matrix A' with doubled Math scores:
 [[160  70  90]
 [120  85  75]
 [190  88  92]
 [140  60  65]]
New Total Scores using A':
 [[320]
 [280]
 [370]
 [265]]


**(c) Compare the new totals to those from Question 4a and briefly discuss the changes.**

**Comparison:**
- **Original Totals (from 4a):** `[[240], [220], [275], [195]]`
- **New Totals (from 4d, using $A'$):** `[[320], [280], [370], [265]]`

**Discussion:**
By doubling the scores for Mathematics, the total scores for all students increased substantially.

The increase for each student is equal to their original Math score (e.g., Student 1's total increased by 80 points).

This change disproportionately benefits students who performed well in Mathematics. For instance, Student 3, who had the highest Math score (95), saw the largest increase in their total score, solidifying their top position.

This demonstrates how applying different weights to subjects can significantly alter the overall performance evaluation and ranking of students.