In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the student score table (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='datasets/social_science_students.csv')
# Show the last five rows as a quick check that the file parsed as expected
df.tail(n=5)

In [None]:
# Count the missing values in each column
df.isna().sum(axis=0)

# Score Breakdown by Subject

In [None]:
# Pull each subject's score column out of the frame once, so the plotting
# cells further down can pass plain Series around.
(math_score, literature_score, english_score,
 history_score, geography_score, civic_education_score) = (
    df[column]
    for column in ('math', 'literature', 'english',
                   'history', 'geography', 'civic_education')
)

In [None]:
def calculate_statistics(series):
    """Summarise one numeric column with common descriptive statistics.

    Args:
        series: pandas Series of numeric values; missing values are skipped
            by the pandas reductions used here.

    Returns:
        pandas Series indexed by 'Mean', 'Median', 'Mode', 'Variance',
        'Std Dev', 'Range', 'Highest' and 'Lowest'. 'Variance' and 'Std Dev'
        use the pandas defaults (sample statistics, ddof=1). Every entry is
        NaN when the column has no non-missing values.
    """
    modes = series.mode()
    highest = series.max()
    lowest = series.min()
    return pd.Series({
        'Mean': series.mean(),
        'Median': series.median(),
        # mode() returns the modes in sorted order, so ties resolve to the
        # smallest value; it is empty when there is no non-missing data, in
        # which case report NaN instead of raising IndexError.
        'Mode': modes.iloc[0] if not modes.empty else np.nan,
        'Variance': series.var(),
        'Std Dev': series.std(),
        'Range': highest - lowest,
        'Highest': highest,
        'Lowest': lowest
    })
    
# Build the summary table: apply() runs calculate_statistics on every subject
# column, producing one column of statistics per subject.
subjects = ['math', 'literature', 'english', 'history', 'geography', 'civic_education']
stats_df = df.loc[:, subjects].apply(calculate_statistics)

# Transpose so each subject is a row and each statistic a column
print(stats_df.transpose())

In [None]:
def count_students_with_score(df, subject, score):
    """Count the rows of ``df`` whose ``subject`` column equals ``score``.

    Args:
        df: pandas DataFrame with one row per student.
        subject: name of the score column to inspect.
        score: value to match exactly.

    Returns:
        Number of matching rows as an int; rows with a missing score never match.
    """
    is_match = df[subject] == score
    return len(df.loc[is_match])

# Name the query once so the count and the printed message cannot drift apart
target_subject = 'literature'
target_score = 9.5

subject_score_count = count_students_with_score(df, target_subject, target_score)
print(f"Number of students who got {target_score} in {target_subject}: {subject_score_count}")

In [None]:
def find_highest_score(df, subject):
    """Return every row of ``df`` that holds the top score in ``subject``.

    Args:
        df: pandas DataFrame with one row per student.
        subject: name of the score column to rank by.

    Returns:
        DataFrame with the rows whose ``subject`` value equals the column
        maximum, in their original row order. The result is empty when the
        frame is empty or the column has no non-missing values.
    """
    subject_scores = df[subject]

    # max() skips missing values and yields NaN when there is nothing to
    # compare; NaN is unequal to everything, so the mask below is then all
    # False and an empty frame comes back instead of an IndexError.
    highest_score = subject_scores.max()

    # A single comparison selects all tied top scorers; no sort is needed
    return df[subject_scores == highest_score]

# Query the literature column: the variable holds literature top scorers, and
# 'chemistry' is not one of the subjects analysed in this notebook.
top_literature_students = find_highest_score(df, 'literature')
print(top_literature_students)

In [None]:
def plot_subject_histogram(scores, subject):
    """Plot a labelled histogram of one subject's scores on a 0-10 scale.

    Args:
        scores: scores for a single subject; must support division by a
            scalar (e.g. a pandas Series or NumPy array).
        subject: subject name. It selects the bar width and appears in the
            plot title.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    plt.figure(figsize=(12, 8))

    # Math and English are binned in 0.2 steps, every other subject in 0.25
    # steps (presumably each exam's scoring granularity -- TODO confirm).
    interval = 0.2 if subject.lower() in ('math', 'english') else 0.25

    # Snap scores onto the interval grid so floating-point noise cannot push
    # a value into a neighbouring bin
    rounded_scores = np.round(scores / interval) * interval

    # Edges run one interval past 10 so a score of exactly 10 gets its own bin
    bins = np.arange(0, 10.01 + interval, interval)

    # align='left' centres each bar on its left bin edge, i.e. on the score itself
    counts, bins, _ = plt.hist(rounded_scores, bins=bins, edgecolor='black', align='left')

    # Scale the label gap and the y-axis with the tallest bar so the layout
    # works for any dataset size; fall back to 1 when there is no data so the
    # y-limits never collapse to (0, 0).
    tallest = max(counts.max(), 1)
    label_offset = 0.02 * tallest

    # Write each non-empty bar's count vertically just above the bar
    for left_edge, count in zip(bins[:-1], counts):
        if count > 0:
            plt.text(left_edge, count + label_offset, f'{int(count)}',
                     ha='center', va='bottom', rotation=90)

    plt.title(f'Histogram of {subject.capitalize()} Scores', fontsize=16)
    plt.xlabel('Score', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    # Pad by half a bar on each side: the bars for 0 and 10 are centred on
    # those scores and would otherwise be cut in half by the axis limits.
    plt.xlim(-interval / 2, 10 + interval / 2)
    plt.ylim(0, tallest * 1.3)  # Extend y-axis to make room for vertical labels

    # Set x-axis ticks to show all intervals
    plt.xticks(np.arange(0, 10.01, interval), rotation=90, ha='center')

    # Adjust layout to prevent cutting off labels
    plt.tight_layout()

    # Add grid lines (the minor grid only shows if minor ticks are enabled)
    plt.grid(True, alpha=0.3)
    plt.grid(which='minor', alpha=0.2)

    plt.show()

In [None]:
# Draw the score distribution of every subject with the same helper instead
# of repeating one call per cell
for subject_name in ('math', 'literature', 'english',
                     'history', 'geography', 'civic_education'):
    plot_subject_histogram(df[subject_name], subject_name)

# Score Breakdown by Subject Combination

In [None]:
def plot_combination_histogram(x1, x2, x3, subject1, subject2, subject3):
    """Plot a labelled histogram of the summed scores of three subjects.

    Args:
        x1, x2, x3: per-student scores for the three subjects; they are added
            element-wise, so they must be aligned (e.g. columns of one frame).
        subject1, subject2, subject3: names of the three subjects, in the same
            order as the score arguments. They select the bar width and appear
            in the plot title.

    Returns:
        The unrounded element-wise sum ``x1 + x2 + x3``, so callers that bind
        the result get the combined scores. The figure itself is rendered
        with ``plt.show()``.
    """
    plt.figure(figsize=(15, 10))

    # Calculate the sum of scores for each student
    combined_scores = x1 + x2 + x3

    # Use 0.2-wide bars when math or English is part of the combination,
    # 0.25-wide bars otherwise
    subject_names = {subject1.lower(), subject2.lower(), subject3.lower()}
    bar_interval = 0.2 if subject_names & {'math', 'english'} else 0.25

    # Snap the sums onto the bar grid so floating-point noise cannot push a
    # value into a neighbouring bin
    rounded_scores = np.round(combined_scores / bar_interval) * bar_interval

    max_possible_score = 30  # 10 points max per subject, 3 subjects

    # One bar per grid value from 0 up to and including the maximum. The edges
    # must extend one interval past the maximum: the last histogram bin is
    # closed on the right, so stopping at 30 would merge perfect scores into
    # the bar of the previous grid value.
    n_bars = int(round(max_possible_score / bar_interval)) + 1
    bins = np.arange(n_bars + 1) * bar_interval

    # align='left' centres each bar on its left bin edge, i.e. on the score itself
    counts, bins, _ = plt.hist(rounded_scores, bins=bins, edgecolor='black', align='left')

    # Scale the label gap and the y-axis with the tallest bar so the layout
    # works for any dataset size; fall back to 1 when there is no data so the
    # y-limits never collapse to (0, 0).
    tallest = max(counts.max(), 1)
    label_offset = 0.02 * tallest

    # Add count labels above each bar, vertically
    for left_edge, count in zip(bins[:-1], counts):
        if count > 0:
            plt.text(left_edge, count + label_offset, str(int(count)),
                     ha='center', va='bottom', rotation=90)

    plt.title(f'Histogram of Combined {subject1.capitalize()}, {subject2.capitalize()}, and {subject3.capitalize()} Scores', fontsize=16)
    plt.xlabel('Combined Score', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    # Pad by half a bar on each side: the bars for 0 and the maximum are
    # centred on those scores and would otherwise be cut in half.
    plt.xlim(-bar_interval / 2, max_possible_score + bar_interval / 2)
    plt.ylim(0, tallest * 1.3)  # Extend y-axis to make room for vertical labels

    # Set x-axis ticks to show intervals of 0.5
    label_interval = 0.5
    x_ticks = np.arange(0, max_possible_score + label_interval, label_interval)
    plt.xticks(x_ticks, rotation=45, ha='right')

    # Adjust layout to prevent cutting off labels
    plt.tight_layout()

    # Add grid lines (the minor grid only shows if minor ticks are enabled)
    plt.grid(True, alpha=0.3)
    plt.grid(which='minor', alpha=0.2)

    plt.show()

    return combined_scores

In [None]:
# Combination D01: math + literature + english
d01_combination = plot_combination_histogram(
    math_score, literature_score, english_score,
    'math', 'literature', 'english',
)

In [None]:
# Combination C00: literature + history + geography
c00_combination = plot_combination_histogram(
    literature_score, history_score, geography_score,
    'literature', 'history', 'geography',
)

In [None]:
# Combination C03: literature + math + history
c03_combination = plot_combination_histogram(
    literature_score, math_score, history_score,
    'literature', 'math', 'history',
)