In [26]:
import pandas as pd

# Data loading and preprocessing
df = pd.read_csv('/Users/evansxu/Documents/2024 application/AIBUILD/prototype/wa_secondary_schools.csv')

# Data cleaning
df = df.dropna(subset=['Suburb', 'ATAR Rank', 'Low Year', 'High Year', 'Classification Group'])
df['ATAR Rank'] = pd.to_numeric(df['ATAR Rank'], errors='coerce')
df['ICSEA'] = pd.to_numeric(df['ICSEA'], errors='coerce')
df['Median ATAR'] = pd.to_numeric(df['Median ATAR'], errors='coerce')

# Mapping grade levels to numeric values
grade_mapping = {
    'KIN': 0, 'PPR': 1, 'Y01': 1, 'Y02': 2, 'Y03': 3, 'Y04': 4,
    'Y05': 5, 'Y06': 6, 'Y07': 7, 'Y08': 8, 'Y09': 9,
    'Y10': 10, 'Y11': 11, 'Y12': 12
}
df['Low Year Numeric'] = df['Low Year'].map(grade_mapping)
df['High Year Numeric'] = df['High Year'].map(grade_mapping)

# Get user input
student_info = {
    'Suburb': input("Please enter your residential suburb: ").strip().upper(),
    'Grade': input("Please enter your current grade (e.g., Y10): ").strip().upper(),
    'Preferred School Type': input("Please enter your preferred school type (GOVERNMENT, NON-GOVERNMENT, SECONDARY SCHOOLS, DISTRICT HIGH SCHOOLS, K-12 SCHOOLS): ").strip().upper(),
    'Academic Expectation': float(input("Please enter your academic performance expectation (0-100): "))
}

# Convert student's grade to a numeric value
student_info['Grade Numeric'] = grade_mapping.get(student_info['Grade'], None)
if student_info['Grade Numeric'] is None:
    raise ValueError("Invalid grade input. Please enter a grade like 'Y10'.")

# Allow users to set weights for each criterion
print("Please assign weights to the following criteria, making sure the total adds up to 1.")
weights = {}
weights['location_weight'] = float(input("Weight for location (0-1): "))
weights['academic_weight'] = float(input("Weight for academic performance (0-1): "))
weights['type_weight'] = float(input("Weight for school type (0-1): "))
weights['grade_weight'] = float(input("Weight for grade range (0-1): "))

# Check if the sum of weights equals 1
total_weight = sum(weights.values())
if total_weight != 1.0:
    # Normalize weights if they don't sum up to 1
    for key in weights:
        weights[key] /= total_weight

# Define the scoring function
def calculate_school_score(row, student_info, weights):
    score = 0

    # Location score
    if row['Suburb'].strip().upper() == student_info['Suburb']:
        location_score = 100
    else:
        location_score = 0  # Can be extended to match neighboring suburbs
    score += location_score * weights['location_weight']

    # Academic performance score
    academic_score = 0
    if not pd.isnull(row['ATAR Rank']):
        academic_score = row['ATAR Rank']
    score += academic_score * weights['academic_weight']

    # School type score
    if row['Classification Group'].strip().upper() == student_info['Preferred School Type']:
        type_score = 100
    else:
        type_score = 0
    score += type_score * weights['type_weight']

    # Grade range score
    if row['Low Year Numeric'] <= student_info['Grade Numeric'] <= row['High Year Numeric']:
        grade_score = 100
    else:
        grade_score = 0
    score += grade_score * weights['grade_weight']

    return score

# Calculate the score for each school
df['Score'] = df.apply(lambda row: calculate_school_score(row, student_info, weights), axis=1)

# Filter schools based on academic expectation
df_filtered = df[df['ATAR Rank'] >= student_info['Academic Expectation']]

# Sort schools by score and recommend
recommended_schools = df_filtered.sort_values(by='Score', ascending=False)

# Display the top 5 recommended schools
top_schools = recommended_schools.head(5)
print("\nRecommended Schools for you:")
print(top_schools[['School Name', 'Suburb', 'Score', 'ATAR Rank']])


Please assign weights to the following criteria, making sure the total adds up to 1.

Recommended Schools for you:
                       School Name        Suburb  Score  ATAR Rank
49           CECIL ANDREWS COLLEGE      ARMADALE   67.8      139.0
109                GILMORE COLLEGE        ORELIA   67.6      138.0
77    DARLING RANGE SPORTS COLLEGE  FORRESTFIELD   67.4      137.0
244  SAFETY BAY SENIOR HIGH SCHOOL    SAFETY BAY   67.2      136.0
16     BALCATTA SENIOR HIGH SCHOOL      BALCATTA   67.0      135.0


In [37]:
import pandas as pd

# Load the data from a CSV file
df = pd.read_csv('wa_secondary_schools.csv')

# Clean the data by dropping rows with missing values in key columns
df = df.dropna(subset=['Suburb', 'ATAR Rank', 'Low Year', 'High Year', 'Classification Group', 'Total Students'])

# Convert relevant columns to numeric types, handling errors by coercing invalid values to NaN
df['ATAR Rank'] = pd.to_numeric(df['ATAR Rank'], errors='coerce')
df['ICSEA'] = pd.to_numeric(df['ICSEA'], errors='coerce')
df['Median ATAR'] = pd.to_numeric(df['Median ATAR'], errors='coerce')
df['Total Students'] = pd.to_numeric(df['Total Students'], errors='coerce')

# Define a mapping of grade levels to numeric values for easier comparison
grade_mapping = {
    'KIN': 0, 'PPR': 1, 'Y01': 1, 'Y02': 2, 'Y03': 3, 'Y04': 4,
    'Y05': 5, 'Y06': 6, 'Y07': 7, 'Y08': 8, 'Y09': 9,
    'Y10': 10, 'Y11': 11, 'Y12': 12
}

# Map the 'Low Year' and 'High Year' columns to numeric values using the grade mapping
df['Low Year Numeric'] = df['Low Year'].map(grade_mapping)
df['High Year Numeric'] = df['High Year'].map(grade_mapping)

# Get user input for the search criteria
student_info = {
    'Suburb': input("Please enter your residential suburb: ").strip().upper(),
    'Grade': input("Please enter your current grade (e.g., Y10): ").strip().upper(),
    'Preferred School Type': input("Please enter your preferred school type (GOVERNMENT, NON-GOVERNMENT, SECONDARY SCHOOLS, DISTRICT HIGH SCHOOLS, K-12 SCHOOLS): ").strip().upper(),
    'Academic Expectation': float(input("Please enter your academic performance expectation (0-100): ")),
    'School Size': input("Please enter your preferred school size (small, medium, large): ").strip().lower()
}

# Convert the user's grade to a numeric value using the grade mapping
student_info['Grade Numeric'] = grade_mapping.get(student_info['Grade'], None)
if student_info['Grade Numeric'] is None:
    raise ValueError("Invalid grade input. Please enter a grade like 'Y10'.")

# Function to categorize school size based on the total number of students
def categorize_school_size(total_students):
    small_school_threshold = 500
    large_school_threshold = 1000
    
    if total_students < small_school_threshold:
        return 'small'
    elif small_school_threshold <= total_students <= large_school_threshold:
        return 'medium'
    else:
        return 'large'

# Add a new column for school size category based on the number of total students
df['School Size Category'] = df['Total Students'].apply(categorize_school_size)

# Function to evaluate each school and return a score based on how many criteria it matches
def evaluate_school(row, student_info):
    score = 0

    # Mandatory: Check if student's grade is within the school's range; if not, return 0
    if not (row['Low Year Numeric'] <= student_info['Grade Numeric'] <= row['High Year Numeric']):
        return score
    
    # Check if the school's suburb matches the user's residential suburb
    if row['Suburb'].strip().upper() == student_info['Suburb']:
        score += 1

    # Check if the school's type matches the user's preference
    if row['Classification Group'].strip().upper() == student_info['Preferred School Type']:
        score += 1

    # Check if the school's size matches the user's preference
    if row['School Size Category'] == student_info['School Size']:
        score += 1

    # Check if academic performance meets the user's expectations
    if not pd.isnull(row['Median ATAR']) and row['Median ATAR'] >= student_info['Academic Expectation']:
        score += 1

    return score

# Apply the evaluation function to each school and store the score
df['Score'] = df.apply(lambda row: evaluate_school(row, student_info), axis=1)

# Filter out schools that didn't meet the mandatory grade criteria (score > 0)
filtered_schools = df[df['Score'] > 0]

# Sort schools by the score in descending order
recommended_schools = filtered_schools.sort_values(by='Score', ascending=False)

# Select relevant columns for output
result_columns = ['School Name', 'Suburb', 'Classification Group', 'Total Students', 'ATAR Rank', 'ICSEA', 'Median ATAR']
top_schools = recommended_schools[result_columns]

# Display the top recommended schools without showing the score
print("\nRecommended schools for you:")
print(top_schools.head(5).to_string(index=False))



Recommended schools for you:
                 School Name        Suburb Classification Group  Total Students  ATAR Rank  ICSEA  Median ATAR
               CARMEL SCHOOL      DIANELLA       NON-GOVERNMENT             476       13.0 1137.0        88.85
CHRIST CHURCH GRAMMAR SCHOOL     CLAREMONT       NON-GOVERNMENT            1688        3.0 1180.0        92.50
  DENMARK SENIOR HIGH SCHOOL       DENMARK    SECONDARY SCHOOLS             427       73.0 1038.0        78.90
  EASTERN GOLDFIELDS COLLEGE    KALGOORLIE    SECONDARY SCHOOLS             442      115.0  990.0        68.20
                 HALE SCHOOL WEMBLEY DOWNS       NON-GOVERNMENT            1594        6.0 1165.0        90.35


In [39]:
import pandas as pd

# Constants
MAX_CRITERION_SCORE = 100  # Maximum score for any criterion
SMALL_SCHOOL_MAX_SIZE = 500  # Maximum students for a small school
MEDIUM_SCHOOL_MAX_SIZE = 1000  # Maximum students for a medium school

# Load the dataset
df = pd.read_csv('wa_secondary_schools.csv')

# Clean the data by dropping rows with missing critical values
df = df.dropna(subset=[
    'Suburb', 'Median ATAR', 'Low Year', 'High Year',
    'Classification Group', 'Total Students'
])

# Convert 'Median ATAR' and 'Total Students' to numeric types
df['Median ATAR'] = pd.to_numeric(df['Median ATAR'], errors='coerce')
df['Total Students'] = pd.to_numeric(df['Total Students'], errors='coerce')

# Map grade levels to numeric values for comparison
grade_mapping = {
    'KIN': 0,  'PPR': 0,  'Y01': 1,  'Y02': 2,  'Y03': 3,
    'Y04': 4,  'Y05': 5,  'Y06': 6,  'Y07': 7,  'Y08': 8,
    'Y09': 9,  'Y10': 10, 'Y11': 11, 'Y12': 12
}
df['Low Year Numeric'] = df['Low Year'].map(grade_mapping)
df['High Year Numeric'] = df['High Year'].map(grade_mapping)

# Define a function to categorize school size
def categorize_school_size(total_students):
    if total_students < SMALL_SCHOOL_MAX_SIZE:
        return 'SMALL'
    elif total_students <= MEDIUM_SCHOOL_MAX_SIZE:
        return 'MEDIUM'
    else:
        return 'LARGE'

# Apply the function to create a 'School Size' column
df['School Size'] = df['Total Students'].apply(categorize_school_size)

# Collect user input for their preferences
student_info = {
    'Suburb': input("Please enter your residential suburb: ").strip().upper(),
    'Grade': input("Please enter your current grade (e.g., Y10): ").strip().upper(),
    'Preferred School Type': input(
        "Please enter your preferred school type (GOVERNMENT, NON-GOVERNMENT, "
        "SECONDARY SCHOOLS, DISTRICT HIGH SCHOOLS, K-12 SCHOOLS): "
    ).strip().upper(),
    'Academic Expectation': float(
        input("Please enter your minimum acceptable median ATAR score (0-100): ")
    ),
    'Preferred School Size': input(
        "Please enter your preferred school size (Small, Medium, Large): "
    ).strip().upper()
}

# Convert student's grade to a numeric value for comparison
student_info['Grade Numeric'] = grade_mapping.get(student_info['Grade'])

if student_info['Grade Numeric'] is None:
    print("Invalid grade input. Please enter a grade like 'Y10'. Exiting program.")
    exit()

# Filter schools that do not offer the user's grade
df = df[
    (df['Low Year Numeric'] <= student_info['Grade Numeric']) &
    (df['High Year Numeric'] >= student_info['Grade Numeric'])
]

# Check if any schools are left after filtering
if df.empty:
    print("No schools offer the grade you entered. Exiting program.")
    exit()

# Allow users to set weights for each criterion
print("Please assign weights to the following criteria. The total must add up to 1.")
weights = {
    'location_weight': float(input("Weight for location (0-1): ")),
    'academic_weight': float(input("Weight for academic performance (0-1): ")),
    'type_weight': float(input("Weight for school type (0-1): ")),
    'size_weight': float(input("Weight for school size (0-1): "))
}

# Ensure the total weight sums to 1
total_weight = sum(weights.values())
if total_weight != 1.0:
    # Normalize the weights if they don't sum up to 1
    weights = {k: v / total_weight for k, v in weights.items()}

# Define the scoring function
def calculate_school_score(row, student_info, weights):
    score = 0

    # Location score
    if row['Suburb'].strip().upper() == student_info['Suburb']:
        location_score = MAX_CRITERION_SCORE
    else:
        location_score = 0  # Can be extended to include nearby suburbs
    score += location_score * weights['location_weight']

    # Academic performance score (using Median ATAR)
    if not pd.isnull(row['Median ATAR']):
        # Score is proportional to Median ATAR
        academic_score = (row['Median ATAR'] / 100) * MAX_CRITERION_SCORE
    else:
        academic_score = 0
    score += academic_score * weights['academic_weight']

    # School type score
    if row['Classification Group'].strip().upper() == student_info['Preferred School Type']:
        type_score = MAX_CRITERION_SCORE
    else:
        type_score = 0
    score += type_score * weights['type_weight']

    # School size score
    if row['School Size'].strip().upper() == student_info['Preferred School Size']:
        size_score = MAX_CRITERION_SCORE
    else:
        size_score = 0
    score += size_score * weights['size_weight']

    return score

# Calculate the score for each school
df['Score'] = df.apply(
    lambda row: calculate_school_score(row, student_info, weights), axis=1
)

# Filter schools based on the minimum acceptable median ATAR
df_filtered = df[df['Median ATAR'] >= student_info['Academic Expectation']]

# Check if any schools meet the academic expectation
if df_filtered.empty:
    print("No schools meet your academic expectation. Exiting program.")
    exit()

# Sort the schools by score in descending order
recommended_schools = df_filtered.sort_values(by='Score', ascending=False)

# Display the top 5 recommended schools without 'Score' in the output
top_schools = recommended_schools.head(5)
print("\nRecommended Schools for you:")
print(top_schools[[
    'School Name', 'Suburb', 'Median ATAR', 'Classification Group', 'School Size'
]])


Please assign weights to the following criteria. The total must add up to 1.

Recommended Schools for you:
                              School Name       Suburb  Median ATAR  \
224                   PERTH MODERN SCHOOL      SUBIACO        97.55   
264  ST HILDA'S ANGLICAN SCHOOL FOR GIRLS  MOSMAN PARK        92.70   
54           CHRIST CHURCH GRAMMAR SCHOOL    CLAREMONT        92.50   
221                       PENRHOS COLLEGE         COMO        90.65   
192             METHODIST LADIES' COLLEGE    CLAREMONT        90.55   

    Classification Group School Size  
224    SECONDARY SCHOOLS       LARGE  
264       NON-GOVERNMENT       LARGE  
54        NON-GOVERNMENT       LARGE  
221       NON-GOVERNMENT      MEDIUM  
192       NON-GOVERNMENT       LARGE  
