### <span style="font-weight:bold; font-family:monospace; font-size:18px;">Step 1 - Topic: Forecast the future demand for short courses and expert courses in 2024. </span>


### <span style="font-weight:bold; font-family:monospace; font-size:18px;">Step 2 - Data Collection </span>

###     <pre style="font-weight:bold; font-size:17px">Sources:</pre>
<ul style="line-height: 1.5; font-family:monospace">
<li>Short course schools
  <ul>
    <li>CSTAD</li>
    <li>Ant Training center</li>
  </ul>
</li>
<li>Survey</li>
<li>Job announcement website</li>
</ul>


<span style="font-weight: bold; color: blue; font-family:monospace">1.   Scrape data from Ant Training center</span>

In [34]:
from bs4 import BeautifulSoup
import pandas as pd
import re
import urllib.error
from urllib.request import urlopen

# URL of the main page with the list of students. Detail-page hrefs found in
# the table are appended to this base to build each student's URL.
main_page_url = "http://training.antkh.com/students/"

# Upper bound (seconds) on any single HTTP request so that one unresponsive
# page cannot stall the whole scrape indefinitely.
REQUEST_TIMEOUT_SECONDS = 30


def _fetch_soup(url):
    """Download `url` and return its content parsed with the lxml parser.

    The HTTP response is closed as soon as its body has been read.
    Errors raised by `urlopen` (e.g. `urllib.error.URLError`, socket
    timeouts - both `OSError` subclasses) propagate to the caller.
    """
    with urlopen(url, timeout=REQUEST_TIMEOUT_SECONDS) as response:
        return BeautifulSoup(response.read(), "lxml")


def _parse_student_detail(detail_soup):
    """Pull the per-student fields out of a parsed detail page.

    Returns a tuple `(eng_name, courses_str, academic_year)`:
      * eng_name      - text of the <h2> inside div.inner-content, or "".
      * courses_str   - texts of the <li> items inside div.study-info joined
                        with ", ", or "" when the section is absent.
      * academic_year - first 4-digit run found in div#c_pAcademic as an int,
                        or 0 when no year can be found.

    Every lookup tolerates a missing element so that one oddly formatted
    page does not abort the whole scrape.
    """
    academic_info_elem = detail_soup.find("div", id="c_pAcademic")
    academic_info = academic_info_elem.get_text(strip=True) if academic_info_elem else ""
    academic_year_match = re.search(r'(\d{4})', academic_info)
    academic_year = int(academic_year_match.group(1)) if academic_year_match else 0

    study_info_elem = detail_soup.find("div", class_="study-info")
    study_info_items = study_info_elem.find_all("li") if study_info_elem else []
    courses_str = ', '.join(item.get_text() for item in study_info_items)

    profile_content = detail_soup.find("div", class_="inner-content")
    name_heading = profile_content.find("h2") if profile_content else None
    eng_name = name_heading.get_text() if name_heading else ""

    return eng_name, courses_str, academic_year


# Fetch and parse the main listing page. A failure here is left to raise:
# without the listing there is nothing to scrape.
main_page_soup = _fetch_soup(main_page_url)

# Find all the student rows in the main page
student_rows = main_page_soup.find_all("tr")

# One [English name, Khmer name, university, courses, academic year] per student
list_rows = []

# Loop through each student row; the first <tr> is skipped (treated as the header)
for row in student_rows[1:]:
    cells = row.find_all('td')
    if not cells:
        # Row without data cells (e.g. a nested header) - nothing to extract.
        continue

    # The first cell's link points at the student's detailed page.
    student_link = cells[0].find("a")
    href_value = student_link.get("href") if student_link else None
    if not href_value:
        print("Skipping a row without a link to a student page")
        continue

    # Read the text of each cell directly instead of round-tripping through
    # str(list-of-tags) and splitting on commas, which breaks on any comma in
    # the text and leaves "[", "]" and space artefacts behind.
    km_name = cells[0].get_text().strip()
    university = cells[1].get_text().strip() if len(cells) > 1 else ""

    # Construct the URL for the detailed student page
    detailed_page_url = f"{main_page_url}{href_value}"

    try:
        detailed_page_soup = _fetch_soup(detailed_page_url)
    except OSError as e:  # URLError and socket timeouts are OSError subclasses
        print(f"Failed to fetch the page: {e}")
        # Skip this student. Falling through would append the previous
        # student's name/courses/year (or raise NameError on the first row).
        continue

    eng_name, courses_str, academic_year = _parse_student_detail(detailed_page_soup)

    # Append the extracted data to the list
    list_rows.append([eng_name, km_name, university, courses_str, academic_year])

# Create a DataFrame with the extracted data
df = pd.DataFrame(list_rows, columns=["English Name", "Khmer Name", "University", "Courses", "Academic Year"])

# Remove list-repr artefacts (square brackets, and the space that follows the
# comma separator) from the Khmer Name, University and Courses columns.
df["Khmer Name"] = df["Khmer Name"].str.strip("[").str.strip()
df["University"] = df["University"].str.strip("]").str.strip()
df["Courses"] = df["Courses"].str.strip("[]")

# Coerce the year once and reuse it for every comparison below. The column
# itself is left untouched so the saved CSV keeps the scraped values.
academic_year_numeric = pd.to_numeric(df["Academic Year"], errors="coerce")

# Selection parameters
RECENT_YEAR_CUTOFF = 2020       # keep every record strictly after this year
SAMPLE_SIZE_PER_YEAR = 100      # cap on records taken from each older year
SAMPLE_RANDOM_STATE = 42        # fixed seed so the sample is reproducible

# All rows with academic year > 2020
recent_df = df[academic_year_numeric > RECENT_YEAR_CUTOFF]

# Take up to 100 records from each of the academic years 2017, 2018, and 2019
# NOTE(review): 2020 itself is in neither group, so it never reaches the CSV -
# confirm that this gap is intentional.
academic_years = [2017, 2018, 2019]
filtered_records = []

for year in academic_years:
    records_for_year = df[academic_year_numeric == year]
    if records_for_year.empty:
        continue
    # Cap the sample at the number of rows available: a year with fewer than
    # 100 records contributes all of them instead of being dropped entirely.
    sample_size = min(len(records_for_year), SAMPLE_SIZE_PER_YEAR)
    random_records = records_for_year.sample(n=sample_size, random_state=SAMPLE_RANDOM_STATE)
    filtered_records.append(random_records)

# Combine the recent records with the per-year samples
filtered_df = pd.concat([recent_df] + filtered_records)

# Save the filtered data to a CSV file
filtered_df.to_csv("ant_training_students.csv", index=False)
filtered_df.head(100)

Failed to fetch the page: <urlopen error [Errno 60] Operation timed out>
Failed to fetch the page: <urlopen error [Errno 60] Operation timed out>
filter re [         English Name      Khmer Name                      University  \
1928       Nov Vanndy      នៅ វ៉ាន់ឌី   សាកលវិទ្យាល័យ ភូមិន្ទ​ភ្នំពេញ   
2023    Sar Sovanthea     សរ សុវណ្ណធា   សាកលវិទ្យាល័យ ភូមិន្ទ​ភ្នំពេញ   
1759    Nhim Vongmaly   ញឹម វង្សម៉ាលី   សាកលវិទ្យាល័យ ភូមិន្ទ​ភ្នំពេញ   
2034        Tek David       តឹក ដេវីដ               វិទ្យាស្ថានស៊ីតិក   
1710    Khean Rathana        ឃៀន រតនា            សាកលវិទ្យាល័យ ន័រតុន   
...               ...             ...                             ...   
1725      Seam Piseth    ស៊ាម ពិសិដ្ឋ   សាកលវិទ្យាល័យ ភូមិន្ទ​ភ្នំពេញ   
1949  Gnet Chanmanith  ញឹក ចាន់ម៉ានិត     សាកលវិទ្យាល័យ ជាតិគ្រប់គ្រង   
1998    Hong MengHort   ហុង ម៉េងហ៊ាត់            សាកលវិទ្យាល័យ ន័រតុន   
1825    Penh Socheara     ពេញ សុជារ៉ា   សាកលវិទ្យាល័យ ភូមិន្ទ​ភ្នំពេញ   
1925     Such Sambath    សុច សម្បត្តិ   

Unnamed: 0,English Name,Khmer Name,University,Courses,Academic Year
0,Pen Sokrith,ពេន សុខរិទ្ឋ,Norton university,HTML+HTML5&CSS,2023
1,Troeng Chounmann,ត្រឹង ជនម៉ាន់,សាកលវិទ្យាល័យ ជាតិគ្រប់គ្រង,HTML+HTML5&CSS,2023
2,Lim Lay Heng,លឹម ឡាយហេង,វិទ្យាស្ថានស៊ីតិក,C/C++,2023
3,Tan Reaksa,តាំន់ រក្សា,សាកលវិទ្យាល័យ ភូមិន្ទ​ភ្នំពេញ,C/C++,2023
4,Koev Sopheaktra,គៅ សុភក្រ្តា,សាកលវិទ្យាល័យ ភូមិន្ទ​ភ្នំពេញ,HTML+HTML5&CSS,2023
...,...,...,...,...,...
95,Pov Chanvannak,ពៅ ចន្ទ័វណ្ណៈ,សាកលវិទ្យាល័យ ភូមិន្ទ​ភ្នំពេញ,HTML+HTML5&CSS,2023
96,Heng Meng Heang,ហេង ម៉េងហ៊ាង,សាកលវិទ្យាល័យ ភូមិន្ទ​ភ្នំពេញ,C/C++,2023
97,Sound Panha,សោន បញ្ញា,សាកលវិទ្យាល័យ ន័រតុន,"C/C++, HTML+HTML5&CSS, Website Design With CSS...",2023
98,Say Piseth,សាយ​ ពិសិដ្ឋ,សាកលវិទ្យាល័យ ភូមិន្ទ​ភ្នំពេញ,"C/C++, C++OOP",2023


<span style="font-weight: bold; color: blue; font-family:monospace">2.   Import data from CSTAD short course, survey data and Ant training data</span>

In [4]:
import pandas as pd

# Locations of the two input files, resolved relative to the notebook's
# working directory. The ANT CSV is the file written by the scraping cell.
cstad_file_path = 'cstad_short_course_data.xlsx'
ant_file_path = 'ant_training_students.csv'

# Load each source into its own DataFrame (Excel workbook, then CSV).
cstad_df = pd.read_excel(cstad_file_path)
ant_df = pd.read_csv(ant_file_path)

  for idx, row in parser.parse():
