In [1]:
import requests
from bs4 import BeautifulSoup
import json
import logging
import sys

In [2]:
# Dedicated logger for this notebook; DEBUG lets every record reach the handlers.
logger = logging.getLogger("universities-scraping")
logger.setLevel(logging.DEBUG)

In [3]:
# Record layout: level, timestamp, ms since logging started, source path,
# function name, line number, message.
logFileFormatter = logging.Formatter(
    fmt="%(levelname)s %(asctime)s (%(relativeCreated)d) \t %(pathname)s F%(funcName)s L%(lineno)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
fileHandler = logging.FileHandler(filename='./university-scraping.log')
fileHandler.setFormatter(logFileFormatter)
fileHandler.setLevel(level=logging.DEBUG)

# Re-running this cell must not stack handlers: detach any handler already
# writing to the same log file, otherwise every record is written repeatedly.
for stale_handler in [
    h for h in logger.handlers
    if getattr(h, "baseFilename", None) == fileHandler.baseFilename
]:
    logger.removeHandler(stale_handler)
    stale_handler.close()

logger.addHandler(fileHandler)

In [4]:
# Fetch the central-universities page. The timeout keeps a stalled server from
# hanging the notebook indefinitely.
r = requests.get(
    "https://www.thecivilindia.com/pages/education/central-universities.html",
    timeout=30,
)
if r.status_code == 200:
    logger.info("Scraped Central Universities successfully.")
else:
    # The status code needs a %s placeholder; without one logging cannot
    # format the record and the status is never written to the log.
    logger.error("There was an error: %s", r.status_code)

In [5]:
# Parse the fetched HTML with Python's built-in parser.
soup = BeautifulSoup(r.text, features="html.parser")

In [6]:
# Locate the div the table rows are selected from in the next step.
tq_div = soup.find("div", class_="three_quarter first")
# find() returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError in a later cell.
assert tq_div is not None, "central universities page: div 'three_quarter first' not found"

In [7]:
# Every <tr> of every table inside the container div.
central_universities_data = tq_div.select("table tr")

In [8]:
# Rebuild the row list without its first row instead of deleting in place, so
# re-running this cell cannot strip an additional (real) row.
# NOTE(review): the first row is presumably the table header - confirm.
central_universities_data = tq_div.select("table tr")[1:]

### Data Structure for storing college names and states.

- Colleges are mapped to their states, e.g.
```python
names = { "maharashtra": ["Abc", "edf"], "gujarat": ["BCd"] }
```

- The state names are also stored separately, in a set.
```python
state_names = {"maharashtra", "gujarat"}
```



In [9]:
# state name -> list of university names found for that state
university_state_mapping: dict = dict()
# every distinct state name encountered while scraping
state_names: set = set()

In [10]:
# For each row, the second <td> is read as the university name and the third
# as the state (lower-cased so it can serve as a dictionary key).
for data in central_universities_data:
    uni_data = data.find_all("td")
    uni_name: str = uni_data[1].get_text().strip()
    state_name: str = uni_data[2].get_text().strip().lower()
    state_names.add(state_name)
    # setdefault creates the state's list on first sight; append in either case.
    university_state_mapping.setdefault(state_name, []).append(uni_name)
    logger.info(f"Added {uni_name} successfuly")

## Extracting State Universities.

State universities are the universities officially recognized by the states.

In [11]:
# Fetch the state-universities page. The timeout keeps a stalled server from
# hanging the notebook indefinitely.
r_state = requests.get(
    "https://www.thecivilindia.com/pages/education/state-universities.html",
    timeout=30,
)
if r_state.status_code == 200:
    logger.info("Scraped State Universities successfully.")
else:
    # The status code needs a %s placeholder; without one logging cannot
    # format the record and the status is never written to the log.
    logger.error("There was an error: %s", r_state.status_code)

In [12]:
# Parse the fetched HTML with Python's built-in parser.
s_soup = BeautifulSoup(r_state.text, features="html.parser")

In [13]:
# Locate the div the table rows are selected from in the next step.
s_tq_div = s_soup.find("div", class_="three_quarter first")
# find() returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError in a later cell.
assert s_tq_div is not None, "state universities page: div 'three_quarter first' not found"

In [14]:
# Every <tr> of every table inside the container div.
state_universities_data = s_tq_div.select(
    "table tr"
)

In [15]:
# Rebuild the row list without its first row instead of deleting in place, so
# re-running this cell cannot strip an additional (real) row.
# NOTE(review): the first row is presumably the table header - confirm.
state_universities_data = s_tq_div.select("table tr")[1:]

In [16]:
# Rows that do not carry their own state cell reuse the state of the closest
# preceding row that did. Start from None so a value left over from an
# earlier section of the notebook can never be attributed to this table.
state_name = None
for uni in state_universities_data:
    data = uni.find_all("td")
    uni_name: str = data[1].get_text().strip()
    # Only rows with more than 6 cells are read for a state (third cell).
    # NOTE(review): shorter rows presumably sit under a row-spanning state
    # cell - confirm against the page markup.
    if len(data) > 6:
        state_name = data[2].get_text().strip().lower()
    if state_name is None:
        # No state has been seen yet, so there is nothing to file this row under.
        logger.warning("Skipping %s: no state found for this row", uni_name)
        continue
    state_names.add(state_name)
    # setdefault creates the state's list on first sight; append in either case.
    university_state_mapping.setdefault(state_name, []).append(uni_name)
    logger.info(f"Added {uni_name} successfuly")

## Extracting Deemed Universities.

Deemed university, or Deemed-to-be-University, is a status of autonomy granted by the Department of Higher Education in the Union Human Resource Development Ministry, India, on the advice of the UGC, under Section 3 of UGC Act, 1956. To quote the MHRD, "An Institution of Higher Education, other than universities, working at a very high standard in specific area of study, can be declared by the Central Government on the advice of the UGC as an Institution ‘Deemed-to-be-university’. Institutions that are ‘deemed-to-be-university’ enjoy academic status and privileges of a university."

In [17]:
# Fetch the deemed-universities page. The timeout keeps a stalled server from
# hanging the notebook indefinitely.
r_deemed = requests.get(
    "https://www.thecivilindia.com/pages/education/deemed-universities.html",
    timeout=30,
)
if r_deemed.status_code == 200:
    logger.info("Scraped Deemed Universities successfully.")
else:
    # The status code needs a %s placeholder; without one logging cannot
    # format the record and the status is never written to the log.
    logger.error("There was an error: %s", r_deemed.status_code)

In [18]:
# Parse the fetched HTML with Python's built-in parser.
d_soup = BeautifulSoup(r_deemed.text, features="html.parser")

In [19]:
# Locate the div the table rows are selected from in the next step.
d_tq_div = d_soup.find("div", class_="three_quarter first")
# find() returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError in a later cell.
assert d_tq_div is not None, "deemed universities page: div 'three_quarter first' not found"

In [20]:
# Every <tr> of every table inside the container div.
deemed_universities_data = d_tq_div.select("table tr")

In [21]:
# Rebuild the row list without its first row instead of deleting in place, so
# re-running this cell cannot strip an additional (real) row.
# NOTE(review): the first row is presumably the table header - confirm.
deemed_universities_data = d_tq_div.select("table tr")[1:]

In [22]:
# For each row, the second <td> is read as the university name and the fourth
# as the state (lower-cased so it can serve as a dictionary key).
for uni in deemed_universities_data:
    data = uni.find_all("td")
    uni_name: str = data[1].get_text().strip()
    state_name: str = data[3].get_text().strip().lower()
    state_names.add(state_name)
    # setdefault creates the state's list on first sight; append in either case.
    university_state_mapping.setdefault(state_name, []).append(uni_name)
    logger.info(f"Added {uni_name} successfuly")

In [23]:
# Display the distinct state names collected so far, to eyeball them for
# inconsistent spellings before scraping the next page.
state_names

{'andhra pradesh',
 'andhra pradesh\xa0&telangana',
 'arunachal pradesh',
 'assam',
 'bihar',
 'chandigarh',
 'chhattisgarh',
 'delhi',
 'goa',
 'gujarat',
 'haryana',
 'himachal pradesh',
 'jammu and kashmir',
 'jharkhand',
 'karnataka',
 'kerala',
 'madhya pradesh',
 'maharashtra',
 'manipur',
 'meghalaya',
 'mizoram',
 'nagaland',
 'odisha',
 'orissa',
 'puducherry',
 'punjab',
 'rajasthan',
 'sikkim',
 'tamil nadu',
 'telangana',
 'tripura',
 'uttar pradesh',
 'uttarakhand',
 'west bengal'}

## Extracting Private Universities

In [24]:
# Fetch the private-universities page. The timeout keeps a stalled server from
# hanging the notebook indefinitely.
r_priv = requests.get(
    "https://www.thecivilindia.com/pages/education/private-universities.html",
    timeout=30,
)
if r_priv.status_code == 200:
    logger.info("Scraped private Universities successfully.")
else:
    # The status code needs a %s placeholder; without one logging cannot
    # format the record and the status is never written to the log.
    logger.error("There was an error: %s", r_priv.status_code)

In [25]:
# Parse the fetched HTML with Python's built-in parser.
priv_soup = BeautifulSoup(r_priv.text, features="html.parser")

In [26]:
# Locate the div the table rows are selected from in the next step.
priv_tq_div = priv_soup.find("div", class_="three_quarter first")
# find() returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError in a later cell.
assert priv_tq_div is not None, "private universities page: div 'three_quarter first' not found"

In [27]:
# Every <tr> of every table inside the container div.
private_universities_data = priv_tq_div.select("table tr")

In [28]:
# Rebuild the row list without its first row instead of deleting in place, so
# re-running this cell cannot strip an additional (real) row.
# NOTE(review): the first row is presumably the table header - confirm.
private_universities_data = priv_tq_div.select("table tr")[1:]

In [29]:
# A few rows hold a city in the state column; map those cities to their state.
city_to_state = {
    "alwar": "rajasthan",
    "jaipur": "rajasthan",
    "fatehgarh sahib": "punjab",
    "palampur": "himachal pradesh",
    "sonipat": "haryana",
}

# Rows with an empty state cell reuse the state of the closest preceding row
# that had one. Start from None so a value left over from an earlier section
# of the notebook can never be attributed to this table.
state_name = None
for uni in private_universities_data:
    data = uni.find_all("td")
    uni_name: str = data[1].get_text().strip()
    row_state = data[2].get_text().strip().lower()
    if row_state != "":
        # Carried-over values are already normalised, so translating only
        # freshly read values gives the same result as translating every row.
        state_name = city_to_state.get(row_state, row_state)
    if state_name is None:
        # No state has been seen yet, so there is nothing to file this row under.
        logger.warning("Skipping %s: no state found for this row", uni_name)
        continue
    state_names.add(state_name)
    # setdefault creates the state's list on first sight; append in either case.
    university_state_mapping.setdefault(state_name, []).append(uni_name)
    logger.info(f"Added {uni_name} successfuly")

In [30]:
# Write an INFO record marking the end of the scraping cells; the JSON export follows.
logger.info("Successfully scraped all university data.")

In [31]:
# Persist the state -> universities mapping as a single JSON document.
with open("stateWiseUnivsersies.json", "w") as json_file:
    serialized_mapping = json.dumps(university_state_mapping)
    json_file.write(serialized_mapping)
    logger.info("Data saved to json file succesfully!")