Import the necessary libraries for scraping

In [79]:
import requests
from bs4 import BeautifulSoup
from dateutil import parser
import re
from datetime import datetime
import pprint
import json

## **Functions**

Functions used within the scraper

In [81]:
def get_dob(text):
  '''
  Extract a date of birth from a free-form birth string.

  An ISO ``YYYY-MM-DD`` substring is preferred when one is present; otherwise
  the whole string is handed to dateutil's fuzzy parser.

  :param text: birth text such as "(1943-06-02) 2 June 1943 (age 78)", or None
  :return: a ``datetime.date`` (e.g. 1943-06-02), or None when ``text`` is None
           or no date can be recovered from it
  '''
  if text is None:
    return None

  iso_match = re.search(r'\d{4}-\d{2}-\d{2}', text)
  if iso_match is not None:
    try:
      return datetime.strptime(iso_match.group(), '%Y-%m-%d').date()
    except ValueError:
      # The digits look like an ISO date but are not a real one (e.g. month 13).
      # Fall through to the fuzzy parser instead of aborting the whole run.
      pass

  try:
    # .date() keeps the return type identical to the ISO branch above.
    return parser.parse(text, fuzzy=True).date()
  except Exception:
    # Best effort by design: any parsing failure means "no usable date".
    return None

def scrape_tamil_page(link, timeout=30): # returns English Link, Intro, Content, Profession, Music formats and Instruments played
  '''
  Scrape a Tamil Wikipedia biography page.

  The text of <p> elements that are direct children of the article body is
  split at the first <h2>: paragraphs before it form the introduction, the
  rest form the content. Three optional fields are read from the infobox.

  :param link: URL of the Tamil page (https://ta.wikipedia.org/wiki/...), or None
  :param timeout: seconds to wait for the HTTP response before giving up
  :return: (english_link, intro, content, professions, music_formats, instruments_played);
           content is None when no paragraph follows the first <h2>, and the
           three lists are empty when the page has no recognised infobox.
           When ``link`` is None the result is (None, None, None, [], [], []).
  :raises TypeError: the page has no English interlanguage link, or no
           "mw-parser-output" div
  :raises AttributeError: a matching infobox row has a <th> but no <td>
  '''
  if link is None:
    # Same arity as the normal return so callers can always unpack six values.
    return None, None, None, [], [], []

  response = requests.get(url=link, timeout=timeout)
  soup = BeautifulSoup(response.content, 'html.parser')

  # Subscripting None raises TypeError when there is no English version of the
  # page; callers catch that to report such pages, so it is left as is.
  english_link = soup.find('a', {'class': "interlanguage-link-target", 'lang': "en"})['href']
  rows = soup.find('div', class_="mw-parser-output")

  intro_parts, content_parts = [], []
  in_intro = True
  for row in rows:
    if row.name == 'h2':
      in_intro = False
    if row.name == 'p':
      (intro_parts if in_intro else content_parts).append(row.text)

  intro = "".join(intro_parts)
  content = "".join(content_parts) or None

  professions, music_formats, instruments_played = [], [], []

  # Infobox class strings are tried in priority order; the first hit wins.
  infobox = None
  for css_class in ("infobox vcard plainlist", "infobox biography vcard", "infobox vcard", "infobox vevent"):
    infobox = soup.find('table', class_=css_class)
    if infobox is not None:
      break

  if infobox is not None:
    for item in infobox.find_all('tr'):
      header = item.find('th')
      if header is None:
        continue
      label = header.text
      # Header labels: "profession(s)", "work", "known for", "profession"
      if label in ['தொழில்(கள்)', 'பணி', 'அறியப்படுவது', 'தொழில்']:
        professions = item.find('td').text.strip().split(",")
      # Header label: "music forms"
      if label in ['இசை வடிவங்கள்']:
        music_formats = item.find('td').text.strip().split(",")
      # Header labels: "instrument(s)", "instrument"
      if label in ['இசைக்கருவி(கள்)', 'இசைக்கருவி']:
        instruments_played = item.find('td').text.strip().split(",")

  return english_link, clean(intro), clean(content), professions, music_formats, instruments_played

def _split_birth(birth):
  '''Split infobox "Born" text into (place_of_birth, date_of_birth) at its last digit.'''
  if birth is None:
    return None, None

  # Scan backwards for the last numeric character; index 0 is not examined.
  cut = None
  for i in range(len(birth) - 1, 0, -1):
    if birth[i].isnumeric():
      cut = i
      break

  if cut is None:
    # No digit at all: treat the whole text as a place.
    return birth, None

  # Nothing after the last digit means there is no place, so report None
  # rather than an empty string.
  return birth[cut + 1:] or None, birth[:cut + 1]


def scrape_english_page(english_link, timeout=30): # Returns place of birth, date of birth, years active and image url
  '''
  Scrape birth details, active years and the infobox image from an English
  Wikipedia page.

  :param english_link: https://en.wikipedia.org/wiki/Ilaiyaraaja
  :param timeout: seconds to wait for the HTTP response before giving up
  :return: (place_of_birth, date_of_birth, years_active, image), e.g.
           "Pannaipuram, Madura District, Madras Presidency, British India",
           "2 June 1943 (age 78)", "1976–present", url_to_image.
           Any element is None when it cannot be found; the first three are all
           None when the page has no recognised infobox.
  :raises TypeError: the infobox image cell exists but holds no <img>
  '''
  response = requests.get(url=english_link, timeout=timeout)
  soup = BeautifulSoup(response.content, 'html.parser')
  birth, years_active, image = None, None, None

  image_cell = soup.find('td', class_="infobox-image")
  if image_cell is not None:
    # Drop the two leading characters of the src value (the "//" of a
    # protocol-relative URL).
    image = image_cell.find('img')['src'][2:]

  # Infobox class strings are tried in priority order; the first hit wins.
  infobox = None
  for css_class in ("infobox vcard plainlist", "infobox biography vcard", "infobox vcard", "infobox vevent"):
    infobox = soup.find('table', class_=css_class)
    if infobox is not None:
      break

  if infobox is None:
    # No infobox: return empty fields instead of raising AttributeError,
    # which callers do not catch.
    return (None, None, None, image)

  for item in infobox.find_all('tr'):
    header = item.find('th')
    cell = item.find('td')
    if header is None or cell is None:
      continue

    # .string is None when the header holds nested markup; such rows are skipped.
    label = header.string
    if label is None:
      continue

    if label == "Born":
      # Strip bracketed footnote markers such as "[1]".
      birth = re.sub(r'\[.*?\]+', '', cell.text)
    if all(word in label.lower() for word in ('years', 'active')):
      years_active = cell.text

  place_of_birth, date_of_birth = _split_birth(birth)

  return (place_of_birth, date_of_birth, years_active, image)

In [82]:
def clean(text):
  '''
  Remove bracketed markers (such as "[1]") and newline characters from text.

  :param text: string to tidy, or None
  :return: the tidied string; None is passed through unchanged
  '''
  if text is None:
    return text

  without_markers = re.sub(r'\[.*?\]+', '', text)
  return without_markers.replace('\n', '')

## **List of Tamil Singers**

The web page below is scraped first to obtain a list of famous singers.

In [83]:
# Download the Tamil Wikipedia page that serves as the singer index (the URL's
# percent-encoded title reads "List of Tamil film singers") and parse its HTML.
response = requests.get(url="https://ta.wikipedia.org/wiki/%E0%AE%A4%E0%AE%AE%E0%AE%BF%E0%AE%B4%E0%AF%8D%E0%AE%A4%E0%AF%8D_%E0%AE%A4%E0%AE%BF%E0%AE%B0%E0%AF%88%E0%AE%AA%E0%AF%8D%E0%AE%AA%E0%AE%9F%E0%AE%AA%E0%AF%8D_%E0%AE%AA%E0%AE%BE%E0%AE%9F%E0%AE%95%E0%AE%B0%E0%AF%8D%E0%AE%95%E0%AE%B3%E0%AE%BF%E0%AE%A9%E0%AF%8D_%E0%AE%AA%E0%AE%9F%E0%AF%8D%E0%AE%9F%E0%AE%BF%E0%AE%AF%E0%AE%B2%E0%AF%8D")
soup = BeautifulSoup(response.content, "html.parser")

In [84]:
# Every anchor inside the article body becomes one entry, keyed by its link
# text. The two Tamil keys ("first film", "debut year") start out as None.
results = soup.find('div', class_="mw-parser-output").find_all('a')
x = {
    anchor.string: {
        'tamil_link': "https://ta.wikipedia.org" + anchor['href'],
        'முதல் படம்': None,
        'அறிமுக ஆண்டு': None,
    }
    for anchor in results
}

In [85]:
# Pretty-print the collected entries for a manual sanity check.
pprint.pprint(x)

{'உதித் நாராயண்': {'tamil_link': 'https://ta.wikipedia.org/wiki/%E0%AE%89%E0%AE%A4%E0%AE%BF%E0%AE%A4%E0%AF%8D_%E0%AE%A8%E0%AE%BE%E0%AE%B0%E0%AE%BE%E0%AE%AF%E0%AE%A3%E0%AF%8D',
                   'அறிமுக ஆண்டு': None,
                   'முதல் படம்': None},
 'உன்னிகிருஷ்ணன்': {'tamil_link': 'https://ta.wikipedia.org/wiki/%E0%AE%89%E0%AE%A9%E0%AF%8D%E0%AE%A9%E0%AE%BF%E0%AE%95%E0%AE%BF%E0%AE%B0%E0%AF%81%E0%AE%B7%E0%AF%8D%E0%AE%A3%E0%AE%A9%E0%AF%8D',
                    'அறிமுக ஆண்டு': None,
                    'முதல் படம்': None},
 'உன்னிமேனன்': {'tamil_link': 'https://ta.wikipedia.org/wiki/%E0%AE%89%E0%AE%A9%E0%AF%8D%E0%AE%A9%E0%AE%BF%E0%AE%AE%E0%AF%87%E0%AE%A9%E0%AE%A9%E0%AF%8D',
                'அறிமுக ஆண்டு': None,
                'முதல் படம்': None},
 'எஸ். பி. பாலசுப்பிரமணியம்': {'tamil_link': 'https://ta.wikipedia.org/wiki/%E0%AE%8E%E0%AE%B8%E0%AF%8D._%E0%AE%AA%E0%AE%BF._%E0%AE%AA%E0%AE%BE%E0%AE%B2%E0%AE%9A%E0%AF%81%E0%AE%AA%E0%AF%8D%E0%AE%AA%E0%AE%BF%E0%AE%B0%E0%AE%AE%E0%AE%A3%E0%

In [86]:
# Hand corrections to the scraped list.
# Remove one entry outright (the reason is not recorded in this notebook).
del x['தாராபுரம் சுந்தரராஜன்']
# Point two entries at titles carrying a "(singer)" suffix
# (presumably the bare titles lead to a different or disambiguation page; verify).
x['ஸ்ரீநிவாஸ்']['tamil_link'] = 'https://ta.wikipedia.org/wiki/%E0%AE%B8%E0%AF%8D%E0%AE%B0%E0%AF%80%E0%AE%A8%E0%AE%BF%E0%AE%B5%E0%AE%BE%E0%AE%B8%E0%AF%8D_(%E0%AE%AA%E0%AE%BE%E0%AE%9F%E0%AE%95%E0%AE%B0%E0%AF%8D)'
x['ஹரிஹரன்']['tamil_link'] = 'https://ta.wikipedia.org/wiki/%E0%AE%B9%E0%AE%B0%E0%AE%BF%E0%AE%B9%E0%AE%B0%E0%AE%A9%E0%AF%8D_(%E0%AE%AA%E0%AE%BE%E0%AE%9F%E0%AE%95%E0%AE%B0%E0%AF%8D)'

In [87]:
# Fill each singer entry with the fields scraped from its Tamil page.
# Pages that fail are only reported; their entry keeps whatever it had.
for key, entry in x.items():
  tamil_link = entry['tamil_link']

  if tamil_link is None:
    # Nothing to scrape: store empty placeholders.
    (entry['english_link'], entry['intro'], entry['content'],
     entry['profession'], entry['music_formats'], entry['instruments_played']) = None, None, None, [], [], []
    continue

  try:
    (entry['english_link'], entry['intro'], entry['content'],
     entry['profession'], entry['music_formats'], entry['instruments_played']) = scrape_tamil_page(tamil_link)
  except TypeError:
    print(key, tamil_link)
  except UnboundLocalError:
    print("Unbound" ,key, tamil_link)
  except AttributeError:
    print("Attribute", key, tamil_link)

மது பாலகிருஷ்ணன் https://ta.wikipedia.org/wiki/%E0%AE%AE%E0%AE%A4%E0%AF%81_%E0%AE%AA%E0%AE%BE%E0%AE%B2%E0%AE%95%E0%AE%BF%E0%AE%B0%E0%AF%81%E0%AE%B7%E0%AF%8D%E0%AE%A3%E0%AE%A9%E0%AF%8D


In [88]:
# Manual pass for the one singer whose page was reported by the scraping loop.
# Only the introduction (the <p> elements before the first <h2>) is collected;
# the remaining fields are stored as empty placeholders.
singer = x['மது பாலகிருஷ்ணன்']

response = requests.get(singer['tamil_link'])
soup = BeautifulSoup(response.content, 'html.parser')

intro_parts = []
for row in soup.find('div', class_="mw-parser-output"):
  if row.name == 'h2':
    # Everything after the first section heading is ignored.
    break
  if row.name == 'p':
    intro_parts.append(row.text)

singer.update({
    'intro': "".join(intro_parts),
    'content': None,
    'english_link': None,
    'profession': [],
    'music_formats': [],
    'instruments_played': [],
})

In [89]:
# Add birth details, active years and image from each singer's English page.
for entry in x.values():
  english_link = entry['english_link']

  if english_link is None:
    # No English page known: store None for every field.
    entry['place_of_birth'], entry['date_of_birth'], entry['years_active'], entry['image'] = None, None, None, None
    continue

  try:
    (entry['place_of_birth'], entry['date_of_birth'],
     entry['years_active'], entry['image']) = scrape_english_page(english_link)
  except TypeError:
    # Report the page and move on; this entry gets no birth fields.
    print(english_link)

In [90]:
# Hand-entered birth place, birth date, active years and image URL for the
# singer whose english_link was set to None in the manual pass above.
x['மது பாலகிருஷ்ணன்']['place_of_birth'], x['மது பாலகிருஷ்ணன்']['date_of_birth'], x['மது பாலகிருஷ்ணன்']['years_active'], x['மது பாலகிருஷ்ணன்']['image'] = 'Thiruvananthapuram, Kochi, Kerala, India', '24 June 1974','1995–present', 'upload.wikimedia.org/wikipedia/commons/thumb/c/c4/Madhu_Balakrishnan.jpg/220px-Madhu_Balakrishnan.jpg'

## **இசையமைப்பாளர்களின் பட்டியல்**

In [91]:
# Download the Tamil Wikipedia page whose percent-encoded title reads
# "List of music composers" and parse its HTML.
response = requests.get(url='https://ta.wikipedia.org/wiki/%E0%AE%87%E0%AE%9A%E0%AF%88%E0%AE%AF%E0%AE%AE%E0%AF%88%E0%AE%AA%E0%AF%8D%E0%AE%AA%E0%AE%BE%E0%AE%B3%E0%AE%B0%E0%AF%8D%E0%AE%95%E0%AE%B3%E0%AE%BF%E0%AE%A9%E0%AF%8D_%E0%AE%AA%E0%AE%9F%E0%AF%8D%E0%AE%9F%E0%AE%BF%E0%AE%AF%E0%AE%B2%E0%AF%8D')
soup = BeautifulSoup(response.content, 'html.parser')

In [92]:
# Build one entry per table row on the composer list page, keyed by the text
# of the link in the first column.
results = soup.find_all('tbody')
out = {}

for table_body in results:
  # The first <tr> of each table is skipped (presumably the header row).
  for table_row in table_body.find_all("tr")[1:]:
    cells = table_row.find_all("td")
    name_anchor = cells[0].find('a')
    name = name_anchor.string

    # Links whose URL contains "redlink" are stored as None instead of a link.
    tamil_link = "https://ta.wikipedia.org" + name_anchor['href']
    if 'redlink' in tamil_link:
      tamil_link = None

    # Column 2 holds the first film; the first link in column 3 is the debut
    # year. Any further link in column 3 is ignored.
    out[name] = {
        'tamil_link': tamil_link,
        'முதல் படம்': cells[1].find('a').string,
        'அறிமுக ஆண்டு': cells[2].find_all('a')[0].string,
    }

In [93]:
# Hand correction: point this composer at the title carrying a "(composer)"
# suffix (presumably the bare title leads to a different page; verify).
out['தேவா']['tamil_link'] = 'https://ta.wikipedia.org/wiki/%E0%AE%A4%E0%AF%87%E0%AE%B5%E0%AE%BE_(%E0%AE%87%E0%AE%9A%E0%AF%88%E0%AE%AF%E0%AE%AE%E0%AF%88%E0%AE%AA%E0%AF%8D%E0%AE%AA%E0%AE%BE%E0%AE%B3%E0%AE%B0%E0%AF%8D)'

In [94]:
# Fill each composer entry with the fields scraped from its Tamil page.
# Unlike the singer loop, scraping errors are not caught here.
for entry in out.values():
  tamil_link = entry['tamil_link']

  if tamil_link is None:
    # No Tamil page: store empty placeholders.
    scraped = (None, None, None, [], [], [])
  else:
    scraped = scrape_tamil_page(tamil_link)

  (entry['english_link'], entry['intro'], entry['content'],
   entry['profession'], entry['music_formats'], entry['instruments_played']) = scraped

In [95]:
# Add birth details, active years and image from each composer's English page.
for entry in out.values():
  english_link = entry['english_link']

  if english_link is None:
    # No English page known: store None for every field.
    entry['place_of_birth'], entry['date_of_birth'], entry['years_active'], entry['image'] = None, None, None, None
    continue

  try:
    (entry['place_of_birth'], entry['date_of_birth'],
     entry['years_active'], entry['image']) = scrape_english_page(english_link)
  except TypeError:
    # Report the page and move on; this entry gets no birth fields.
    print(english_link)

In [96]:
# Merge singers and composers. Composer entries are applied last, so they win
# when a name appears in both dictionaries.
final = dict(x)
final.update(out)

# Flatten every entry into a record with Tamil field names: name, introduction,
# content, first film, debut year, date of birth, birthplace, active period,
# profession, music forms, instruments.
json_list = []
for key, record in final.items():
  try:
    json_list.append({
      'பெயர்': key,
      'அறிமுகம்': record['intro'],
      'உள்ளடக்கம்': record['content'],
      'முதல் படம்': record['முதல் படம்'],
      'அறிமுக ஆண்டு': record['அறிமுக ஆண்டு'],
      'பிறந்த திகதி': record['date_of_birth'],
      'பிறப்பிடம்': record['place_of_birth'],
      'செயற்பாட்டுக் காலம்': record['years_active'],
      'தொழில்' : record['profession'],
      'இசை வடிவங்கள்' : record['music_formats'],
      'இசைக்கருவிகள்' : record['instruments_played']
    })
  except KeyError:
    # The entry is missing a scraped field: report its name and leave it out.
    print(key)

# Normalise the date of birth. When no date can be parsed, the raw scraped
# text stays in place and only the second field is set to None.
for item in json_list:
  dob = get_dob(item['பிறந்த திகதி'])
  if dob is not None:
    item['பிறந்த திகதி'] = dob.strftime('%d/%m/%Y')
    # NOTE(review): this key means "date of birth in Tamil", but %B emits the
    # month name of the current locale. Confirm a translation happens later.
    item['பிறந்த திகதி தமிழில்'] = dob.strftime('%d %B %Y')
  else:
    item['பிறந்த திகதி தமிழில்'] = None

# Strip parenthesised/bracketed fragments and stray ")" from the birthplace.
# A raw string keeps the regex escapes from being read as (invalid) string
# escapes; the pattern itself is unchanged.
bracketed_fragment = re.compile(r"[\(\[].*?[\)\]]")
for item in json_list:
  if item['பிறப்பிடம்'] is not None:
    item['பிறப்பிடம்'] = bracketed_fragment.sub("", item['பிறப்பிடம்']).replace(")", "")

In [102]:
# Number of records that will be written out.
len(json_list)

117

In [101]:
# Write the records as indented UTF-8 JSON, keeping Tamil text unescaped.
with open('data/famous_people_raw_final.json', 'w', encoding='utf-8') as f:
  f.write(json.dumps(json_list, ensure_ascii=False, indent=4))