In [24]:
from lxml import etree
import urllib3
import requests
import re
import pandas as pd
import numpy as np
import io
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

url = "https://www.rbi.org.in/scripts/Annualpolicy.aspx"
path = r'C://Users//raina//Dropbox//Chengjun//Text Analysis//'

In [25]:
source_dict = np.load(path+'yearly_page_source.npy',allow_pickle=True).item()

In [26]:
header = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Mobile Safari/537.36",
    "Connection": "keep-alive",
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7"}

def get_html(header,url):
    r = requests.get(url)
    print(r)
    r_text = r.text
    html=etree.HTML(r_text)
    return html

In [27]:
def help_filter(s):
    return re.sub(r'[^a-zA-Z0-9]', '', s)

def get_file_info(html,path):
    file_info = html.xpath(path)

    file_info = list(filter(lambda x: help_filter(x) != "",file_info))
    file_info = [re.sub(r' +', ' ', x) for x in file_info ]
    target = ["Press Release","Full Document","Governor's Press Statement"]
    count = 0
    IsPreIn = 0
    new_file_info = []
    while count<len(file_info):
        file_info[count] = file_info[count].strip(" ")
        # cur in target
        if file_info[count] in target:
            # pre not in target
            if IsPreIn==0:
                new_file_info = new_file_info[:-1]
                new_file_info.append(file_info[count-1] +"-"+ file_info[count])
            # pre in target
            else:
                new_file_info.append(file_info[count-2] +"-"+ file_info[count])
        else:
            new_file_info.append(file_info[count])
        
        count+=1
    return new_file_info

In [28]:
def get_file_dict(file_info):
    data_dict = {
        "date":[],
        "title":[],
        "file_name":[]
    }
    date_pos = [i for i,info in enumerate(file_info) if len(info) == 12]
    
    
    for i in range(len(date_pos)-1):
        files_num = (date_pos[i+1]-date_pos[i])-2
        data_dict['date']+= [file_info[date_pos[i]]]*files_num
        data_dict['title']+= [file_info[date_pos[i]+1]]*files_num
        data_dict['file_name']+= file_info[date_pos[i]+2:date_pos[i]+2+files_num]

    files_num = (len(file_info)-date_pos[-1])-2
    data_dict['date']+= [file_info[date_pos[-1]]]*files_num
    data_dict['title']+= [file_info[date_pos[-1]+1]]*files_num
    data_dict['file_name']+= file_info[date_pos[-1]+2:]

    return data_dict

In [49]:
def concat_link(s):
    if "http" not in s.lower():
        if s[0].islower():
            return "https://www.rbi.org.in/Scripts/"+s
        else:
            return "https://www.rbi.org.in/scripts/"+s
    return s

def get_script_list(year_id):
    #source_dict['2023-2024']
    html = etree.HTML(source_dict[year_id])
    file_info = get_file_info(html,'//*[@class="brd-ptable"]//td//text()')
    df = pd.DataFrame(get_file_dict(file_info))

    link = html.xpath('//*//td//a//@href')
    link = link[:-1]
    link = link+[None]*(len(df)-len(link))

    df['file_link'] = link
    df['file_link'] = df['file_link'].apply(lambda s:concat_link(s))
    df['isScript'] = df['file_link'].apply(lambda s: 0 if (s == None) or ("script" not in s.lower()) else 1)

    df_script = df[df.isScript == 1].reset_index(drop = True)

    df_script.to_csv(path+"script_list//{}.csv".format(year_id),index=0)
    
    return df

In [52]:


# Get all script lists and save as csv
year_id = "2014-2015"
temp = get_script_list(year_id)

temp

ValueError: Length of values (38) does not match length of index (36)

In [174]:
#get_script_list("2013-2014")
year_id = "2005-2006"
html = etree.HTML(source_dict[year_id])
file_info = get_file_info(html,'//*[@class="brd-ptable"]//td//text()')
df = pd.DataFrame(get_file_dict(file_info))

df

Unnamed: 0,date,title,file_name
0,"Jan 24, 2006",Third Quarter Review of Annual Monetary Policy...,"Statement by Dr. Y. Venugopal Reddy, Governor,..."
1,"Jan 24, 2006",Third Quarter Review of Annual Monetary Policy...,Full Statement
2,"Jan 24, 2006",Third Quarter Review of Annual Monetary Policy...,Webcasting of Governor's Press Conference
3,"Jan 23, 2006",Macroeconomic and Monetary Developments - Thir...,Press Release-Full Document
4,"Oct 25, 2005",Mid-Term Review of Annual Policy Statement 200...,RBI Governor announces Mid-Term Review of Annu...
5,"Oct 25, 2005",Mid-Term Review of Annual Policy Statement 200...,Full Statement
6,"Oct 25, 2005",Mid-Term Review of Annual Policy Statement 200...,Webcasting of Governor's Press Conference
7,"Oct 24, 2005",Macroeconomic and Monetary Developments - Mid ...,Press Release-Full Document
8,"Jul 26, 2005",First Quarter Review of Annual Statement on Mo...,First Quarter Review of Annual Statement on Mo...
9,"Jul 26, 2005",First Quarter Review of Annual Statement on Mo...,Full Statement


In [175]:

link = html.xpath('//*//td//a//@href')
link = link[:-1]
link = link+[None]*(len(df)-len(link))
len(link)

17

# Download

In [8]:
def download_pdf(header,save_path,pdf_name,pdf_url):
    response = requests.get(pdf_url, headers=header)
    bytes_io = io.BytesIO(response.content)
    with open(save_path + "%s.PDF" % pdf_name, mode='wb') as f:
        f.write(bytes_io.getvalue())
        print('%s.PDF, Download Successed！' % (pdf_name))

In [9]:
def download_txt(text,txt_name,txt_path):
    with open(txt_path+txt_name,'w') as f:
        f.write(str(text))
    print('%s.Text, Download Successed！' % (txt_name))

In [196]:
ID = "2000-2002"

In [200]:
df = pd.read_csv("C://Users//raina//Dropbox//Chengjun//Text Analysis//script_list//%s.csv"%ID)
for i in range(4,len(df)):
    print(i+1)
    file_url = df['file_link'][i]
    file_html = get_html(header,file_url)
    
    file_title_info = file_html.xpath('//*[@class="tableheader"]//text()')
    file_content = file_html.xpath('//*[@class="tablecontent1"]//p//text()')
    pdf_url = file_html.xpath('//*[@class="tableheader"]//a//@href')[0]
    
    path = r'C://Users//raina//Dropbox//Chengjun//Text Analysis//download_scrips//%s//'%ID
    
    name= str(i+1)+"-["+df['date'][i]+"-"+df['title'][i]+"]"+" "+df['file_name'][i]
    
    if len(name)>170:
        name = name[:-(len(name)-170)]
        
        
    download_pdf(header,path, name, pdf_url = pdf_url )
    #download_txt(file_content,name,path)

5
<Response [200]>
5-[Apr 19, 2001-Macroeconomic and Monetary Developments in 2000-2001] Macroeconomic and Monetary Developments in 2000-2001-Press Release.PDF, Download Successed！
6
<Response [200]>


IndexError: list index out of range

In [None]:
len(df)