In [1]:
import requests 
from bs4 import BeautifulSoup 
from datetime import date, datetime
import pandas as pd
from openpyxl import load_workbook
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows

In [2]:
# Base URLs of the two pages that are scraped; a fund's scheme code is
# appended to each to form the full request URL.
vr_snapshot_path = 'https://www.valueresearchonline.com/funds/newsnapshot.asp?schemecode='
vr_performance_path = 'https://www.valueresearchonline.com/funds/fundperformance.asp?schemecode='

# Folder prefix prepended to both workbook file names below
# (absolute, machine-specific path; must keep its trailing slash).
details_file_path = 'C:/Prajeen/My Docs/Equity/1.Equity Details/1.Fund Analysis/Fund Details/'

# Workbook listing the funds to scrape (its sheets carry a 'Fund Code' column).
masterList = '1.fund_master_list.xlsx'
# Workbook that receives the scraped rows and is saved at the end of the run.
graphExcel = '2.Weekly fund Graph.xlsx'
# strftime/strptime pattern (day/month/year) used for every date comparison.
dFormat = '%d/%m/%Y'

In [3]:
def readExcel(excel):
    """Load a workbook from ``details_file_path`` and index its sheets by name.

    Parameters
    ----------
    excel : str
        File name of the workbook; it is appended to ``details_file_path``.

    Returns
    -------
    tuple
        ``(fundTypes, master_wb)`` where ``fundTypes`` maps every sheet title
        to its worksheet object and ``master_wb`` is the loaded workbook.
    """
    print(excel)
    master_wb = load_workbook(details_file_path+excel)
    # ``Workbook.get_sheet_names()`` is deprecated by openpyxl in favour of
    # the ``sheetnames`` property, which returns the same list of titles.
    fundTypes = {name: master_wb[name] for name in master_wb.sheetnames}
    return fundTypes, master_wb

In [4]:
def getDfFromXl(excel):
    """Convert a worksheet into a DataFrame, using its first row as the header.

    Parameters
    ----------
    excel : object
        Any object exposing a ``values`` iterable of row tuples (for example
        an openpyxl worksheet).

    Returns
    -------
    pandas.DataFrame
        One row per data row of the sheet, with columns named after the first
        row. A sheet without any rows yields an empty DataFrame instead of
        raising ``IndexError``.
    """
    # Materialise the rows once: iterating ``values`` twice repeats the work
    # and silently loses the data when ``values`` is a one-shot iterator.
    rows = list(excel.values)
    if not rows:
        return pd.DataFrame()
    headers, content = rows[0], rows[1:]
    return pd.DataFrame(data=content, columns=headers)

In [5]:
def returnFloat(contStr, toReplace=''):
    """Parse ``contStr`` as a float, treating a lone ``'-'`` as zero.

    ``toReplace`` is removed from the text before parsing. When the remaining
    text is not a valid float, the (unmodified) text is returned instead of a
    number, so callers may receive either a ``float`` or a ``str``.
    """
    text = '0' if contStr == '-' else contStr
    cleaned = text.replace(toReplace, '')
    try:
        return float(cleaned)
    except ValueError:
        # Not numeric: hand the original text back to the caller.
        return text

In [6]:
def getValFromTable(column):
    """Return the first whitespace-stripped text fragment inside ``column``.

    ``column`` must expose a ``stripped_strings`` iterable; an ``IndexError``
    is raised when it yields nothing.
    """
    return [fragment for fragment in column.stripped_strings][0]

In [7]:
def scrapeSnapshot(fundCode, timeout=30):
    """Scrape the snapshot page of one fund.

    Parameters
    ----------
    fundCode : str
        Scheme code appended to ``vr_snapshot_path`` to build the URL.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` can block indefinitely on an unresponsive host.

    Returns
    -------
    dict
        Keys ``'Fund Name'``, ``'Category'`` and ``'Date'``, all strings taken
        from the page.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    sShotResult = requests.get(vr_snapshot_path+fundCode, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    sShotResult.raise_for_status()
    parsedSS = BeautifulSoup(sShotResult.content, 'lxml')
    snapContents = {}

    fundName = parsedSS.find('h1', class_='snapshot-fund-name')
    snapContents['Fund Name'] = list(fundName.stripped_strings)[0]

    # The category is the second text fragment of the header block.
    baseInfo = parsedSS.find('div', class_ = 'pull-left fundHeadRight')
    baseInfoStr = list(baseInfo.stripped_strings)
    snapContents['Category'] = baseInfoStr[1]

    # The date is whatever follows the words 'as on' in the first fragment.
    dateInfo = parsedSS.find('div', class_ = 'pull-left change-date')
    dateStr = list(dateInfo.stripped_strings)
    snapContents['Date'] = (dateStr[0].split('as on'))[1].strip()

    return snapContents

In [8]:
def scrapePerformance(fundCode, timeout=30):
    """Scrape the trailing returns from the performance page of one fund.

    Parameters
    ----------
    fundCode : str
        Scheme code appended to ``vr_performance_path`` to build the URL.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` can block indefinitely on an unresponsive host.

    Returns
    -------
    dict
        Keys ``'1W Ret'``, ``'1M Ret'``, ``'3M Ret'`` and ``'6M Ret'``. Each
        value is whatever ``returnFloat`` produces for the cell text.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    performResult = requests.get(vr_performance_path+fundCode, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    performResult.raise_for_status()
    parsedPerf = BeautifulSoup(performResult.content, 'lxml')

    perfInfo = parsedPerf.find_all('div', class_ = 'pull-left sectionHead margin_top15px')

    # The trailing-returns table is the fourth section when that section's
    # first header starts with 'Trailing Returns'; otherwise it is the fifth.
    candidateRows = perfInfo[3].find_all('tr')
    firstHeader = list(candidateRows[0].find('th').stripped_strings)[0]
    if firstHeader.startswith('Trailing Returns'):
        trailRetTable = candidateRows
    else:
        trailRetTable = perfInfo[4].find_all('tr')

    # The second row holds the figures; cells 3..6 feed the four return keys.
    returnsInfo = trailRetTable[1].find_all('td')
    returnColumns = (('1W Ret', 3), ('1M Ret', 4), ('3M Ret', 5), ('6M Ret', 6))
    perfContents = {
        label: returnFloat(getValFromTable(returnsInfo[cellIndex]))
        for label, cellIndex in returnColumns
    }

    return perfContents

In [9]:
def createFundLine(fundColumns, fundDict):
    """Return the values of ``fundDict`` for ``fundColumns``, in that order.

    Raises ``KeyError`` when a requested column is missing from ``fundDict``.
    """
    return [fundDict[name] for name in fundColumns]

In [10]:
def getToday():
    """Return today's local date formatted with the module-level ``dFormat``."""
    today = date.today()
    return today.strftime(dFormat)

In [11]:
def isDailyScrapingDone(sheetDf):
    """Tell whether the last row of ``sheetDf`` is already dated today.

    Parameters
    ----------
    sheetDf : pandas.DataFrame
        Frame with a ``'Date'`` column whose last entry is either a date-like
        object (``date``, ``datetime``, ``pandas.Timestamp``) or a string in
        ``dFormat``.

    Returns
    -------
    bool
        ``True`` when the last ``'Date'`` value equals today's date; ``False``
        otherwise, including when the column is empty or the value is missing.

    Raises
    ------
    ValueError
        If the last value is a string that does not match ``dFormat``.
    """
    dateVal = sheetDf['Date']
    if len(dateVal) == 0:
        return False

    # Positional access: does not depend on the index labels being 0..n-1.
    latestDate = dateVal.iloc[-1]

    if isinstance(latestDate, str):
        latestDate = datetime.strptime(latestDate, dFormat)
    elif latestDate is None or pd.isna(latestDate):
        # Covers None / NaN / NaT, none of which can be formatted.
        return False

    # ``datetime`` and ``pandas.Timestamp`` are both subclasses of ``date``,
    # so one public-API check replaces the deprecated ``pd.tslib`` and
    # private ``pd._libs.tslib`` type comparisons.
    if isinstance(latestDate, date):
        return latestDate.strftime(dFormat) == getToday()

    return False
    

In [12]:
# Driver cell: for every sheet of the master list, scrape each fund and append
#   * one row per fund to the 'evaluation' sheet, and
#   * one combined row (date + rounded returns of all funds) to the sheet of
#     the same name in the graph workbook.
print(str(datetime.now()))

fundTypes, masterFile = readExcel(masterList)
graphDetails, graphFile = readExcel(graphExcel)
evalSheet = graphDetails['evaluation']

for sheet in list(fundTypes.keys()):
    print('scraping '+ sheet)
    sheetDf = getDfFromXl(fundTypes[sheet])

    # Skip sheets whose graph tab already ends with a row dated today.
    if isDailyScrapingDone(getDfFromXl(graphDetails[sheet])):
        continue
    fundLine = []
    isDateUpdated = False

    for fundCode in sheetDf['Fund Code'].values:
        print('scraping fund '+ str(fundCode))
        fundDict = {}

        fundDict.update(scrapeSnapshot(str(fundCode)))
        fundDict.update(scrapePerformance(str(fundCode)))

        # The date is written once, as the first cell of the combined row.
        if isDateUpdated is False:
            fundLine.append(getToday())
            isDateUpdated = True

        # Graph row: returns rounded to one decimal place.
        fundLine.extend(
            round(fundDict[retKey], 1)
            for retKey in ('1W Ret', '1M Ret', '3M Ret', '6M Ret')
        )

        # Evaluation row: sheet name, run date, then the unrounded values.
        evalLine = [sheet, getToday()] + createFundLine(
            ['Category', 'Date', '1W Ret', '1M Ret', '3M Ret', '6M Ret'],
            fundDict,
        )
        evalSheet.append(evalLine)

    fundDetails = graphDetails[sheet]
    fundDetails.append(fundLine)

    # Persist as soon as a sheet is complete. Previously the only save was at
    # the very end, so a single failed request threw away every row scraped so
    # far; now finished sheets survive and are skipped on the next run.
    graphFile.save(details_file_path+graphExcel)

# Final save kept so the workbook is still rewritten when every sheet was
# skipped, exactly as before.
graphFile.save(details_file_path+graphExcel)


print(str(datetime.now()))
    

2018-12-03 12:43:35.110875
1.fund_master_list.xlsx
2.Weekly fund Graph.xlsx
scraping global
scraping fund 17132


You can access Timestamp as pandas.Timestamp


scraping fund 17439
scraping fund 17042
scraping fund 17131
scraping fund 17021
scraping fund 16027
scraping fund 12498
scraping fund 17012
scraping large & mid
scraping fund 16573
scraping fund 16581
scraping fund 15845
scraping fund 16194
scraping fund 16770
scraping fund 16266
scraping fund 16267
scraping fund 16569
scraping multi
scraping fund 15990
scraping fund 16580
scraping fund 16507
scraping fund 34711
scraping fund 15684
scraping fund 16324
scraping fund 25144
scraping fund 17140
scraping fund 16575
scraping fund 15714
scraping tech
scraping fund 16018
scraping fund 31368
scraping fund 15878
scraping fund 17526
scraping fund 15864
scraping pharma
scraping fund 16150
scraping fund 31352
scraping fund 15932
scraping fund 16327
scraping value
scraping fund 30921
scraping fund 16175
scraping fund 16498
scraping fund 15879
scraping fund 16341
scraping fund 16952
scraping fund 15989
scraping fund 16719
scraping elss
scraping fund 16200
scraping fund 16739
scraping fund 16493
scrap