In [13]:
import requests as req
from bs4 import BeautifulSoup
import time
from datetime import datetime

import pandas as pd
import csv

In [2]:
def request_url():
    ''' get the responses from the concated urls
    
    returns:
        return the responses from the url,list
    '''
    
    ##concate the url to scrape
    responses = []
    url_port = 'http://kaijiang.500.com/shtml/ssq/'
    url_query = '.shtml?0_ala_baidu'
    minimum_weeks = 149
    from_year = '2005' ## 2 digits year
    for year in range(int(from_year),2020):
        for period in range(1,minimum_weeks):
            year_str = str(year)[-2:]
            if period < 10: ## period number <10
                period_str = '00'+str(period)
            elif period <100: ## period number <100
                period_str = '0' + str(period)
            else:            ## period number > 100
                period_str = str(period)
            ##concate the url
            url = url_port + year_str + period_str + url_query
            ## make the request
            time.sleep(5) ## slepp for a while
            try:
                yield req.get(url) ## make the request and have it back as an obj
                
            except :
                print('something wrong with connection')
#         print(f'{year} has been requested')

#     return responses

In [18]:
def scrapy_data(responses):
    '''scrapy data and yield a item
    
    Args:
        request response object
    
    Return:
        A well set of dictionary value container,dictionary
        
    Raise:
    
    '''
    redbals = [] ## container to save the ball
    
    for res in responses:
        ##decode the byte content in a manner of chinese reconginzable
        res_text = res.content.decode('gb18030','ignore')
        soup = BeautifulSoup(res_text,'html.parser') ## parse the text to bs object
        ## get the red ball and blue ball
        balls_red = soup.find_all('li',attrs = {'class': 'ball_red'})
        for ball_red in balls_red:
            number = ball_red.text
            redbals.append(number)
        ball_blue = soup.find('li',attrs = {'class':'ball_blue'}).text
        ## get the date
        date = soup.find('td',attrs ={'class':'td_title01'}) ## father tag
        date_text = date.find('span',attrs ={'class':'span_right'}).text ## text
        try: ##decoding error might occur
            open_date = date_text.split(' ')[0].split('：')[-1].replace('年','/').replace('月','/').replace('日','')
            close_dat = date_text.split('：')[-1].replace('年','/').replace('月','/').replace('日','')
            ## get money spend or deposited
            money = soup.find_all('span',attrs ={'class':'cfont1'})
            money_spend = money[0].text.replace('元','').replace(',','') ##spended
            money_deposit= money[1].text.replace('元','').replace(',','') ## deposited
        except ValueError as e:
            print(e)
        ## collect the data inside a dictionary
        finally:
            yield {
                'red_balls':redbals,
                'blue_balls':ball_blue,
                'open_date':open_date,
                'money_spend':money_spend,
                'money_deposit':money_deposit
            }
        
        

In [4]:
# def add_data_tog(data_dicts):
#     '''save data to the list
#     '''
    
#     return [data for data in data_dicts]


In [5]:
def data_cleaner(data_dict):
    '''
    preprocess the dict in a manner of
    to generate a df column as
    column names as =['year','month','weekday','day','money_spend','money_deposit']
    
    Args:
        data_dict, dictionary
    
    Returns:
        dataframe, as mentioned column names
    
    Raise: 
        KeyError
    '''
    defined_cols = ['year','month','weekday','day','money_spend','money_deposit','balls']
    cleaned_data = pd.DataFrame([data_dict])
    ##conver 'open_date' to datatime object
    datetime_obj = cleaned_data['open_date'].apply(lambda x:datetime.strptime(x,'%Y/%m/%d'))
    ##get the year,month,day,weekday
    cleaned_data['year'] = [date.year for date in datetime_obj]
    cleaned_data['month'] = [date.month for date in datetime_obj]
    cleaned_data['day'] = [date.day for date in datetime_obj]
    cleaned_data['weekday'] = [date.weekday() for date in datetime_obj]
    ##get the balls
    cleaned_data['balls'] = cleaned_data['red_balls']
    ## slice the dataframe
    cleaned_data = cleaned_data[defined_cols]
    
    return cleaned_data

In [None]:
def data_preprocess(jakpot_number):
    
    '''function to scrape the previous double
    color ball and in a featuring cols 
    and plot the ball number
    '''
    url = 'http://kaijiang.500.com/shtml/ssq/18148.shtml?0_ala_baidu'
    responses = request_url() ##get the url request generator
    data_generator = scrapy_data(responses) ## scrapy the data
    file_name = 'numbers.csv' ##file name to save data
    for data in data_generator:
#         print(data)
        cleaned_data = data_cleaner(data)
      
        
        with open(file_name, 'w+') as csv_file: #save data to a csv file..
            csv_writer = csv.writer(csv_file, delimiter=',')
            csv_writer.writerow(cleaned_data)
#         jakpot_number.append(cleaned_data)
#         break
#     jakpot_number = add_data_tog(data_generator) ## save data to jakpot number
#     jakpot_ax = plot_jakpot_agg(jakpot_agg) ## shown data with such plot
#     jakpot_train,jakpot_valid,jakpot_test = preprocess(jakpot_agg) ## in a manner of preprocess,year,month,week and day
#     jakpot_model = model_fit(jakpot_train,jakpot_valid) ## feed data to a neural network model and validated
#     jakpot_model_evlued = jakpot_model_eva(jakpot_test) ## test data on the finally model

In [8]:
def plot_scatter(mask_data):
    '''
    plot the data in a scatter way
    '''
    pass

In [9]:
def plot_hist(mask_data):
    '''
        plot the data in a histgram
    '''
    pass

In [10]:
def plot_pie(mask_data):
    '''
        plot the data in a pie
    '''
    pass

In [11]:
def plot_balls(ball_dataframe,start_date,end_date):
    '''
    select start,year,month,day to show inside the plot
    such plot can be displayed, pieplot,histogram,scatter plot
    and so on
    
    Args:
        couple of tuples for start_date and end_date in a sequence(year,month,day)
        
    Returns:
        matplotlib ax saved inside a dictionary,Axes
    
    Raise:
        ValueError occured while end year is smaller then start year
    '''
    ax_plot = {} ##plot saving dictionary
    ## mask the dataframe according to the date 
    mask_start = (ball_dataframe['year']>year & ball_dataframe['month']>month & ball_dataframe['day']>day)
    mask_end = (ball_dataframe['year']<year & ball_dataframe['month']<month & ball_dataframe['day']<day)
    if year > end_year:  ##check data sequence
        raise ValueError(f'year{year} must be smaller than end year {end_year}')
    else:
        mask = mask_start & mask_end ## have the overlapped mask
        if mask.any():
            mask_data = ball_dataframe[mask] ## get the data
            ax_pie = plot_pie(mask_data) ## plot the pie ball
            ax_hist = plot_hist(mask_data) ## plot the histgram
            ax_scatter = plot_scatter(mask_data) ## plot the scatter 
            ax_plot['ax_pie'] = ax_pie ##save plot to dict
            ax_plot['ax_hist'] = ax_hist ##save plot to dict
            ax_plot['ax_scatter'] = ax_scatter ##save plot to dict
        else:
            print('Sorry,there is no data between your selection year')
    
    return ax_plot

In [19]:
def main():
    ##
    jakpot_number = []##in such a dict, peirod:{data:'',reward_num:'','deposited_from_last_period'}
    data_preprocess(jakpot_number) #precedding the data preprocess
    plot_balls(ball_dataframe,star_date,end_date) ## plot data in a manner
    

In [None]:
main()

In [None]:
if __name__ == '__main__':
    main()

something wrong with connection


Exception ignored in: <generator object request_url at 0x7ff8ad57ccf0>
RuntimeError: generator ignored GeneratorExit


something wrong with connection
something wrong with connection
something wrong with connection
something wrong with connection
something wrong with connection
