In [39]:
#THERESA BRONS, MARCH 2021
import pandas as pd
import datetime
from datetime import date, timedelta
from collections import defaultdict
from dateutil import relativedelta
import os
import glob
from authorize_creds import authorize_creds

# --- Run configuration -------------------------------------------------------
# NOTE(review): SITE_URL and PROJECT_NAME are not defined anywhere in the visible
# code; presumably they are placeholders to replace with your own values.
sc_site = SITE_URL                # Site URL passed as siteUrl on every API query
num_days = 481                    # Number of days of history to extract

creds = 'client_secret_desktop.json'   # NOTE(review): presumably the OAuth client-secrets file -- confirm
output = 'gsc_data'
project_name = PROJECT_NAME

search_types = ['web','image','video']

# STORAGE_FOLDER is where previously gathered data lives; the dates already
# collected are read from the CSV file names found there.
# Do NOT include the search type: granular_extract appends '/<search_type>/'
# itself, so the data for every search type is kept under this one folder.
STORAGE_FOLDER = '' # If you already store results as CSV, point this at that folder
# The output folder will usually be the same as the storage folder.
OUTPUT_FOLDER = STORAGE_FOLDER


        

# Create Function to execute your API Request
def execute_request(service, property_uri, request):
    """Run one Search Analytics query and return the API response.

    Args:
        service: Client object exposing ``searchanalytics().query(...)``.
        property_uri: Site URL forwarded as the ``siteUrl`` argument.
        request: Request body forwarded as the ``body`` argument.

    Returns:
        Whatever ``execute()`` on the prepared query returns.
    """
    prepared_query = service.searchanalytics().query(
        siteUrl=property_uri,
        body=request,
    )
    return prepared_query.execute()
 
# Create function to write to CSV
def write_to_csv(data,filename):
    """Write ``data`` to ``filename`` as CSV, appending if the file already exists.

    Args:
        data: DataFrame to persist.
        filename: Destination path of the CSV file.

    A new file gets a header row; an existing file is appended to without
    repeating the header. The index is never written.
    """
    if os.path.isfile(filename):
        # File is already there: add the rows below the existing content.
        data.to_csv(filename, mode='a', header=False, index=False)
        return
    data.to_csv(filename, index=False)
        
def get_files(path):
    """Return the full paths of all ``*.csv`` files directly inside ``path``.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        A list of strings of the form ``<resolved directory>/<file name>``.
        The list is empty when ``path`` is not an existing directory, so a
        first run against a folder that has not been created yet simply
        reports "nothing stored".

    The process working directory is never changed: the directory is globbed
    in place rather than by ``chdir``-ing into it and back.
    """
    if not os.path.isdir(path):
        return []

    directory = os.path.realpath(path)
    # Escape the directory part so characters such as '[' in a folder name are
    # matched literally and only the '*.csv' tail acts as a wildcard.
    pattern = os.path.join(glob.escape(directory), '*.csv')
    return [directory + '/' + os.path.basename(match) for match in glob.glob(pattern)]
        
# Find the dates that have already been processed, taken from the names of the CSV files in the folder.
def get_dates_from_csv(path):
    """Return the set of date strings embedded in the CSV file names under ``path``.

    For every path returned by ``get_files(path)`` the ten characters just
    before the four-character extension (slice ``[-14:-4]``) are taken, which
    is the ``YYYY-MM-DD`` part of names such as ``GSC_web_0_2021-03-19.csv``.
    """
    seen_dates = set()
    for csv_path in get_files(path):
        seen_dates.add(csv_path[-14:-4])
    return seen_dates

 
    
# Create function to extract all the data
def _rows_to_frame(rows):
    """Turn one page of API rows into a DataFrame with the stored column formats."""
    columns = defaultdict(list)
    for row in rows:
        columns['date'].append(row['keys'][0] or 0)
        columns['page'].append(row['keys'][1] or 0)
        columns['query'].append(row['keys'][2] or 0)
        columns['clicks'].append(row['clicks'] or 0)
        columns['ctr'].append(row['ctr'] or 0)
        columns['impressions'].append(row['impressions'] or 0)
        columns['position'].append(row['position'] or 0)

    df = pd.DataFrame(data=columns)
    df['clicks'] = df['clicks'].astype('int')
    df['ctr'] = df['ctr'] * 100
    df['impressions'] = df['impressions'].astype('int')
    df['position'] = df['position'].round(2)
    return df


def granular_extract(creds, num_days, search_type, storage_folder = STORAGE_FOLDER, output_folder = OUTPUT_FOLDER):
    """Download day-by-day Search Console data for one search type into CSV files.

    The window ends three days before today and starts ``num_days`` earlier.
    Each day is queried on its own with dimensions date/page/query, in pages
    of up to 25,000 rows. Every page is written to
    ``<storage_folder>/<search_type>/GSC_<search_type>_<page>_<date>.csv``.
    Days whose date already appears in a CSV file name in that folder are
    skipped.

    Args:
        creds: Accepted for interface compatibility; not used by this
            function (``authorize_creds()`` is called without arguments).
        num_days: Length of the extraction window, in days.
        search_type: Value sent as ``searchType``; also the subfolder name.
        storage_folder: Base folder holding the per-search-type subfolders.
            Files are both looked up and written here.
        output_folder: Accepted for interface compatibility; not used.

    Returns:
        None. The function returns early if writing a CSV file fails.
    """
    full_path = storage_folder + '/' + search_type + '/'

    current_dates = get_dates_from_csv(full_path)

    webmasters_service = authorize_creds()

    # Set up dates: the window is [end_date - num_days, end_date].
    end_date = datetime.date.today() - relativedelta.relativedelta(days=3)
    start_date = end_date - relativedelta.relativedelta(days=num_days)
    delta = datetime.timedelta(days=1)  # Walk the window one day at a time

    max_rows = 25000  # Maximum 25K rows per call

    while start_date <= end_date:
        day = start_date.strftime('%Y-%m-%d')

        if current_dates is not None and day in current_dates:
            print('Existing Date: %s' % start_date)
            start_date += delta
            continue

        print('Start date at beginning: %s' % start_date)

        iteration_num = 0  # Page counter, part of the output file name
        num_rows = 0       # Rows fetched so far for this day (next startRow)
        finished = False

        while not finished:
            request = {
                'startDate': day,
                'endDate': day,
                'dimensions': ['date', 'page', 'query'],
                'rowLimit': max_rows,
                'searchType': search_type,
                'startRow': num_rows
            }

            response = execute_request(webmasters_service, sc_site, request)

            # A day (or a trailing page) without data carries no 'rows' entry.
            rows = response.get('rows') or []

            if rows:
                # Build the frame from THIS page only: each page goes to its
                # own file, so carrying rows over from earlier pages would
                # duplicate them on disk.
                try:
                    df = _rows_to_frame(rows)
                except (KeyError, IndexError, TypeError) as e:
                    # Malformed row: report it and give up on this day.
                    print('error occurred at %i' % num_rows)
                    print(e)
                    break

                filename = (full_path + 'GSC_' + search_type + '_' + str(iteration_num)
                            + '_' + str(start_date) + '.csv')
                print('writing file: ' + filename)
                try:
                    print('writing to csv')
                    write_to_csv(df, filename)
                    iteration_num += 1
                except Exception as e:
                    print(filename)
                    print(e)
                    return

            print('Numrows at the start of loop: %i' % num_rows)
            num_rows += len(rows)
            print('Numrows at the end of loop: %i' % num_rows)

            # A short (or empty) page means there is nothing left to fetch.
            finished = len(rows) < max_rows

        start_date += delta
        print('Start date at end: %s' % start_date)
    return



Existing Date: 2019-11-25
Existing Date: 2019-11-26
Existing Date: 2019-11-27
Existing Date: 2019-11-28
Existing Date: 2019-11-29
Existing Date: 2019-11-30
Existing Date: 2019-12-01
Existing Date: 2019-12-02
Existing Date: 2019-12-03
Existing Date: 2019-12-04
Existing Date: 2019-12-05
Existing Date: 2019-12-06
Existing Date: 2019-12-07
Existing Date: 2019-12-08
Existing Date: 2019-12-09
Existing Date: 2019-12-10
Existing Date: 2019-12-11
Existing Date: 2019-12-12
Existing Date: 2019-12-13
Existing Date: 2019-12-14
Existing Date: 2019-12-15
Existing Date: 2019-12-16
Existing Date: 2019-12-17
Existing Date: 2019-12-18
Existing Date: 2019-12-19
Existing Date: 2019-12-20
Existing Date: 2019-12-21
Existing Date: 2019-12-22
Existing Date: 2019-12-23
Existing Date: 2019-12-24
Existing Date: 2019-12-25
Existing Date: 2019-12-26
Existing Date: 2019-12-27
Existing Date: 2019-12-28
Existing Date: 2019-12-29
Existing Date: 2019-12-30
Existing Date: 2019-12-31
Existing Date: 2020-01-01
Existing Dat

writing file: C:/Users/there/Anaconda3/WhoAreYouMadeOf/granular_data/web/_2021-03-19.csv
writing to csv
Numrows at the start of loop: 0
Numrows at the end of loop: 7850
Start date at end: 2021-03-20
Start date at beginning: 2021-03-20
writing file: C:/Users/there/Anaconda3/WhoAreYouMadeOf/granular_data/web/_2021-03-20.csv
writing to csv
Numrows at the start of loop: 0
Numrows at the end of loop: 8057
Start date at end: 2021-03-21
Existing Date: 2019-11-25
Existing Date: 2019-11-26
Existing Date: 2019-11-27
Existing Date: 2019-11-28
Existing Date: 2019-11-29
Existing Date: 2019-11-30
Existing Date: 2019-12-01
Existing Date: 2019-12-02
Existing Date: 2019-12-03
Existing Date: 2019-12-04
Existing Date: 2019-12-05
Existing Date: 2019-12-06
Existing Date: 2019-12-07
Existing Date: 2019-12-08
Existing Date: 2019-12-09
Existing Date: 2019-12-10
Existing Date: 2019-12-11
Existing Date: 2019-12-12
Existing Date: 2019-12-13
Existing Date: 2019-12-14
Existing Date: 2019-12-15
Existing Date: 2019-

writing file: C:/Users/there/Anaconda3/WhoAreYouMadeOf/granular_data/image/_2021-03-20.csv
writing to csv
Numrows at the start of loop: 0
Numrows at the end of loop: 3842
Start date at end: 2021-03-21
Existing Date: 2019-11-25
Existing Date: 2019-11-26
Existing Date: 2019-11-27
Existing Date: 2019-11-28
Existing Date: 2019-11-29
Existing Date: 2019-11-30
Existing Date: 2019-12-01
Existing Date: 2019-12-02
Existing Date: 2019-12-03
Existing Date: 2019-12-04
Existing Date: 2019-12-05
Existing Date: 2019-12-06
Existing Date: 2019-12-07
Existing Date: 2019-12-08
Existing Date: 2019-12-09
Existing Date: 2019-12-10
Existing Date: 2019-12-11
Existing Date: 2019-12-12
Existing Date: 2019-12-13
Existing Date: 2019-12-14
Existing Date: 2019-12-15
Existing Date: 2019-12-16
Existing Date: 2019-12-17
Existing Date: 2019-12-18
Existing Date: 2019-12-19
Existing Date: 2019-12-20
Existing Date: 2019-12-21
Existing Date: 2019-12-22
Existing Date: 2019-12-23
Existing Date: 2019-12-24
Existing Date: 2019

writing file: C:/Users/there/Anaconda3/WhoAreYouMadeOf/granular_data/video/_2021-03-20.csv
writing to csv
Numrows at the start of loop: 0
Numrows at the end of loop: 65
Start date at end: 2021-03-21
