In [None]:
'''
In this problem, you will capture data that includes unstructured text and you will serve as a consultant analyzing this unstructured data using Natural Language Processing tools.
There are many choices as to how you may do this. 

Minimum specifications:
Identify a company to serve in this assignment. Their data should be part of the dataset you capture. Either...
Find an API for download, or... 

A website where you can scrape a significant number of records (say 500 scraping, 1000 API) of text and other data about your company/product/service.
Join at least two different datasets together (these could be multiple pages of web scraping or separate API requests for a product versus its competitor).

Create an analysis of unstructured data for your company/product and competitors using Python:

Help us to understand what is being said about your company/product in the unstructured data (I will demo both API and screen scraping).
Use natural language processing transformers to better access the MEANING of what the users are saying (i.e., Word2Vec or Doc2Vec models).
Create visuals and write explanations in a professional report that shows your findings.
'''
import pandas as pd
import matplotlib.pyplot as plt
import os
import statsmodels.api as sm
from googleapiclient.discovery import build


# Manage workspaces via Git
# The project is run from two different Windows profiles, each with its own
# data folder. Characters 9-13 of the working directory are compared against
# 'vgwis' (presumably the user-name segment right after "C:\Users\" -- verify
# if the checkout location changes); anything else falls back to the other
# profile's folder.
current_directory = os.getcwd()
directory = (
    "C:/Users/vgwis/Documents/Bana780/"
    if current_directory[9:14] == 'vgwis'
    else "C:/Users/vgwcto/Documents/Python/Bana780/"
)

# Store API key locally
# key.csv is read as a table and the key is taken from its first column
# *name*, i.e. the first cell of the file's header row.
api_key = pd.read_csv(directory + 'key.csv').columns[0]

# Build a Function to easily search new ideas
def multiple_page_search(api_key, query, max_results = 50, max_pages = 10, cache_file = None):
    """Collect video descriptions for a YouTube search query.

    Each page of search results is followed by a ``videos().list`` lookup on
    the returned video ids, and the ``snippet.description`` of every video in
    that lookup is collected.

    Parameters
    ----------
    api_key : str
        Developer key passed to the YouTube Data API v3 client.
    query : str
        Search phrase sent as the ``q`` parameter.
    max_results : int, default 50
        Value sent as ``maxResults`` on every search request (page size).
    max_pages : int, default 10
        Upper bound on the number of result pages requested.
    cache_file : str or None, default None
        Optional CSV path. If the file already exists it is loaded and
        returned without contacting the API; otherwise the freshly collected
        descriptions are written to it.

    Returns
    -------
    pandas.DataFrame
        A single-column frame named ``'Description'``, one row per video.

    Notes
    -----
    Rows loaded from the cache pass through ``pd.read_csv`` with its default
    settings, so an empty description comes back as NaN rather than ``''``.
    """

    # Check if cache exists; a hit skips the API entirely
    if cache_file and os.path.exists(cache_file):
        return pd.read_csv(cache_file)

    # Initialize the YouTube API client, variables
    youtube = build('youtube', 'v3', developerKey = api_key)
    descriptions = []
    next_page_token = None
    page_count = 0

    while page_count < max_pages:
        # Construct and run the search request for the current page
        response = youtube.search().list(
            q = query,
            part = 'snippet',
            type = 'video',
            maxResults = max_results,
            pageToken = next_page_token
        ).execute()

        # Collect video IDs from the response to retrieve full descriptions
        video_ids = [item['id']['videoId'] for item in response.get('items', [])]

        # Skip the lookup for an empty page: there is nothing to ask for
        if video_ids:
            video_response = youtube.videos().list(
                part = 'snippet',
                id = ','.join(video_ids)
            ).execute()
            descriptions.extend(
                item['snippet']['description']
                for item in video_response.get('items', [])
            )

        page_count += 1

        # Stop at the last page. Without this, a missing token would be sent
        # as pageToken=None on the next iteration, which requests the first
        # page again and appends the same descriptions a second time.
        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break

    # Create a DataFrame from the collected descriptions, save to cache file
    df = pd.DataFrame(descriptions, columns = ['Description'])

    if cache_file:
        df.to_csv(cache_file, index = False)

    return df

# One cache file per search term, so re-running the script reuses the saved
# results instead of querying the API again.
A_cache_file = f"{directory}A_cache.csv"
B_cache_file = f"{directory}B_cache.csv"

# Dataset A: Mandarin-learning videos; dataset B: Spanish-learning videos.
A_df = multiple_page_search(
    api_key,
    'Learn to Speak Mandarin',
    cache_file = A_cache_file,
)
B_df = multiple_page_search(
    api_key,
    'Learn to Speak Spanish',
    cache_file = B_cache_file,
)

# Place the two description columns side by side, aligned on the row index.
description_columns = {
    'Mandarin': A_df['Description'],
    'Spanish': B_df['Description'],
}
merged = pd.DataFrame(description_columns)

# Spot-check: show the Spanish description stored under row label 1.
print(merged.loc[1, 'Spanish'])
