# **INFO5731 Assignment 2**

In this assignment, you will work on gathering text data from an open data source via web scraping or API. Following this, you will need to clean the text data and perform syntactic analysis on the data. Follow the instructions carefully and design well-structured Python programs to address each question.

**Expectations**:
*   Use the provided .*ipynb* document to write your code & respond to the questions. Avoid generating a new file.
*   Write complete answers and run all the cells before submission.
*   Make sure the submission is "clean"; *i.e.*, no unnecessary code cells.
*   Once finished, allow shared rights from top right corner (*see Canvas for details*).

* **Make sure to submit the cleaned data CSV in the comment section - 10 points**


# Question 1 (25 points)

Write a python program to collect text data from **either of the following sources** and save the data into a **csv file:**

(1) Collect all the customer reviews of a product (you can choose any porduct) on amazon. [atleast 1000 reviews]

(2) Collect the top 1000 User Reviews of a movie recently in 2023 or 2024 (you can choose any movie) from IMDB. [If one movie doesn't have sufficient reviews, collect reviews of atleast 2 or 3 movies]


(3) Collect the **abstracts** of the top 10000 research papers by using the query "machine learning", "data science", "artifical intelligence", or "information extraction" from Semantic Scholar.

(4) Collect all the information of the 904 narrators in the Densho Digital Repository.

(5)**Collect a total of 10000 reviews** of the top 100 most popular software from G2 and Capterra.


In [16]:
# First install required packages
!apt-get update
!apt install chromium-chromium-driver
!pip install selenium
!pip install webdriver_manager

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException

import pandas as pd
import time
from webdriver_manager.chrome import ChromeDriverManager

def scrape_imdb_reviews(url, max_reviews=1000):
    # Configure Chrome options for Colab environment
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.6943.98 "
    )

    # Initialize webdriver for Colab
    driver = webdriver.Chrome(options=chrome_options)

    all_reviews = []

    try:
        driver.get(url)
        # Wait for the initial review articles to load
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "article")))

        # Click the "All" dropdown button if available to load full review details
        try:
            all_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'ipc-see-more__button')]"))
            )
            driver.execute_script("arguments[0].scrollIntoView(true);", all_button)
            time.sleep(0.5)
            all_button.click()
            time.sleep(2)
        except (TimeoutException, ElementClickInterceptedException) as e:
            print(f"Issue with 'All' button: {str(e)}. Continuing without clicking it.")

        while len(all_reviews) < max_reviews:
            # Scroll to bottom to prompt lazy-loading of reviews
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

            # Get current review articles
            review_elements = driver.find_elements(By.TAG_NAME, "article")
            current_count = len(review_elements)

            for review in review_elements:
                if len(all_reviews) >= max_reviews:
                    break
                try:
                    # Get rating
                    try:
                        rating = review.find_element(By.CLASS_NAME, "ipc-rating-star--rating").text
                    except NoSuchElementException:
                        rating = None

                    # Get title
                    try:
                        title = review.find_element(By.CLASS_NAME, "ipc-title__text").text.replace(" \nExpand", "")
                    except Exception:
                        title = ""

                    # Handle spoiler content
                    try:
                        spoiler_button = review.find_element(By.CLASS_NAME, "review-spoiler-button")
                        driver.execute_script("arguments[0].scrollIntoView(true);", spoiler_button)
                        time.sleep(0.2)
                        spoiler_button.click()
                        time.sleep(0.5)
                    except Exception:
                        pass

                    # Get author and date
                    try:
                        author_info = review.find_element(By.CLASS_NAME, "iHZNcU")
                        author = author_info.find_element(By.CLASS_NAME, "ipc-link").text
                        date = author_info.find_element(By.CLASS_NAME, "review-date").text
                    except Exception:
                        author = ""
                        date = ""

                    # Get helpful votes
                    try:
                        helpful_votes = review.find_element(By.CLASS_NAME, "ipc-voting__label__count--up").text
                    except NoSuchElementException:
                        helpful_votes = ""

                    review_data = {
                        'rating': rating,
                        'title': title,
                        'author': author,
                        'date': date,
                        'helpful_votes': helpful_votes
                    }

                    # Avoid duplicates
                    if review_data not in all_reviews:
                        all_reviews.append(review_data)
                        print(f"Scraped review {len(all_reviews)}: {title[:50]}...")

                except Exception as e:
                    print(f"Error scraping a review: {str(e)}")
                    continue

            if len(all_reviews) >= max_reviews:
                break

            # Load more reviews
            try:
                load_more = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'ipc-see-more__button')]"))
                )
                driver.execute_script("arguments[0].scrollIntoView(true);", load_more)
                time.sleep(0.5)
                driver.execute_script("arguments[0].click();", load_more)
                WebDriverWait(driver, 10).until(lambda d: len(d.find_elements(By.TAG_NAME, "article")) > current_count)
                time.sleep(1)
            except Exception as e:
                print(f"Could not load more reviews: {str(e)}")
                break

        # Save the data
        if all_reviews:
            df = pd.DataFrame(all_reviews)
            df.to_csv('barbie_imdb_reviews.csv', index=False)
            print(f"\nSuccessfully scraped {len(all_reviews)} reviews.")
            print("Data saved to 'barbie_imdb_reviews.csv'.")

    except Exception as e:
        print(f"Error during scraping: {str(e)}")
        if all_reviews:
            df = pd.DataFrame(all_reviews)
            df.to_csv('barbie_imdb_reviews_partial.csv', index=False)
            print("Saved partial results to 'barbie_imdb_reviews_partial.csv'.")

    finally:
        driver.quit()

# Usage
if __name__ == "__main__":
    url = "https://www.imdb.com/title/tt1517268/reviews/"
    scrape_imdb_reviews(url, max_reviews=1000)

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [Connecting to archive.ubuntu.com (185.125.190.83)] [1 InRelease 14.2 kB/129 kB 11%] [Connecting                                                                                                     Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
0% [Waiting for headers] [1 InRelease 129 kB/129 kB 100%] [Connecting to cloud.r-project.org] [Waiti0% [Waiting for headers] [Connecting to cloud.r-project.org] [Waiting for headers] [Connected to ppa                                                                                                    Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:5 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 htt

# Question 2 (15 points)

Write a python program to **clean the text data** you collected in the previous question and save the clean data in a new column in the csv file. The data cleaning steps include: [Code and output is required for each part]

(1) Remove noise, such as special characters and punctuations.

(2) Remove numbers.

(3) Remove stopwords by using the stopwords list.

(4) Lowercase all texts

(5) Stemming.

(6) Lemmatization.

In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download required NLTK data (only needs to be done once)
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

try:
    nltk.data.find('wordnet')
except LookupError:
    nltk.download('wordnet')

def clean_text(text):
    # Check if the input is a string, if not return an empty string
    if not isinstance(text, str):
        return ''

    # 1. Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # 2. Remove user @ references and '#' from hashtags
    text = re.sub(r'\@\w+|\#','', text)

    # 3. Remove noise, special characters, and punctuation, but keep apostrophes
    text = re.sub(r"[^a-zA-Z\s']", '', text)

    # 4. Remove numbers
    text = re.sub(r'\d+', '', text)

    # 5. Lowercasing
    text = text.lower()

    # 6. Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])

    # 7. Lemmatization
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

    # 8. Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # 9. Title case (capitalize first letter of each word)
    text = text.title()

    return text

# Load the data
df = pd.read_csv('barbie_imdb_reviews.csv')

# Handle missing values in 'title' column by filling with an empty string
df['title'] = df['title'].fillna('')

# Apply the cleaning function to the 'title' column and create a new column 'cleaned_title'
df['cleaned_title'] = df['title'].apply(clean_text)

# Save the updated DataFrame to a new CSV file
df.to_csv('barbie_imdb_reviews_cleaned.csv', index=False)

print(df[['title', 'cleaned_title']].head(12))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


                                                title  \
0                                    Too heavy handed   
1                      Beautiful film, but so preachy   
2   Amazing Cast & Set, but the political message ...   
3                                     A Hot Pink Mess   
4                         Boring, mind-numbing drivel   
5                        People are missing the point   
6                                   It was depressing   
7     Could Have Been Great. 2nd Half Brings It Down.   
8   The marketing was more entertaining than the a...   
9                          Somewhat of a Jumbled Mess   
10                      Strong Start... and That's It   
11                                          OVERHYPED   

                                cleaned_title  
0                                Heavy Handed  
1                      Beautiful Film Preachy  
2   Amazing Cast Set Political Message Strong  
3                               Hot Pink Mess  
4                 

# Question 3 (15 points)

Write a python program to **conduct syntax and structure analysis of the clean text** you just saved above. The syntax and structure analysis includes:

(1) **Parts of Speech (POS) Tagging:** Tag Parts of Speech of each word in the text, and calculate the total number of N(oun), V(erb), Adj(ective), Adv(erb), respectively.

(2) **Constituency Parsing and Dependency Parsing:** print out the constituency parsing trees and dependency parsing trees of all the sentences. Using one sentence as an example to explain your understanding about the constituency parsing tree and dependency parsing tree.

(3) **Named Entity Recognition:** Extract all the entities such as person names, organizations, locations, product names, and date from the clean texts, calculate the count of each entity.
```python


In [2]:
import pandas as pd
import nltk
from nltk import pos_tag, word_tokenize
from nltk.tree import Tree
import spacy
import subprocess  # Import the subprocess module

# Load the spacy model
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Downloading en_core_web_sm model...")
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")


def syntax_analysis(df, text_column='cleaned_title'):
    """
    Performs syntax and structure analysis on the text data.

    Args:
      df (pd.DataFrame): DataFrame containing the text data.
      text_column (str): The name of the column containing the text data.

    Returns:
      pd.DataFrame: The input DataFrame with added POS tags, constituency parsing, dependency parsing and named entities.
    """

    # 1. Parts of Speech (POS) Tagging
    def get_pos_counts(text):
        """ Tags parts of speech and calculates counts for Noun, Verb, Adjective, Adverb. """
        try:
            tokens = word_tokenize(text)
            tagged = pos_tag(tokens)
            counts = {'Noun': 0, 'Verb': 0, 'Adjective': 0, 'Adverb': 0}
            for word, tag in tagged:
                if tag.startswith('N'):
                    counts['Noun'] += 1
                elif tag.startswith('V'):
                    counts['Verb'] += 1
                elif tag.startswith('J'):
                    counts['Adjective'] += 1
                elif tag.startswith('R'):
                    counts['Adverb'] += 1
            return counts
        except Exception as e:
            print(f"POS Tagging Error: {e}")
            return {'Noun': 0, 'Verb': 0, 'Adjective': 0, 'Adverb': 0}  # Return zeros on error


    df['pos_counts'] = df[text_column].apply(get_pos_counts)

    # 2. Constituency Parsing and Dependency Parsing using SpaCy
    def get_parse_trees(text):
        """ Generates constituency and dependency parse trees using SpaCy. """
        try:
            doc = nlp(text)

            # Dependency Parsing Tree
            dep_tree = [(token.text, token.dep_, token.head.text) for token in doc]

            # Constituency Parsing (using SpaCy's sentence structure)
            constituency_tree = []
            for chunk in doc.noun_chunks:
                constituency_tree.append((chunk.text, chunk.root.dep_, chunk.root.head.text))

            return constituency_tree, dep_tree
        except Exception as e:
            print(f"Parsing Error: {e}")
            return [], []


    df[['constituency_tree', 'dependency_tree']] = df[text_column].apply(lambda x: pd.Series(get_parse_trees(x)))

    # 3. Named Entity Recognition
    def get_named_entities(text):
        """ Extracts named entities and counts occurrences. """
        try:
            doc = nlp(text)
            entities = {}
            for ent in doc.ents:
                if ent.label_ in entities:
                    entities[ent.label_] += 1
                else:
                    entities[ent.label_] = 1
            return entities
        except Exception as e:
            print(f"NER Error: {e}")
            return {}

    df['named_entities'] = df[text_column].apply(get_named_entities)
    return df


# Load the cleaned data
cleaned_df = pd.read_csv('barbie_imdb_reviews_cleaned.csv')

# Perform syntax analysis
analyzed_df = syntax_analysis(cleaned_df)

# Display the results for the first row

print("Example Row Analysis:")
print("----------------------")
print("Original Title:", analyzed_df['title'][0])
print("Cleaned Title:", analyzed_df['cleaned_title'][0])
print("POS Counts:", analyzed_df['pos_counts'][0])
print("Constituency Tree:", analyzed_df['constituency_tree'][0])
print("Dependency Tree:", analyzed_df['dependency_tree'][0])
print("Named Entities:", analyzed_df['named_entities'][0])

# Example Explanation (using the first sentence)
print("\nExample Explanation:")
print("--------------------")
example_sentence = analyzed_df['cleaned_title'][0]
print("Example Sentence:", example_sentence)

print("\nDependency Parsing Example:")
print("The dependency parsing tree represents the relationships between words in the sentence.  Each word is connected to another word (its head) by a directed edge, representing the type of dependency.")
example_dep_tree = analyzed_df['dependency_tree'][0]
print(example_dep_tree)
print("""
    For example, ('Heavy', 'amod', 'Handed') means that the word 'Heavy' is an adjectival modifier (amod) of the word 'Handed'.
""")

print("\nConstituency Parsing Example:")
print("The constituency parsing tree divides the sentence into constituents (phrases).")
example_const_tree = analyzed_df['constituency_tree'][0]
print(example_const_tree)
print("""
    Here, each tuple shows the phrases and dependencies of the sentences.
""")

# Save the analyzed DataFrame to a new CSV file
analyzed_df.to_csv('barbie_imdb_reviews_analyzed.csv', index=False)
print("\nAnalysis saved to 'barbie_imdb_reviews_analyzed.csv'")



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************

POS Tagging Error: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************

POS Tagging Error: 
*****

# **Following Questions must answer using AI assitance**

#Question 4 (20 points).

Q4. (PART-1)
Web scraping data from the GitHub Marketplace to gather details about popular actions. Using Python, the process begins by sending HTTP requests to multiple pages of the marketplace (1000 products), handling pagination through dynamic page numbers. The key details extracted include the product name, a short description, and the URL.

 The extracted data is stored in a structured CSV format with columns for product name, description, URL, and page number. A time delay is introduced between requests to avoid server overload. ChatGPT can assist by helping with the parsing of HTML, error handling, and generating reports based on the data collected.

 The goal is to complete the scraping within a specified time limit, ensuring that the process is efficient and adheres to GitHub’s usage guidelines.

(PART -2)

1.   **Preprocess Data**: Clean the text by tokenizing, removing stopwords, and converting to lowercase.

2. Perform **Data Quality** operations.


Preprocessing:
Preprocessing involves cleaning the text by removing noise such as special characters, HTML tags, and unnecessary whitespace. It also includes tasks like tokenization, stopword removal, and lemmatization to standardize the text for analysis.

Data Quality:
Data quality checks ensure completeness, consistency, and accuracy by verifying that all required columns are filled and formatted correctly. Additionally, it involves identifying and removing duplicates, handling missing values, and ensuring the data reflects the true content accurately.


In [3]:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from nltk.corpus import stopwords

import random
# Base URL of GitHub Marketplace (Actions section)
BASE_URL = "https://github.com/marketplace?type=actions&page="

# Headers to mimic a browser request.  Added Accept-Encoding
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Referer": "https://www.google.com/",
    "Accept-Encoding": "gzip, deflate, br",  # Critical:  Tell the server we accept compressed responses
}

# Initialize list to store data
data = []
page = 0
max_retries = 3
total_scraped = 0
max_products = 1000

# Scrape multiple pages (Assuming 40 products per page, we need ~25 pages for 1000 products)
while total_scraped < max_products:
    page += 1
    url = f"{BASE_URL}{page}"
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
            break  # If the request was successful, break out of the retry loop
        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch page {page} (attempt {attempt + 1}/{max_retries}): {e}")
            if attempt < max_retries - 1:
                time.sleep(5 * (attempt + 1))  # Exponential backoff
            else:
                print(f"Failed to retrieve page {page} after {max_retries} attempts.")
                continue # Skip to the next page if all retries fail

    soup = BeautifulSoup(response.text, "html.parser")

    # Find all product containers
    items = soup.find_all('div', {'data-testid': 'non-featured-item'})

    num_items_scraped = 0
    for item in items:
        try:
            name = item.find('a', class_='marketplace-common-module__marketplace-item-link--jrIHf').text.strip()
            product_url = "https://github.com" + item.find('a')['href']
            description = item.find('p', class_='text-small').text.strip()
            data.append([name, description, product_url, page])
            num_items_scraped+=1
        except AttributeError as e:
            print(f"AttributeError parsing item on page {page}: {e}")
        except Exception as e:
            print(f"Unexpected error parsing item on page {page}: {e}")

    total_scraped += num_items_scraped

    if num_items_scraped == 0:
        print(f'Page {page} contain no more actions. Stopping...')
        break # There is not more actions in the github market place

    print(f"Scraped {num_items_scraped} products from page {page}, Total {total_scraped} actions")
    time.sleep(random.uniform(3, 7))  # Be very polite

# Save to CSV
df = pd.DataFrame(data, columns=["Product Name", "Description", "URL", "Page Number"])

df.to_csv("github_marketplace_actions.csv", index=False)
print("Scraping completed! Data saved to github_marketplace_actions.csv")



Scraped 20 products from page 1, Total 20 actions
Scraped 20 products from page 2, Total 40 actions
Scraped 20 products from page 3, Total 60 actions
Scraped 20 products from page 4, Total 80 actions
Scraped 20 products from page 5, Total 100 actions
Scraped 20 products from page 6, Total 120 actions
Scraped 20 products from page 7, Total 140 actions
Page 8 contain no more actions. Stopping...
Scraping completed! Data saved to github_marketplace_actions.csv


Github MarketPlace page:
https://github.com/marketplace?type=actions

In [None]:
# Add these imports at the top
from nltk.stem import WordNetLemmatizer
import re
import pandas as pd
from tqdm import tqdm

# Initialize NLP resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """
    Comprehensive text preprocessing pipeline with error handling
    """
    try:
        if pd.isna(text):
            return ""

        # Clean text
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
        text = re.sub(r'\W+', ' ', text)  # Remove special characters
        text = re.sub(r'\d+', '', text)  # Remove numbers
        text = text.lower().strip()

        # Tokenization and lemmatization
        tokens = word_tokenize(text)
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

        return ' '.join(tokens)

    except Exception as e:
        print(f"Error processing text: {e}")
        return ""

def ensure_data_quality(df):
    """
    Comprehensive data quality checks and cleaning
    """
    # 1. Handle missing values
    print("\nData Quality Report:")
    print("Initial shape:", df.shape)

    # Critical columns check
    critical_cols = ['Product Name', 'URL']
    df = df.dropna(subset=critical_cols, how='any')

    # 2. Remove duplicates
    dup_count = df.duplicated(subset=['URL']).sum()
    print(f"Removing {dup_count} duplicate entries")
    df = df.drop_duplicates(subset=['URL'], keep='first')

    # 3. Validate URLs
    url_pattern = r'^https?://github\.com/.*'
    valid_urls = df['URL'].str.contains(url_pattern, na=False)
    print(f"Found {len(df) - valid_urls.sum()} invalid URLs")
    df = df[valid_urls]

    # 4. Clean text columns
    print("Processing text columns...")
    tqdm.pandas(desc="Cleaning Descriptions")
    df['Cleaned Description'] = df['Description'].progress_apply(preprocess_text)

    # 5. Final check
    print("\nFinal Data Quality Check:")
    print("Missing values per column:")
    print(df.isna().sum())
    print("\nData types:")
    print(df.dtypes)
    print("\nFinal shape:", df.shape)

    return df

# Add these lines AFTER creating the initial DataFrame but BEFORE saving to CSV
print("\nStarting data quality checks and preprocessing...")

# Run data quality pipeline
df_clean = ensure_data_quality(df)

# Save cleaned data
df_clean.to_csv("github_marketplace_actions_cleaned.csv", index=False)
print("\nCleaned data saved to github_marketplace_actions_cleaned.csv")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ashwiksagi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ashwiksagi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ashwiksagi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



Starting data quality checks and preprocessing...

Data Quality Report:
Initial shape: (140, 4)
Removing 0 duplicate entries
Found 0 invalid URLs
Processing text columns...


Cleaning Descriptions: 100%|██████████| 140/140 [00:00<00:00, 4964.55it/s]

Error processing text: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/ashwiksagi/nltk_data'
    - '/Users/ashwiksagi/anaconda3/envs/asu_bot_env/nltk_data'
    - '/Users/ashwiksagi/anaconda3/envs/asu_bot_env/share/nltk_data'
    - '/Users/ashwiksagi/anaconda3/envs/asu_bot_env/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************

Error processing text: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtai




#Question 5 (20 points)

PART 1:
Web Scrape  tweets from Twitter using the Tweepy API, specifically targeting hashtags related to subtopics (machine learning or artificial intelligence.)
The extracted data includes the tweet ID, username, and text.

Part 2:
Perform data cleaning procedures

A final data quality check ensures the completeness and consistency of the dataset. The cleaned data is then saved into a CSV file for further analysis.


**Note**

1.   Follow tutorials provided in canvas to obtain api keys. Use ChatGPT to get the code. Make sure the file is downloaded and saved.
2.   Make sure you divide GPT code as shown in tutorials, dont make multiple requestes.


In [None]:
import tweepy
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Twitter API credentials (replace with your own keys)
api_key = '
api_secret_key = ''
access_token = '4037298725'
access_token_secret = ''

# Authenticate to Twitter
auth = tweepy.OAuthHandler(api_key, api_secret_key)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

# Function to clean tweet text
def clean_tweet_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Remove user mentions and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Lowercase the text
    text = text.lower()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    text = ' '.join([word for word in tokens if word not in stop_words])
    return text

# Function to scrape tweets
def scrape_tweets(hashtag, max_tweets=100):
    tweets_data = []
    for tweet in tweepy.Cursor(api.search_tweets, q=hashtag, lang="en", tweet_mode='extended').items(max_tweets):
        tweet_id = tweet.id_str
        username = tweet.user.screen_name
        text = tweet.full_text
        cleaned_text = clean_tweet_text(text)
        tweets_data.append([tweet_id, username, text, cleaned_text])
    return tweets_data

# Scrape tweets with hashtags #machinelearning and #artificialintelligence
tweets_ml = scrape_tweets('#machinelearning', max_tweets=25)
tweets_ai = scrape_tweets('#artificialintelligence', max_tweets=20)

# Combine the data
tweets_data = tweets_ml + tweets_ai

# Create a DataFrame
df = pd.DataFrame(tweets_data, columns=['Tweet ID', 'Username', 'Text', 'Cleaned Text'])

# Save to CSV
df.to_csv('tweets_data.csv', index=False)
print("Scraping and cleaning completed! Data saved to tweets_data.csv")

Forbidden: 403 Forbidden
453 - You currently have access to a subset of X API V2 endpoints and limited v1.1 endpoints (e.g. media post, oauth) only. If you need access to this endpoint, you may need a different access level. You can learn more here: https://developer.x.com/en/portal/product

In [None]:
import tweepy
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import time
import random
import nltk

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Twitter API v2 configuration
client = tweepy.Client(bearer_token)

def clean_tweet_text(text):
    """Enhanced text cleaning with lemmatization"""
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+|#\w+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower().strip()

    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    return ' '.join(tokens)

def handle_rate_limits(response):
    """Handle rate limits using Twitter API headers"""
    if response.meta:
        remaining = int(response.meta.get('remaining', 1))
        reset_time = int(response.meta.get('reset', time.time() + 900))

        if remaining < 2:
            sleep_duration = reset_time - time.time() + random.uniform(1, 5)
            if sleep_duration > 0:
                print(f"Rate limit approaching. Sleeping {sleep_duration:.1f} seconds")
                time.sleep(max(sleep_duration, 15))

def scrape_tweets(query, max_tweets=100):
    """Improved tweet scraper with pagination and rate limit handling"""
    tweets_data = []
    next_token = None
    retries = 3

    while max_tweets > 0 and retries > 0:
        try:
            response = client.search_recent_tweets(
                query=query,
                tweet_fields=["id", "text", "author_id", "created_at"],
                expansions="author_id",
                user_fields=["username"],
                max_results=min(100, max_tweets),
                next_token=next_token
            )

            if not response.data:
                break

            # Process users
            user_map = {}
            if response.includes and 'users' in response.includes:
                user_map = {user.id: user.username for user in response.includes['users']}

            # Process tweets
            for tweet in response.data:
                tweets_data.append([
                    tweet.id,
                    user_map.get(tweet.author_id, "unknown"),
                    tweet.text,
                    clean_tweet_text(tweet.text),
                    tweet.created_at
                ])
                max_tweets -= 1

            # Update pagination token
            next_token = response.meta.get('next_token', None)
            if not next_token or max_tweets <= 0:
                break

            # Handle rate limits
            handle_rate_limits(response)

        except tweepy.TooManyRequests as e:
            print(f"Rate limited: {e}")
            reset_time = int(e.response.headers['x-rate-limit-reset'])
            sleep_time = reset_time - time.time() + random.uniform(5, 15)
            print(f"Sleeping {sleep_time:.1f} seconds")
            time.sleep(max(sleep_time, 15))
            retries -= 1

        except Exception as e:
            print(f"Error: {e}")
            break

    return tweets_data

# Main execution
if __name__ == "__main__":
    # Combined search query (more efficient than separate calls)
    search_query = "(#machinelearning OR #artificialintelligence) -is:retweet lang:en"

    # Scrape tweets with enhanced parameters
    tweets = scrape_tweets(search_query, max_tweets=150)

    # Create DataFrame
    df = pd.DataFrame(tweets, columns=[
        'Tweet ID', 'Username', 'Text', 'Cleaned Text', 'Created At'
    ])

    # Save results
    df.to_csv('ai_tweets.csv', index=False)
    print(f"Successfully collected {len(df)} tweets. Data saved to ai_tweets.csv")


Rate limited: 429 Too Many Requests
Too Many Requests
Sleeping 340.2 seconds


KeyboardInterrupt: 

In [None]:
# twitter_scraper.py
import os
import tweepy
import pandas as pd
import re
import time
import random
import nltk
import logging
from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from diskcache import Cache

# Load environment variables
load_dotenv()

# Initialize caching
cache = Cache('twitter_cache')
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

# Configuration (Set these in .env file)
MAX_TWEETS = 150
SEARCH_QUERY = "(#machinelearning OR #artificialintelligence) -is:retweet lang:en"

# Validate credentials
if not BEARER_TOKEN:
    raise ValueError("Missing Twitter Bearer Token in environment variables")

# Initialize Twitter client
client = tweepy.Client(
    bearer_token=BEARER_TOKEN,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True
)

lemmatizer = WordNetLemmatizer()
logging.basicConfig(filename='twitter_scraper.log', level=logging.INFO)

@cache.memoize(expire=3600)
def clean_text(text):
    """Secure text processing pipeline"""
    try:
        text = re.sub(r'http\S+|@\w+|#\w+|[^a-zA-Z\s]', '', text)
        tokens = [lemmatizer.lemmatize(w) for w in word_tokenize(text.lower().strip())]
        return ' '.join([w for w in tokens if w not in stopwords.words('english')])
    except Exception as e:
        logging.error(f"Text cleaning error: {e}")
        return ""

def handle_rate_limits(response):
    """Precision rate limit handler"""
    if response.headers:
        headers = response.headers
        remaining = int(headers.get('x-rate-limit-remaining', 1))
        reset_time = int(headers.get('x-rate-limit-reset', time.time() + 900))

        if remaining <= 1:
            wait_time = max(reset_time - time.time() + 5, 15)
            logging.warning(f"Rate limit reached. Sleeping {wait_time:.1f} seconds")
            time.sleep(wait_time)
            return True
    return False

def validate_tweet(tweet):
    """Data validation layer"""
    required_fields = {'id', 'text', 'author_id', 'created_at'}
    return all(hasattr(tweet, field) for field in required_fields)

def scrape_tweets(query, max_tweets):
    """Enterprise-grade scraper with security controls"""
    tweets = []
    next_token = None

    try:
        while max_tweets > 0:
            response = client.search_recent_tweets(
                query=query,
                tweet_fields=["id", "text", "author_id", "created_at"],
                expansions="author_id",
                user_fields=["username"],
                max_results=min(100, max_tweets),
                next_token=next_token
            )

            if not response.data:
                break

            # Process users securely
            users = {}
            if response.includes and 'users' in response.includes:
                users = {u.id: u.username for u in response.includes['users']}

            # Validate and process tweets
            for tweet in response.data:
                if validate_tweet(tweet):
                    tweets.append([
                        tweet.id,
                        users.get(tweet.author_id, "unknown"),
                        tweet.text,
                        clean_text(tweet.text),
                        tweet.created_at
                    ])
                    max_tweets -= 1

            next_token = response.meta.get('next_token')
            if not next_token or handle_rate_limits(response):
                break

    except tweepy.TweepyException as e:
        logging.error(f"API Error: {e}")
        time.sleep(random.uniform(30, 60))

    except Exception as e:
        logging.critical(f"Unexpected error: {e}")

    return tweets

if __name__ == "__main__":
    try:
        data = scrape_tweets(SEARCH_QUERY, MAX_TWEETS)
        df = pd.DataFrame(data, columns=[
            'Tweet ID', 'Username', 'Text', 'Cleaned Text', 'Created At'
        ])
        df.to_csv('ai_tweets.csv', index=False)
        print(f"Successfully collected {len(df)} tweets")

    except Exception as e:
        logging.critical(f"Fatal error: {e}")
        print("Scraping failed - check logs for details")


TypeError: __init__() got an unexpected keyword argument 'wait_on_rate_limit_notify'

# Mandatory Question

Provide your thoughts on the assignment. What did you find challenging, and what aspects did you enjoy? Your opinion on the provided time to complete the assignment.

# Write your response below
Fill out survey and provide your valuable feedback.

https://docs.google.com/forms/d/e/1FAIpQLSd_ObuA3iNoL7Az_C-2NOfHodfKCfDzHZtGRfIker6WyZqTtA/viewform?usp=dialog