# Necessary Steps

In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive so the project
# folder stored on Drive can be read and written like a local directory.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd

In [3]:
cd /content/drive/MyDrive/MediaCloud

/content/drive/MyDrive/MediaCloud


In [4]:
# Capture the current working directory; later cells join every data path onto `pwd`.
pwd = os.getcwd()
pwd

'/content/drive/MyDrive/MediaCloud'

# Sample an Equal Number of Articles for Each Year

In [None]:
# Load the source article list from the project's Data folder.
source_csv_path = os.path.join(pwd, "Data/Annual_Budget_Indian_National_2014-2024.csv")
mediacloud_data = pd.read_csv(source_csv_path)

In [None]:
articles = 150 # Maximum number of articles to keep (by random sampling) for each year

In [None]:
# Parse 'publish_date' into datetimes so the year can be extracted
mediacloud_data['publish_date'] = pd.to_datetime(mediacloud_data['publish_date'])

# Derive the publication year that the data is split on
mediacloud_data['year'] = mediacloud_data['publish_date'].dt.year


output_folder = os.path.join(pwd, "Data/YearWise_Data")
os.makedirs(output_folder, exist_ok=True)

# Write one CSV per year, capped at `articles` rows each
for year in mediacloud_data['year'].unique():
    # Keep only the rows published in the current year
    data_year = mediacloud_data[mediacloud_data['year'] == year]

    # Down-sample only when the year holds more rows than the cap. The threshold
    # is `articles` itself rather than a literal 150: with a literal, raising the
    # cap made sample() fail on years with fewer rows than requested, and lowering
    # it left mid-sized years unsampled.
    if len(data_year) > articles:
        data_sampled = data_year.sample(n=articles, random_state=42)  # fixed seed keeps the draw reproducible
    else:
        data_sampled = data_year

    # Save the (possibly sampled) rows for this year
    output_path = os.path.join(output_folder, f'data_{year}.csv')
    data_sampled.to_csv(output_path, index=False)

    print(f"Saved {len(data_sampled)} articles for the year {year} to {output_path}")

Saved 150 articles for the year 2021 to /content/drive/MyDrive/MediaCloud/Data/YearWise_Data/data_2021.csv
Saved 150 articles for the year 2019 to /content/drive/MyDrive/MediaCloud/Data/YearWise_Data/data_2019.csv
Saved 150 articles for the year 2020 to /content/drive/MyDrive/MediaCloud/Data/YearWise_Data/data_2020.csv
Saved 100 articles for the year 2017 to /content/drive/MyDrive/MediaCloud/Data/YearWise_Data/data_2017.csv
Saved 104 articles for the year 2018 to /content/drive/MyDrive/MediaCloud/Data/YearWise_Data/data_2018.csv
Saved 150 articles for the year 2016 to /content/drive/MyDrive/MediaCloud/Data/YearWise_Data/data_2016.csv
Saved 83 articles for the year 2014 to /content/drive/MyDrive/MediaCloud/Data/YearWise_Data/data_2014.csv
Saved 27 articles for the year 2012 to /content/drive/MyDrive/MediaCloud/Data/YearWise_Data/data_2012.csv
Saved 150 articles for the year 2015 to /content/drive/MyDrive/MediaCloud/Data/YearWise_Data/data_2015.csv
Saved 54 articles for the year 2013 to 



> **Note: for now, the year-wise files have been pruned manually so that only the data for 2020, 2021, 2022, 2023, and 2024 is kept.**



# Parse the data for each individual year

In [5]:
!pip install newspaper3k

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.1.2-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m48.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspaper3k)
  Downloading tinysegmenter-0.3.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Co

In [6]:
import os
import time
import pandas as pd
from newspaper import Article
import nltk
from requests.exceptions import HTTPError, ConnectionError
import urllib3

In [7]:
# Display the project root held in `pwd`; the folder paths built next are joined onto it.
pwd

'/content/drive/MyDrive/MediaCloud'

In [8]:
# Fetch the NLTK 'punkt' tokenizer data (presumably required by article.nlp() below -- confirm).
nltk.download('punkt')

# Year-wise CSVs produced by the sampling step are read from here
source_folder = os.path.join(pwd, 'Data/YearWise_Data')

# Parsed copies are written here; exist_ok=True makes the cell safe to re-run
# and avoids the check-then-create race of a separate os.path.exists() test.
destination_folder = os.path.join(pwd, 'Data/Parsed_YearWise_Data')
os.makedirs(destination_folder, exist_ok=True)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [9]:
# Function to process a single CSV file
def process_csv_file(file_path, destination_folder):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path)

    # Add empty columns for parsed data
    df['parsed_title'] = ''
    df['parsed_authors'] = ''
    df['parsed_publish_date'] = ''
    df['parsed_text'] = ''
    df['parsed_category'] = ''
    df['parsed_summary'] = ''
    df['parsed_keywords'] = ''

    # Record the start time
    start_time = time.time()

    # Loop through each URL in the CSV
    for index, row in df.iterrows():
        url = row['url']  # Assuming the column containing URLs is 'url'

        try:
            # Validate URL format
            if not url.startswith('http'):
                raise ValueError(f"Invalid URL: {url}")

            # Download and parse the article
            article = Article(url, language="en")
            article.download()
            article.parse()

            # Store the parsed data in the respective columns
            df.at[index, 'parsed_title'] = article.title
            df.at[index, 'parsed_authors'] = ', '.join(article.authors)
            df.at[index, 'parsed_publish_date'] = article.publish_date
            df.at[index, 'parsed_text'] = article.text

            # Perform NLP operations
            article.nlp()
            df.at[index, 'parsed_keywords'] = ', '.join(article.keywords)
            df.at[index, 'parsed_summary'] = article.summary

            # Extract category from the URL
            category_parts = url.split('/')
            if len(category_parts) > 3:  # Assuming category is in the fourth segment
                df.at[index, 'parsed_category'] = category_parts[3]
            else:
                df.at[index, 'parsed_category'] = None

        except HTTPError as http_err:
            print(f"HTTP error occurred: {http_err} for URL: {url}")
        except ConnectionError as conn_err:
            print(f"Connection error occurred: {conn_err} for URL: {url}")
        except urllib3.exceptions.MaxRetryError as retry_err:
            print(f"Max retries exceeded for URL: {url}")
        except ValueError as val_err:
            print(f"Value error: {val_err}")
        except Exception as e:
            print(f"Failed to process URL: {url} with error: {e}")

        # Adding a small delay to avoid overwhelming the server with requests
        time.sleep(1)

    # Record the end time
    end_time = time.time()

    # Calculate the total time taken
    execution_time = end_time - start_time
    print(f"Total time taken to process {file_path}: {execution_time} seconds")

    # Get the file name from the original path
    file_name = os.path.basename(file_path)

    # Create the destination file name (e.g., Parsed_2020.csv)
    destination_file_path = os.path.join(destination_folder, f"Parsed_{file_name}")

    # Save the updated DataFrame to the destination folder
    df.to_csv(destination_file_path, index=False)

# Run the parser over every CSV file found in the source folder
csv_file_names = [entry for entry in os.listdir(source_folder) if entry.endswith('.csv')]

for file_name in csv_file_names:
    file_path = os.path.join(source_folder, file_name)
    print(f"Processing {file_path}...")

    # Parse the articles listed in this file and write the enriched copy
    process_csv_file(file_path, destination_folder)

print("All files processed successfully!")


Processing /content/drive/MyDrive/MediaCloud/Data/YearWise_Data/data_2021.csv...
Failed to process URL: http://www.indiasnews.net/news/267672727/16-opposition-parties-to-boycott-presidential-address with error: You must `download()` an article first!
Failed to process URL: http://www.indiasnews.net/news/272063928/states-demand-gst-compensation-for-another-5-years with error: You must `download()` an article first!
Failed to process URL: http://www.businessworld.in/article/Healthcare-attains-foremost-priority-for-the-first-time-in-Union-Budget-says-Gandharv-Roy-Medica-Group-of-Hospitals/09-02-2021-375426 with error: You must `download()` an article first!
Failed to process URL: https://www.businessworld.in/article/Impact-Of-Government-Policies-On-The-Auto-Mobile-Industry/26-03-2021-384969 with error: You must `download()` an article first!
Failed to process URL: http://www.businessworld.in/article/Indian-Shares-Rise-Ahead-Of-Budget/01-02-2021-372198 with error: You must `download()` an 

# Remove Articles Whose Text Could Not Be Parsed

In [10]:
# Display the project root held in `pwd`; the parsed-data folder below is joined onto it.
pwd

'/content/drive/MyDrive/MediaCloud'

In [11]:
# Folder holding the parsed per-year CSVs; the cleaning step rewrites each file in it in place.
parsed_data_folder = os.path.join(pwd, 'Data/Parsed_YearWise_Data')

In [12]:
def clean_csv_file(file_path):
    """Rewrite the CSV at ``file_path`` in place without rows that lack 'parsed_text'.

    A row is dropped when its 'parsed_text' value is read back as missing
    (an empty cell in the CSV). All other rows and columns are kept as read.
    """
    (
        pd.read_csv(file_path)
        .dropna(subset=['parsed_text'])
        .to_csv(file_path, index=False)
    )


# Clean every CSV file found in the parsed-data folder
for file_name in filter(lambda entry: entry.endswith('.csv'), os.listdir(parsed_data_folder)):
    file_path = os.path.join(parsed_data_folder, file_name)
    print(f"Cleaning {file_path}...")

    # Drop the rows whose article text is missing and overwrite the file
    clean_csv_file(file_path)

print("All files cleaned successfully!")

Cleaning /content/drive/MyDrive/MediaCloud/Data/Parsed_YearWise_Data/Parsed_data_2021.csv...
Cleaning /content/drive/MyDrive/MediaCloud/Data/Parsed_YearWise_Data/Parsed_data_2020.csv...
Cleaning /content/drive/MyDrive/MediaCloud/Data/Parsed_YearWise_Data/Parsed_data_2022.csv...
Cleaning /content/drive/MyDrive/MediaCloud/Data/Parsed_YearWise_Data/Parsed_data_2023.csv...
Cleaning /content/drive/MyDrive/MediaCloud/Data/Parsed_YearWise_Data/Parsed_data_2024.csv...
All files cleaned successfully!
