In [None]:
import pandas as pd
import os
import re
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

import boto3
from io import StringIO
from datetime import datetime
from dotenv import load_dotenv


import boto3
import pandas as pd
from io import StringIO


#-------------------------------Changes required-------------------------------------------------------------------------------------------------------------------------

def read_and_combine_csv_s3(bucket_name, folder_path, prefix_filter="moneycontrol_news_"):
    """
    Loads the most recent CSV files from an S3 folder as daily and weekly DataFrames.

    Object keys under `folder_path` that start with `prefix_filter` and end in '.csv'
    are sorted by key in descending order. The first key becomes the daily data and
    the first 7 keys are concatenated into the weekly data.

    Parameters:
    bucket_name (str): The S3 bucket name.
    folder_path (str): The folder path (key prefix) in the S3 bucket containing the CSV files.
    prefix_filter (str): The file-name prefix, appended to `folder_path`, used to filter the CSV files.

    Returns:
    tuple: A tuple containing:
        - daily_df (pd.DataFrame): DataFrame of the first file (daily data).
        - weekly_df (pd.DataFrame): DataFrame combining the first 7 files (weekly data).

    Raises:
    FileNotFoundError: If the folder holds no objects, or none match the prefix filter.
    """
    s3 = boto3.client('s3')
    key_prefix = f"{folder_path}{prefix_filter}"

    # A single list_objects_v2 response is capped at 1,000 keys, so walk every page;
    # otherwise the newest files can be silently missed in a large folder.
    paginator = s3.get_paginator('list_objects_v2')
    found_any_object = False
    matching_keys = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=folder_path):
        for obj in page.get('Contents', []):
            found_any_object = True
            key = obj['Key']
            if key.startswith(key_prefix) and key.endswith('.csv'):
                matching_keys.append(key)

    if not found_any_object:
        raise FileNotFoundError("No files found in the specified folder or bucket.")
    if not matching_keys:
        raise FileNotFoundError(f"No files matching the prefix '{prefix_filter}' found in the specified folder.")

    # NOTE(review): descending key order is treated as "newest first"; this presumably
    # relies on a sortable date embedded in the file names - confirm against the uploader.
    matching_keys.sort(reverse=True)

    def _read_csv_object(key):
        # Download one object and parse its UTF-8 body as CSV
        body = s3.get_object(Bucket=bucket_name, Key=key)['Body'].read()
        return pd.read_csv(StringIO(body.decode('utf-8')))

    # Download each file once; the first frame doubles as the daily data
    weekly_frames = [_read_csv_object(key) for key in matching_keys[:7]]
    daily_df = weekly_frames[0]
    weekly_df = pd.concat(weekly_frames, ignore_index=True)

    return daily_df, weekly_df

# Example usage:
# daily_df, weekly_df = read_and_combine_csv_s3(bucket_name="your_bucket_name", folder_path="your_folder_path/")



def _read_latest_csvs(folder_path, weekly_file_count=7):
    """
    Reads the CSV files in a local folder, newest name first, as daily and weekly data.

    Parameters:
    folder_path (str): Folder containing the CSV files.
    weekly_file_count (int): How many of the first files to combine into the weekly data.

    Returns:
    tuple: A tuple containing:
        - daily_df (pd.DataFrame): DataFrame of the first file (daily data).
        - weekly_df (pd.DataFrame): DataFrame combining the first `weekly_file_count` files.

    Raises:
    FileNotFoundError: If the folder contains no '.csv' files.
    """
    # Sort file names in DESCENDING order so the first entry is the last name alphabetically.
    # NOTE(review): this presumably relies on a sortable date in the file names - confirm.
    csv_files = sorted(
        (name for name in os.listdir(folder_path) if name.endswith('.csv')),
        reverse=True,
    )

    if not csv_files:
        raise FileNotFoundError("No CSV files found in the specified folder.")

    # Parse each file once; the first frame doubles as the daily data
    weekly_frames = [
        pd.read_csv(os.path.join(folder_path, name))
        for name in csv_files[:weekly_file_count]
    ]
    daily_df = weekly_frames[0]
    weekly_df = pd.concat(weekly_frames, ignore_index=True)

    return daily_df, weekly_df


def read_and_combine_csv_n(folder_path=r"D:\NLP_Proj\bucket01"):
    """
    Reads the news CSV files from a folder, sorted by name in descending order, and returns
    the first file as daily data and the first 7 files combined as weekly data.

    Parameters:
    folder_path (str): Folder containing the news CSV files (defaults to the original local path).

    Returns:
    tuple: A tuple containing:
        - daily_df (pd.DataFrame): DataFrame of the first file (daily data).
        - weekly_df (pd.DataFrame): DataFrame combining the first 7 files (weekly data).

    Raises:
    FileNotFoundError: If the folder contains no '.csv' files.
    """
    return _read_latest_csvs(folder_path)


def read_and_combine_csv_comments(folder_path=r"D:\NLP_Proj\bucket02"):
    """
    Reads the comment CSV files from a folder, sorted by name in descending order, and returns
    the first file as daily data and the first 7 files combined as weekly data.

    Parameters:
    folder_path (str): Folder containing the comment CSV files (defaults to the original local path).

    Returns:
    tuple: A tuple containing:
        - daily_df (pd.DataFrame): DataFrame of the first file (daily data).
        - weekly_df (pd.DataFrame): DataFrame combining the first 7 files (weekly data).

    Raises:
    FileNotFoundError: If the folder contains no '.csv' files.
    """
    return _read_latest_csvs(folder_path)

#------------------------------------------------------------------------------------------No changes------------------------------------------------------------------------------------------------
def handle_duplicates(df):
    """
    Reports how many fully duplicated rows a DataFrame holds and returns it without them.

    Parameters:
    df (pd.DataFrame): The DataFrame to de-duplicate.

    Returns:
    pd.DataFrame: The rows of `df` with duplicates dropped (first occurrence kept).
    """
    n_duplicates = df.duplicated().sum()
    print(f"Number of duplicate entries: {n_duplicates}")

    deduplicated = df.drop_duplicates()
    print("Duplicates removed.")
    return deduplicated

def convert_timestamp(df, timestamp_column):
    """
    Parses a timestamp column into pandas datetimes, in place.

    Values that cannot be parsed become NaT (errors='coerce').

    Parameters:
    df (pd.DataFrame): The DataFrame holding the timestamp column; it is modified in place.
    timestamp_column (str): The name of the column to parse.

    Returns:
    pd.DataFrame: The same DataFrame, with the column converted to datetime.
    """
    parsed = pd.to_datetime(df[timestamp_column], errors='coerce')
    df[timestamp_column] = parsed
    print(f"Timestamps converted to datetime format in column: '{timestamp_column}'")
    return df

def combine_text_columns(df, title_column, description_column, new_column_name="combined_text"):
    """
    Joins a title column and a description column into one text column, in place.

    Both columns are cast to str and joined with a single space.

    Parameters:
    df (pd.DataFrame): The DataFrame holding both columns; it is modified in place.
    title_column (str): The name of the title column.
    description_column (str): The name of the description column.
    new_column_name (str): The name of the column that receives the combined text.

    Returns:
    pd.DataFrame: The same DataFrame, with the combined text column added.
    """
    title_text = df[title_column].astype(str)
    description_text = df[description_column].astype(str)
    df[new_column_name] = title_text + " " + description_text
    return df


def handle_missing_values(df):
    """
    Prints a per-column summary of missing values and returns only the complete rows.

    Parameters:
    df (pd.DataFrame): The DataFrame to inspect.

    Returns:
    pd.DataFrame: The rows of `df` that contain no missing values.
    """
    # Per-column counts first, then the grand total
    per_column_missing = df.isnull().sum()
    overall_missing = per_column_missing.sum()

    print("Summary of Missing Values (Before Cleaning):")
    print(per_column_missing)
    print(f"\nTotal missing values: {overall_missing}")

    # Keep only rows where every column is populated
    complete_rows = df.dropna()
    print(f"\nRows with missing values removed. Remaining rows: {len(complete_rows)}")
    return complete_rows

def remove_stop_words_from_column(df, column_name, stop_words_path=r"D:\NLP_Proj\stop_words.txt"):
    """
    Removes stop words from a specified column in a DataFrame using a stop words file.

    The column is cast to str, split on whitespace, filtered against the stop-word set
    and re-joined with single spaces. The DataFrame is modified in place.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the text data.
    column_name (str): The name of the column to process.
    stop_words_path (str): The path to the file containing stop words (one per line).
        Defaults to the original local path.

    Returns:
    pd.DataFrame: The same DataFrame with the specified column cleaned of stop words.

    Raises:
    ValueError: If `column_name` is not a column of `df`.
    """
    # Validate the cheap precondition before touching the file system
    if column_name not in df.columns:
        raise ValueError(f"The DataFrame does not contain a column named '{column_name}'.")

    # Iterate the file lazily; blank lines would otherwise add '' to the set
    with open(stop_words_path, 'r', encoding='utf-8') as file:
        stop_words = {line.strip() for line in file}
    stop_words.discard('')

    # NOTE(review): matching is case-sensitive, so "The" survives a lowercase "the"
    # entry - confirm this is intended for the downstream sentiment step.
    def remove_stop_words(text):
        # str.split() with no argument already trims the ends and collapses whitespace runs
        return ' '.join(word for word in str(text).split() if word not in stop_words)

    df[column_name] = df[column_name].astype(str).apply(remove_stop_words)

    return df



def lemmatize_column(df, column_name, language_model="en_core_web_sm"):
    """
    Lemmatizes the text in the specified column of a DataFrame.

    Each value is cast to str, run through the spaCy pipeline, and replaced by the
    space-joined lemmas of its tokens. The DataFrame is modified in place.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the text data.
    column_name (str): The name of the column to lemmatize.
    language_model (str): The spaCy language model to use (default: "en_core_web_sm").

    Returns:
    pd.DataFrame: The same DataFrame with lemmatized text in the specified column.

    Raises:
    ValueError: If `column_name` is not a column of `df`.
    """
    # Validate first: loading a spaCy model is expensive and pointless for a bad column name
    if column_name not in df.columns:
        raise ValueError(f"The DataFrame does not contain a column named '{column_name}'.")

    nlp = spacy.load(language_model)

    # nlp.pipe processes the texts as a batched stream instead of one pipeline call per row
    texts = df[column_name].astype(str)
    lemmatized = [
        ' '.join(token.lemma_ for token in doc)
        for doc in nlp.pipe(texts)
    ]

    # Re-attach the original index so the assignment lines up row for row
    df[column_name] = pd.Series(lemmatized, index=df.index)

    return df

# Example usage:
# df = lemmatize_column(df, 'comment_content')





def clean_comments(df):
    """
    Normalizes the comment_content column of a DataFrame, in place.

    Each comment is cast to str, then URLs and @mentions are removed, '#' symbols are
    stripped (the tag word itself is kept), remaining special characters are dropped
    while word characters, whitespace and the listed emoji ranges are retained, and
    whitespace is collapsed.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the comments to clean.

    Returns:
    pd.DataFrame: The same DataFrame with a cleaned comment_content column.

    Raises:
    ValueError: If the DataFrame has no 'comment_content' column.
    """
    if 'comment_content' not in df.columns:
        raise ValueError("The DataFrame does not contain a 'comment_content' column.")

    # Ordered (pattern, replacement) steps; the order matters because later steps
    # would otherwise break up URLs and mentions before they can be matched whole.
    substitutions = (
        (r'http\S+', ''),   # URLs
        (r'@\w+', ''),      # mentions
        (r'#', ''),         # hashtag symbols
        # special characters, keeping word chars, whitespace and emojis
        (r'[^\w\s\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF]', ''),
        (r'\s+', ' '),      # runs of whitespace
    )

    def clean_text(text):
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        return text.strip()

    df['comment_content'] = df['comment_content'].astype(str).apply(clean_text)
    return df

def apply_vader_sentiment(df, column_name, threshold=0.25):
    """
    Applies VADER sentiment analysis to the specified column in a DataFrame.

    Adds three columns in place: 'sentiment_scores' (the full polarity_scores result),
    'compound_score' (its 'compound' entry) and 'sentiment' (a label derived from the
    compound score).

    Parameters:
    df (pd.DataFrame): The DataFrame containing the text data.
    column_name (str): The name of the column to analyze.
    threshold (float): Non-negative cut-off for the label. Compound scores above
        `threshold` are 'positive', below `-threshold` are 'negative', anything else
        is 'neutral' (default: 0.25).

    Returns:
    pd.DataFrame: The same DataFrame with additional columns for sentiment scores, compound score, and sentiment label.
    """
    # The lexicon is installed as a zip archive, so the check must name the .zip resource.
    # Looking up 'sentiment/vader_lexicon' never matches and re-triggers the download each call.
    try:
        nltk.data.find('sentiment/vader_lexicon.zip')
    except LookupError:
        nltk.download('vader_lexicon')

    sia = SentimentIntensityAnalyzer()

    def label_for(compound):
        # Map a compound score onto the three-way label
        if compound > threshold:
            return 'positive'
        if compound < -threshold:
            return 'negative'
        return 'neutral'

    df['sentiment_scores'] = df[column_name].apply(sia.polarity_scores)
    df['compound_score'] = df['sentiment_scores'].apply(lambda scores: scores['compound'])
    df['sentiment'] = df['compound_score'].apply(label_for)

    return df









In [154]:
# Load the news CSVs: the first file by descending name as daily data, the first seven combined as weekly data
daily_df, weekly_df = read_and_combine_csv_n()

In [155]:


# Daily news: de-duplicate, drop incomplete rows, parse timestamps, build the
# analysis text from title + description, strip stop words, lemmatize, then score.
cleaned_df = (
    daily_df
    .pipe(handle_duplicates)
    .pipe(handle_missing_values)
    .pipe(convert_timestamp, "timestamp")
    .pipe(combine_text_columns, "title", "description", "text_for_analysis")
    .pipe(remove_stop_words_from_column, "text_for_analysis")
    .pipe(lemmatize_column, "text_for_analysis")
)
cleaned_df_daily = apply_vader_sentiment(cleaned_df, 'text_for_analysis')

Number of duplicate entries: 0
Duplicates removed.
Summary of Missing Values (Before Cleaning):
timestamp      0
title          0
url            0
description    0
dtype: int64

Total missing values: 0

Rows with missing values removed. Remaining rows: 25
Timestamps converted to datetime format in column: 'timestamp'


  df[timestamp_column] = pd.to_datetime(df[timestamp_column], errors='coerce')
  df[timestamp_column] = pd.to_datetime(df[timestamp_column], errors='coerce')
  df[timestamp_column] = pd.to_datetime(df[timestamp_column], errors='coerce')
  df[timestamp_column] = pd.to_datetime(df[timestamp_column], errors='coerce')
  df[timestamp_column] = pd.to_datetime(df[timestamp_column], errors='coerce')
  df[timestamp_column] = pd.to_datetime(df[timestamp_column], errors='coerce')
  df[timestamp_column] = pd.to_datetime(df[timestamp_column], errors='coerce')
  df[timestamp_column] = pd.to_datetime(df[timestamp_column], errors='coerce')
  df[timestamp_column] = pd.to_datetime(df[timestamp_column], errors='coerce')
  df[timestamp_column] = pd.to_datetime(df[timestamp_column], errors='coerce')
  df[timestamp_column] = pd.to_datetime(df[timestamp_column], errors='coerce')
  df[timestamp_column] = pd.to_datetime(df[timestamp_column], errors='coerce')
  df[timestamp_column] = pd.to_datetime(df[timestamp

In [156]:
# Persist the scored daily news frame as CSV (row index omitted)
output_file_path = r"D:\NLP_Proj\cleaned_data_daily.csv"

cleaned_df_daily.to_csv(output_file_path, index=False)
print(f"DataFrame saved as CSV to: {output_file_path}")

DataFrame saved as CSV to: D:\NLP_Proj\cleaned_data_daily.csv


In [None]:
# Weekly news: same cleaning and scoring steps as the daily run, applied to the
# combined weekly frame.
cleaned_df = (
    weekly_df
    .pipe(handle_duplicates)
    .pipe(handle_missing_values)
    .pipe(convert_timestamp, "timestamp")
    .pipe(combine_text_columns, "title", "description", "text_for_analysis")
    .pipe(remove_stop_words_from_column, "text_for_analysis")
    .pipe(lemmatize_column, "text_for_analysis")
)
cleaned_df_weekly = apply_vader_sentiment(cleaned_df, 'text_for_analysis')

In [158]:
# Persist the scored weekly news frame as CSV (row index omitted)
output_file_path = r"D:\NLP_Proj\cleaned_data_weekly.csv"

cleaned_df_weekly.to_csv(output_file_path, index=False)
print(f"DataFrame saved as CSV to: {output_file_path}")

DataFrame saved as CSV to: D:\NLP_Proj\cleaned_data_weekly.csv


# Comments Analysis

In [159]:
# Load the comment CSVs; note this rebinds daily_df / weekly_df, replacing the news frames loaded earlier
daily_df, weekly_df = read_and_combine_csv_comments()


In [160]:
# Daily comments: de-duplicate, drop incomplete rows, parse timestamps, scrub the
# comment text, strip stop words, lemmatize, then score.
cleaned_df = (
    daily_df
    .pipe(handle_duplicates)
    .pipe(handle_missing_values)
    .pipe(convert_timestamp, "timestamp")
    .pipe(clean_comments)
    .pipe(remove_stop_words_from_column, "comment_content")
    .pipe(lemmatize_column, "comment_content")
)
cleaned_df_daily = apply_vader_sentiment(cleaned_df, 'comment_content')

  df[timestamp_column] = pd.to_datetime(df[timestamp_column], errors='coerce')


Number of duplicate entries: 13
Duplicates removed.
Summary of Missing Values (Before Cleaning):
timestamp          0
username           0
comment_content    0
dtype: int64

Total missing values: 0

Rows with missing values removed. Remaining rows: 488
Timestamps converted to datetime format in column: 'timestamp'


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\rutvi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [161]:
# Persist the scored daily comments frame as CSV (row index omitted)
output_file_path = r"D:\NLP_Proj\daily_comments_cleaned_data_nnn.csv"

cleaned_df_daily.to_csv(output_file_path, index=False)
print(f"DataFrame saved as CSV to: {output_file_path}")

DataFrame saved as CSV to: D:\NLP_Proj\daily_comments_cleaned_data_nnn.csv


In [162]:
# Weekly comments: same cleaning and scoring steps as the daily comments run,
# applied to the combined weekly frame.
cleaned_df = (
    weekly_df
    .pipe(handle_duplicates)
    .pipe(handle_missing_values)
    .pipe(convert_timestamp, "timestamp")
    .pipe(clean_comments)
    .pipe(remove_stop_words_from_column, "comment_content")
    .pipe(lemmatize_column, "comment_content")
)
cleaned_df_weekly = apply_vader_sentiment(cleaned_df, 'comment_content')

  df[timestamp_column] = pd.to_datetime(df[timestamp_column], errors='coerce')


Number of duplicate entries: 13
Duplicates removed.
Summary of Missing Values (Before Cleaning):
timestamp          0
username           0
comment_content    0
dtype: int64

Total missing values: 0

Rows with missing values removed. Remaining rows: 488
Timestamps converted to datetime format in column: 'timestamp'


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\rutvi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [163]:
# Persist the scored weekly comments frame as CSV (row index omitted)
output_file_path = r"D:\NLP_Proj\weekly_comments_cleaned_data_nnn.csv"

cleaned_df_weekly.to_csv(output_file_path, index=False)
print(f"DataFrame saved as CSV to: {output_file_path}")

DataFrame saved as CSV to: D:\NLP_Proj\weekly_comments_cleaned_data_nnn.csv
