In [44]:
import pandas as pd
import os

def read_and_combine_csv(folder_path=r"D:\NLP_Proj\bucket01"):
    """
    Reads all CSV files from the given folder and combines them into a single DataFrame.

    Files are read in sorted file-name order so the row order of the result is
    reproducible across runs and machines (os.listdir gives no ordering guarantee).

    Parameters:
    folder_path (str): The path to the folder containing the CSV files.
        Defaults to the project's original data folder so existing
        zero-argument calls keep working.

    Returns:
    pd.DataFrame: A DataFrame containing all the data from the CSV files, with a
        fresh 0..n-1 index. An empty DataFrame is returned when the folder
        contains no '.csv' files.

    Raises:
    FileNotFoundError: If folder_path does not exist (raised by os.listdir).
    """
    csv_names = sorted(name for name in os.listdir(folder_path) if name.endswith('.csv'))

    # Read every file first and concatenate once: calling pd.concat inside the
    # loop re-copies all previously loaded rows on every iteration.
    frames = [pd.read_csv(os.path.join(folder_path, name)) for name in csv_names]

    # pd.concat raises ValueError on an empty list, so handle "no files" explicitly.
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)


def handle_duplicates(df):
    """
    Reports how many rows are exact duplicates and returns the frame without them.

    Parameters:
    df (pd.DataFrame): The DataFrame to check for duplicates.

    Returns:
    pd.DataFrame: A new DataFrame keeping the first occurrence of each row.
        The original index labels of the surviving rows are kept.
    """
    # duplicated() flags every repeat after the first occurrence, so the number
    # of flagged rows is exactly the number of rows that will be dropped.
    repeat_flags = df.duplicated()
    duplicate_count = repeat_flags.sum()
    print(f"Number of duplicate entries: {duplicate_count}")

    deduplicated = df.drop_duplicates()
    print("Duplicates removed.")
    return deduplicated

def convert_timestamp(df, timestamp_column):
    """
    Converts a timestamp column to a standard datetime format.

    The input DataFrame is left untouched; the conversion is applied to a copy.
    This keeps the calling cell idempotent and avoids SettingWithCopyWarning
    when the input is itself a filtered frame (e.g. the result of dropna()).

    Parameters:
    df (pd.DataFrame): The DataFrame containing the timestamp column.
    timestamp_column (str): The name of the column with timestamp data.

    Returns:
    pd.DataFrame: A copy of df with the timestamp column converted to datetime.
        Values that cannot be parsed become NaT.

    Raises:
    KeyError: If timestamp_column is not a column of df.
    """
    original_values = df[timestamp_column]
    parsed = pd.to_datetime(original_values, errors='coerce')

    # errors='coerce' hides bad input by turning it into NaT; surface the count
    # so unparseable timestamps do not slip through unnoticed.
    coerced_count = int((parsed.isna() & original_values.notna()).sum())
    if coerced_count:
        print(f"Warning: {coerced_count} value(s) in '{timestamp_column}' could not be parsed and were set to NaT.")

    converted = df.copy()
    converted[timestamp_column] = parsed
    print(f"Timestamps converted to datetime format in column: '{timestamp_column}'")
    return converted

def combine_text_columns(df, title_column, description_column, new_column_name="combined_text"):
    """
    Combines the title and description columns into a single text column.

    Rows where both parts are present yield "<title> <description>". A missing
    part (NaN/None) is treated as empty text instead of being stringified, so
    the literal word "nan" never leaks into the combined text. The input
    DataFrame is not modified; the new column is added to a copy.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the title and description columns.
    title_column (str): The name of the title column.
    description_column (str): The name of the description column.
    new_column_name (str): The name of the new column to store the combined text.

    Returns:
    pd.DataFrame: A copy of df with the new combined text column.

    Raises:
    KeyError: If title_column or description_column is not a column of df.
    """
    has_title = df[title_column].notna()
    has_description = df[description_column].notna()

    # astype(str) would render missing values as "nan"/"None"; blank them out.
    title_text = df[title_column].astype(str).where(has_title, "")
    description_text = df[description_column].astype(str).where(has_description, "")

    # Only insert the separating space when there is text on both sides.
    separator = (has_title & has_description).map({True: " ", False: ""})

    combined = df.copy()
    combined[new_column_name] = title_text + separator + description_text
    return combined


def handle_missing_values(df):
    """
    Prints a per-column and overall count of missing values, then drops every
    row that has at least one missing value.

    Parameters:
    df (pd.DataFrame): The DataFrame to check and handle missing values.

    Returns:
    pd.DataFrame: A new DataFrame containing only the fully populated rows.
    """
    # Per-column tally of missing cells, and the grand total across columns.
    missing_summary = df.isna().sum()
    total_missing = missing_summary.sum()

    # One print call, newline-separated, produces the same report layout.
    print(
        "Summary of Missing Values (Before Cleaning):",
        missing_summary,
        f"\nTotal missing values: {total_missing}",
        sep="\n",
    )

    # Keep only rows without any missing value.
    df_cleaned = df.dropna()
    print(f"\nRows with missing values removed. Remaining rows: {len(df_cleaned)}")
    return df_cleaned





In [None]:


# Load every CSV from the data folder into one raw frame.
combined_df = read_and_combine_csv()

# Cleaning pipeline, applied top to bottom: drop exact duplicate rows, drop rows
# with any missing value, parse the "timestamp" column, then build the
# "text_for_analysis" column from "title" and "description".
cleaned_df = (
    combined_df
    .pipe(handle_duplicates)
    .pipe(handle_missing_values)
    .pipe(convert_timestamp, "timestamp")
    .pipe(combine_text_columns, "title", "description", "text_for_analysis")
)


In [46]:
# Import required libraries
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# The analyzer needs the VADER lexicon on disk; download() is a no-op when the
# package is already up to date.
nltk.download('vader_lexicon')

# One analyzer instance is reused for every row.
sia = SentimentIntensityAnalyzer()

# Full VADER score dict per row, computed from the combined text.
cleaned_df['sentiment_scores'] = cleaned_df['text_for_analysis'].apply(sia.polarity_scores)

# Pull the 'compound' entry out of each score dict.
cleaned_df['compound_score'] = cleaned_df['sentiment_scores'].apply(lambda scores: scores['compound'])

# Label by the sign of the compound score; exactly zero is 'neutral'.
# NOTE(review): VADER's authors suggest +/-0.05 cut-offs rather than 0 — confirm the intended thresholds.
cleaned_df['sentiment'] = cleaned_df['compound_score'].apply(
    lambda score: 'negative' if score < 0 else ('positive' if score > 0 else 'neutral')
)

# Preview the text next to its score and label.
cleaned_df[['text_for_analysis', 'compound_score', 'sentiment']].head()


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\rutvi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,text_for_analysis,compound_score,sentiment
0,Trade setup for Wednesday: Top 15 things to kn...,0.7351,positive
1,Technical View: Bearish Belt Hold formation si...,0.7096,positive
2,"Taking Stock: Market snaps two-day gains, clos...",-0.1063,negative
3,"Nifty, Sensex end flat amid lack of fresh trig...",0.8402,positive
4,"Mid-day | Nifty, Sensex pare gains, trade lowe...",0.8591,positive


In [47]:
# Destination of the cleaned, sentiment-annotated dataset.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
output_file_path = r"D:\NLP_Proj\cleaned_data_nnn.csv"

# index=False keeps the DataFrame's row index out of the exported file.
cleaned_df.to_csv(path_or_buf=output_file_path, index=False)
print(f"DataFrame saved as CSV to: {output_file_path}")

DataFrame saved as CSV to: D:\NLP_Proj\cleaned_data_nnn.csv
