In [None]:
!pip install pandas textblob tqdm


import pandas as pd
from textblob import TextBlob
from tqdm import tqdm
import nltk


# Fetch NLTK (Natural Language Toolkit) data packages via `nltk.download()`.
# NOTE(review): these are presumably the resources TextBlob relies on; whether the
# polarity scoring used in this notebook needs all three is not shown here — confirm
# before removing any of them.
nltk.download('brown')  # Sample corpus of texts
nltk.download('punkt')  # Tokenizer that divides a text into sentences or words
nltk.download('averaged_perceptron_tagger')  # Model for part-of-speech tagging


In [None]:

# 1. Read the CSV file:
# Pandas' `read_csv()` loads the CSV file into a DataFrame (`df`).
# 'input.csv' is a relative path, so it is resolved against the current working
# directory; `read_csv()` raises FileNotFoundError if the file is not there.
df = pd.read_csv('input.csv')

# 2. Print column names to check if 'Body' exists:
# The 'Body' column holds the text that is scored for sentiment later on, so the
# loaded column names are printed as a quick visual check that it is present.
# (This only prints; it does not stop the script when 'Body' is missing.)
print("Columns in CSV:", df.columns)

# 3. Define the quarantine threshold:
# Rows whose TextBlob polarity is strictly below this value are treated as unsavory.
# Polarity is a float between -1 and 1, where -1 indicates extremely negative,
# 0 indicates neutral, and 1 indicates extremely positive sentiment.
# Change this single value to make the filter stricter or more lenient.
QUARANTINE_THRESHOLD = -0.4

# 4. Select the texts to score:
# If the CSV has no 'Body' column, every row is treated as having no valid text
# (and therefore ends up in quarantine below) instead of raising a KeyError.
if 'Body' in df.columns:
    body_texts = df['Body']
else:
    body_texts = [None] * len(df)

# 5. Score each text with a progress bar:
# `tqdm` provides visual feedback while the loop runs. One boolean per row is
# collected (True = quarantine) rather than concatenating a one-row DataFrame onto
# the result inside the loop, which re-copies the accumulated frame on every
# iteration and becomes very slow on large inputs.
quarantine_flags = []
for text in tqdm(body_texts, total=len(df), desc="Processing rows"):

    # 6. Handle cases where the text is missing or not valid:
    # Missing CSV cells are loaded as NaN (a float), not as a string. Such rows
    # cannot be scored, so they are quarantined for further analysis.
    if not isinstance(text, str):
        quarantine_flags.append(True)
        continue

    # 7. Perform sentiment analysis using TextBlob:
    # `sentiment.polarity` is the float described in step 3.
    sentiment = TextBlob(text).sentiment.polarity

    # 8. Flag the row when the sentiment is unsavory:
    quarantine_flags.append(sentiment < QUARANTINE_THRESHOLD)

# 9. Turn the flags into a boolean mask aligned with `df`:
# `dtype=bool` keeps the mask boolean even when `df` has no rows, so that the
# `~` inversion below is always a logical NOT.
quarantine_mask = pd.Series(quarantine_flags, index=df.index, dtype=bool)

# 10. Split the data into quarantine and clean DataFrames:
# Boolean indexing keeps the original row order, column order and column dtypes.
# `reset_index(drop=True)` renumbers each result from 0, and an input with no
# matching rows yields an empty DataFrame that still has all of `df`'s columns.
quarantine_df = df[quarantine_mask].reset_index(drop=True)
clean_df = df[~quarantine_mask].reset_index(drop=True)


# 11. Report how the rows were split:
# A quick sanity check that every row landed in one of the two DataFrames.
print(f"Number of rows in quarantine: {len(quarantine_df)}")
print(f"Number of rows in clean data: {len(clean_df)}")

# 12. Describe the output files:
# Each entry pairs a DataFrame with its destination file and the confirmation
# message printed once that file has been written. Quarantined rows go to
# 'quarantine.csv'; rows that passed the sentiment check go to 'cleaned.csv'.
output_jobs = (
    (quarantine_df, 'quarantine.csv', "Quarantine file written successfully."),
    (clean_df, 'cleaned.csv', "Cleaned file written successfully."),
)

# 13. Write each DataFrame to its CSV file:
# `index=False` leaves the DataFrame's row index out of the file.
for out_frame, out_path, done_message in output_jobs:
    out_frame.to_csv(out_path, index=False)
    print(done_message)
