In [13]:
# Task 1: Load the dataset & print schema

import pandas as pd
import os

file_path = 'E:/internship project/data/processed/dataset_with_text_column.xlsx'
df = pd.read_excel(file_path)

print("=" * 80)
print("DATASET SCHEMA INFORMATION")
print("=" * 80)

print(f"\nDataset Shape: {df.shape}")
print(f"  - Rows: {df.shape[0]}")
print(f"  - Columns: {df.shape[1]}")

print("\n" + "=" * 80)
print("Dataset Info:")
print("=" * 80)
df.info()

print("\n" + "=" * 80)
print("First 5 Rows:")
print("=" * 80)
print(df.head())

DATASET SCHEMA INFORMATION

Dataset Shape: (5000, 12)
  - Rows: 5000
  - Columns: 12

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Post_ID           5000 non-null   object        
 1   Post_Date         5000 non-null   datetime64[ns]
 2   Platform          5000 non-null   object        
 3   Hashtag           5000 non-null   object        
 4   Content_Type      5000 non-null   object        
 5   Region            5000 non-null   object        
 6   Views             5000 non-null   int64         
 7   Likes             5000 non-null   int64         
 8   Shares            5000 non-null   int64         
 9   Comments          5000 non-null   int64         
 10  Engagement_Level  5000 non-null   object        
 11  Text              5000 non-null   object        
dtypes: datetime64[ns](1), int64(4), 

In [6]:
# Task 2: Detect and remove duplicates

print("\n" + "=" * 80)
print("TASK 2: DETECT AND REMOVE DUPLICATES")
print("=" * 80)

# Check for duplicates before removal
duplicates_count = df.duplicated().sum()
print(f"\nNumber of duplicate rows (before removal): {duplicates_count}")

# Get duplicate rows by all columns
print(f"\nDuplicate rows by all columns: {df.duplicated().sum()}")

# Get duplicate rows by specific columns (if needed to check specific column duplicates)
print(f"\nDataset shape before removing duplicates: {df.shape}")

# Remove duplicates
df_cleaned = df.drop_duplicates()

print(f"Dataset shape after removing duplicates: {df_cleaned.shape}")

# Count of removed duplicates
removed_count = df.shape[0] - df_cleaned.shape[0]
print(f"\nTotal duplicates removed: {removed_count}")

# Update df to the cleaned version
df = df_cleaned

print("\n" + "=" * 80)
print(f"‚úì Duplicates removed successfully. Dataset now has {df.shape[0]} unique rows.")
print("=" * 80)


TASK 2: DETECT AND REMOVE DUPLICATES

Number of duplicate rows (before removal): 0

Duplicate rows by all columns: 0

Dataset shape before removing duplicates: (5000, 12)
Dataset shape after removing duplicates: (5000, 12)

Total duplicates removed: 0

‚úì Duplicates removed successfully. Dataset now has 5000 unique rows.


In [7]:
# TASK 3: HANDLE MISSING VALUES

print("\n" + "=" * 80)
print("TASK 3: HANDLE MISSING VALUES")
print("=" * 80)

# Generate missing value report
print("\nMissing Value Report:")
print("-" * 80)

missing_report = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': df.isnull().sum().values,
    'Missing_Percentage': (df.isnull().sum().values / len(df) * 100).round(2)
})

print(missing_report.to_string(index=False))

# Summary statistics
total_missing = df.isnull().sum().sum()
total_cells = df.shape[0] * df.shape[1]
missing_percentage = (total_missing / total_cells * 100).round(2)

print(f"\nTotal missing values: {total_missing}")
print(f"Total cells: {total_cells}")
print(f"Overall missing percentage: {missing_percentage}%")

# Handle missing values based on data type
print("\n" + "-" * 80)
print("Handling missing values:")
print("-" * 80)

df_filled = df.copy()

# For numeric columns: fill with median
numeric_cols = df.select_dtypes(include=['number']).columns
for col in numeric_cols:
    if df_filled[col].isnull().sum() > 0:
        median_val = df_filled[col].median()
        df_filled[col].fillna(median_val, inplace=True)
        print(f"  ‚úì {col}: filled {df[col].isnull().sum()} missing values with median ({median_val:.2f})")

# For categorical columns: fill with mode
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if df_filled[col].isnull().sum() > 0:
        mode_val = df_filled[col].mode()[0]
        df_filled[col].fillna(mode_val, inplace=True)
        print(f"  ‚úì {col}: filled {df[col].isnull().sum()} missing values with mode ('{mode_val}')")

# Verify missing values after handling
print("\n" + "-" * 80)
print("Verification after handling missing values:")
print("-" * 80)
remaining_missing = df_filled.isnull().sum().sum()
print(f"Total remaining missing values: {remaining_missing}")

if remaining_missing == 0:
    print("‚úì All missing values have been handled successfully!")
else:
    print(f"‚ö† Still {remaining_missing} missing values remaining")

# Update df to the cleaned version
df = df_filled

print("\n" + "=" * 80)
print(f"‚úì Missing values handled. Dataset now has {df.shape[0]} rows and {df.shape[1]} columns.")
print("=" * 80)


TASK 3: HANDLE MISSING VALUES

Missing Value Report:
--------------------------------------------------------------------------------
          Column  Missing_Count  Missing_Percentage
         Post_ID              0                 0.0
       Post_Date              0                 0.0
        Platform              0                 0.0
         Hashtag              0                 0.0
    Content_Type              0                 0.0
          Region              0                 0.0
           Views              0                 0.0
           Likes              0                 0.0
          Shares              0                 0.0
        Comments              0                 0.0
Engagement_Level              0                 0.0
            Text              0                 0.0

Total missing values: 0
Total cells: 60000
Overall missing percentage: 0.0%

--------------------------------------------------------------------------------
Handling missing values:
-----

In [16]:
# Task 4 (single cleaned_text column)
print("\n" + "=" * 80)
print("TASK 4 (single column): TEXT NORMALIZATION -> cleaned_text")
print("=" * 80)

import re

def normalize_text(text):
    """Normalize text according to requirements:
    - lowercase
    - remove URLs
    - remove punctuations
    - remove numbers
    - remove extra spaces
    """
    if pd.isna(text):
        return text
    text = str(text)
    # lowercase
    text = text.lower()
    # remove URLs
    text = re.sub(r'http\S+|www\.\S+|https\S+', '', text)
    # remove punctuation (keep alphanumeric and spaces)
    text = re.sub(r'[^\w\s]', ' ', text)
    # remove numbers
    text = re.sub(r'\d+', '', text)
    # replace underscores (from \w) and collapse spaces
    text = text.replace('_', ' ')
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Choose a primary raw text column to create `cleaned_text` from
candidates = [col for col in df.columns 
              if df[col].dtype == 'object' 
              and '_normalized' not in col 
              and '_no_stopwords' not in col 
              and '_hashtags' not in col]

if candidates:
    primary_text_col = "Hashtag"
    print(f"Using primary text column: {primary_text_col} -> creating 'cleaned_text'.")
    df['cleaned_text'] = df[primary_text_col].apply(normalize_text)
    print(f"\nDataset shape after adding 'cleaned_text': {df.shape}")
    print("\nSample cleaned_text (first 5 rows):")
    print(df['cleaned_text'].head().to_string(index=False))
else:
    print("No suitable text column found to create 'cleaned_text'.")

print("\n" + "=" * 80)


TASK 4 (single column): TEXT NORMALIZATION -> cleaned_text
Using primary text column: Hashtag -> creating 'cleaned_text'.

Dataset shape after adding 'cleaned_text': (5000, 14)

Sample cleaned_text (first 5 rows):
challenge
education
challenge
education
    dance



In [15]:
# Task 5: Remove stopwords
print("\n" + "=" * 80)
print("TASK 5: REMOVE STOPWORDS")
print("=" * 80)

# Import nltk stopwords
import nltk
from nltk.corpus import stopwords
import pandas as pd

# Download stopwords if not already present
try:
    stopwords.words('english')
except LookupError:
    print("Downloading NLTK stopwords...")
    nltk.download('stopwords')

# Get English stopwords
stop_words = set(stopwords.words('english'))

# >>> IMPORTANT CHANGE: Keep the word "not"
if "not" in stop_words:
    stop_words.remove("not")

print(f"\nLoaded {len(stop_words)} English stopwords (excluding 'not')")
print(f"Sample stopwords: {list(stop_words)[:10]}")

# Define function to remove stopwords
def remove_stopwords(text):
    """Remove stopwords from text but keep 'not'."""
    if pd.isna(text) or text == '':
        return text
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Apply to Text column
if 'Text' in df.columns:
    print("\nApplying stopword removal to 'Text'...")
    df['text_no_stopwords'] = df['Text'].apply(remove_stopwords)
    print(f"‚úì Created 'text_no_stopwords' column")
    print(f"Dataset shape: {df.shape}")
    
    # Show comparison
    print("\n" + "-" * 80)
    print("Sample comparison (Text vs text_no_stopwords):")
    print("-" * 80)
    
    sample_df = df[['Text', 'text_no_stopwords']].head(5)
    for idx, row in sample_df.iterrows():
        print(f"\nRow {idx}:")
        print(f"  cleaned_text: {row['Text'][:80]}...")
        print(f"  text_no_stopwords: {row['text_no_stopwords'][:80]}...")
    
    # Statistics
    print("\n" + "-" * 80)
    print("Statistics:")
    print("-" * 80)
    empty_count = (df['text_no_stopwords'] == '').sum()
    non_empty_count = len(df) - empty_count
    print(f"Non-empty text_no_stopwords: {non_empty_count} / {len(df)}")
    print(f"Empty text_no_stopwords: {empty_count} / {len(df)}")
    
else:
    print("‚ö† 'Text' column not found. Please run Task 4 first.")

print("\n" + "=" * 80)
print("‚úì Task 5 completed: Stopwords removed successfully (NOT preserved)!")
print("=" * 80)



TASK 5: REMOVE STOPWORDS

Loaded 197 English stopwords (excluding 'not')
Sample stopwords: ["she's", 'mustn', "isn't", "it's", 'where', 'did', "should've", 'y', 'hadn', 'whom']

Applying stopword removal to 'Text'...
‚úì Created 'text_no_stopwords' column
Dataset shape: (5000, 13)

--------------------------------------------------------------------------------
Sample comparison (Text vs text_no_stopwords):
--------------------------------------------------------------------------------

Row 0:
  cleaned_text: I regret choosing this....
  text_no_stopwords: regret choosing this....

Row 1:
  cleaned_text: I regret choosing this....
  text_no_stopwords: regret choosing this....

Row 2:
  cleaned_text: It failed to meet my expectations....
  text_no_stopwords: failed meet expectations....

Row 3:
  cleaned_text: I expected more from this product....
  text_no_stopwords: expected product....

Row 4:
  cleaned_text: This product is amazing!...
  text_no_stopwords: product amazing!...

---

In [10]:
# Task 6: Extract hashtags
print("\n" + "=" * 80)
print("TASK 6: EXTRACT HASHTAGS")
print("=" * 80)

import re

# Define function to extract hashtags
def extract_hashtags(text):
    """Extract all hashtags from text."""
    if pd.isna(text):
        return []
    text = str(text)
    # Match hashtag pattern: # followed by word characters
    hashtags = re.findall(r'#\w+', text)
    return hashtags

# Find the primary text column (raw text, not processed)
candidates = [col for col in df.columns 
              if df[col].dtype == 'object' 
              and '_normalized' not in col 
              and '_no_stopwords' not in col 
              and 'cleaned' not in col]

if candidates:
    raw_text_col = "Hashtag"
    print(f"Extracting hashtags from column: '{raw_text_col}'")
    df['hashtags'] = df[raw_text_col].apply(extract_hashtags)
    print(f"‚úì Created 'hashtags' column")
    print(f"Dataset shape: {df.shape}")
    
    # Statistics
    print("\n" + "-" * 80)
    print("Hashtag Extraction Statistics:")
    print("-" * 80)
    
    total_rows = len(df)
    rows_with_hashtags = (df['hashtags'].apply(len) > 0).sum()
    rows_without_hashtags = total_rows - rows_with_hashtags
    total_hashtags = sum(len(h) for h in df['hashtags'])
    avg_hashtags_per_row = total_hashtags / total_rows if total_rows > 0 else 0
    
    print(f"Total rows: {total_rows}")
    print(f"Rows with hashtags: {rows_with_hashtags} ({rows_with_hashtags/total_rows*100:.2f}%)")
    print(f"Rows without hashtags: {rows_without_hashtags} ({rows_without_hashtags/total_rows*100:.2f}%)")
    print(f"Total hashtags extracted: {total_hashtags}")
    print(f"Average hashtags per row: {avg_hashtags_per_row:.2f}")
    
    # Show samples
    print("\n" + "-" * 80)
    print("Sample hashtags (first 10 rows with hashtags):")
    print("-" * 80)
    
    rows_with_tags = df[df['hashtags'].apply(len) > 0].head(10)
    for idx, (i, row) in enumerate(rows_with_tags.iterrows(), 1):
        hashtag_list = ', '.join(row['hashtags'][:5])  # Show max 5 hashtags
        if len(row['hashtags']) > 5:
            hashtag_list += f", ... (+{len(row['hashtags']) - 5} more)"
        print(f"{idx}. {hashtag_list}")
    
else:
    print("‚ö† No suitable raw text column found to extract hashtags.")

print("\n" + "=" * 80)
print("‚úì Task 6 completed: Hashtags extracted successfully!")
print("=" * 80)


TASK 6: EXTRACT HASHTAGS
Extracting hashtags from column: 'Hashtag'
‚úì Created 'hashtags' column
Dataset shape: (5000, 15)

--------------------------------------------------------------------------------
Hashtag Extraction Statistics:
--------------------------------------------------------------------------------
Total rows: 5000
Rows with hashtags: 5000 (100.00%)
Rows without hashtags: 0 (0.00%)
Total hashtags extracted: 5000
Average hashtags per row: 1.00

--------------------------------------------------------------------------------
Sample hashtags (first 10 rows with hashtags):
--------------------------------------------------------------------------------
1. #Challenge
2. #Education
3. #Challenge
4. #Education
5. #Dance
6. #Challenge
7. #Comedy
8. #Gaming
9. #Education
10. #Gaming

‚úì Task 6 completed: Hashtags extracted successfully!


In [11]:
# Task 7: Calculate sentiment scores
print("\n" + "=" * 80)
print("TASK 7: CALCULATE SENTIMENT SCORES (POLARITY & SUBJECTIVITY)")
print("=" * 80)

# Try to import TextBlob, install if missing
try:
    from textblob import TextBlob
except Exception:
    import sys, subprocess
    print("TextBlob not found. Installing textblob...")
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'textblob'])
    from textblob import TextBlob
    # Ensure punkt tokenizer available
    try:
        import nltk
        nltk.data.find('tokenizers/punkt')
    except Exception:
        import nltk
        print('Downloading NLTK punkt tokenizer...')
        nltk.download('punkt')

# Choose input column: prefer 'text_no_stopwords', then 'cleaned_text', then common raw text names
candidates = ['Text', 'cleaned_text','Engagement_Level', 'Platform', 'Hashtag', 'Content_Type', 'Region', 'Views', 'Likes', 'Shares']
input_col = 'Text'
for c in candidates:
    if c in df.columns:
        input_col = c
        break

if input_col is None:
    raise ValueError('No suitable text column found for sentiment calculation. Please create `cleaned_text` or `text_no_stopwords` first.')

print(f"Using '{input_col}' as input for sentiment calculation")

# Function to compute polarity & subjectivity
def compute_sentiment(text):
    if pd.isna(text) or str(text).strip() == '':
        return (None, None)
    try:
        tb = TextBlob(str(text))
        try:
            tb = tb.translate(to="en")
        except:
            pass
        return tb.sentiment.polarity, tb.sentiment.subjectivity
    except:
        return (None, None)

# Apply to the dataframe
sentiments = df[input_col].fillna('').apply(lambda t: compute_sentiment(t))

df['polarity'] = sentiments.apply(lambda x: x[0])
df['subjectivity'] = sentiments.apply(lambda x: x[1])

# Quick stats and sample
print(f"\nAdded columns: 'polarity' and 'subjectivity' to DataFrame. Dataset shape: {df.shape}")

print("\nSample sentiment scores (first 10 rows):")
print(df[[input_col, 'polarity', 'subjectivity']].head(10).to_string(index=False))

# Summary statistics
valid_polarity = df['polarity'].dropna()
valid_subjectivity = df['subjectivity'].dropna()
print("\n" + "-" * 80)
print("Sentiment summary:")
print("-" * 80)
if len(valid_polarity) > 0:
    print(f"Polarity: mean={valid_polarity.mean():.4f}, min={valid_polarity.min():.4f}, max={valid_polarity.max():.4f}")
else:
    print("No valid polarity values computed.")

if len(valid_subjectivity) > 0:
    print(f"Subjectivity: mean={valid_subjectivity.mean():.4f}, min={valid_subjectivity.min():.4f}, max={valid_subjectivity.max():.4f}")
else:
    print("No valid subjectivity values computed.")

print("\n" + "=" * 80)
print("‚úì Task 7 completed: Sentiment scores calculated.")
print("=" * 80)


TASK 7: CALCULATE SENTIMENT SCORES (POLARITY & SUBJECTIVITY)
Using 'Text' as input for sentiment calculation

Added columns: 'polarity' and 'subjectivity' to DataFrame. Dataset shape: (5000, 17)

Sample sentiment scores (first 10 rows):
                               Text  polarity  subjectivity
            I regret choosing this.      0.00          0.00
            I regret choosing this.      0.00          0.00
 It failed to meet my expectations.     -0.50          0.30
 I expected more from this product.      0.20          0.45
           This product is amazing!      0.75          0.90
 It failed to meet my expectations.     -0.50          0.30
              Such a waste of time.     -0.10          0.25
Would not recommend this to others.      0.00          0.00
            I regret choosing this.      0.00          0.00
 This could definitely be improved.      0.00          0.50

--------------------------------------------------------------------------------
Sentiment summary:
-

In [13]:
# Task 8: Export cleaned dataset
print("\n" + "=" * 80)
print("TASK 8: EXPORT CLEANED DATASET")
print("=" * 80)

import os

# Create output directory if it doesn't exist
output_dir = 'E:/internship project/data/processed'
os.makedirs(output_dir, exist_ok=True)

# Define output file path
output_file = os.path.join(output_dir, 'clean_data.csv')

# Export to CSV
print(f"\nExporting cleaned DataFrame to: {output_file}")
df.to_csv(output_file, index=False, encoding='utf-8')

# Verify export
if os.path.exists(output_file):
    file_size = os.path.getsize(output_file) / (1024 * 1024)  # Convert to MB
    print(f"‚úì Export successful!")
    print(f"  - File path: {output_file}")
    print(f"  - File size: {file_size:.2f} MB")
    print(f"  - Rows: {len(df)}")
    print(f"  - Columns: {len(df.columns)}")
    print(f"\n  Columns in cleaned dataset:")
    for i, col in enumerate(df.columns, 1):
        print(f"    {i}. {col}")
else:
    print(f"‚ö† Export failed. File not found at {output_file}")

print("\n" + "=" * 80)
print("‚úì Task 8 completed: Cleaned dataset exported successfully!")
print("=" * 80)


TASK 8: EXPORT CLEANED DATASET

Exporting cleaned DataFrame to: E:/internship project/data/processed\clean_data.csv
‚úì Export successful!
  - File path: E:/internship project/data/processed\clean_data.csv
  - File size: 0.79 MB
  - Rows: 5000
  - Columns: 17

  Columns in cleaned dataset:
    1. Post_ID
    2. Post_Date
    3. Platform
    4. Hashtag
    5. Content_Type
    6. Region
    7. Views
    8. Likes
    9. Shares
    10. Comments
    11. Engagement_Level
    12. Text
    13. cleaned_text
    14. text_no_stopwords
    15. hashtags
    16. polarity
    17. subjectivity

‚úì Task 8 completed: Cleaned dataset exported successfully!


In [14]:
# Task 9: Generate unigrams/bigrams/trigrams
print("\n" + "=" * 80)
print("TASK 9: GENERATE N-GRAMS (UNIGRAMS, BIGRAMS, TRIGRAMS)")
print("=" * 80)

from collections import Counter
from nltk.util import ngrams

# Define sentiment thresholds
# Positive: polarity > 0.1, Negative: polarity < -0.1, Neutral: in between
positive_threshold = 0.1
negative_threshold = -0.1

# Categorize rows by sentiment
df['sentiment_category'] = df['polarity'].apply(
    lambda x: 'positive' if x > positive_threshold 
              else ('negative' if x < negative_threshold else 'neutral')
)

print(f"\nSentiment distribution:")
print(f"  Positive (polarity > {positive_threshold}): {(df['sentiment_category'] == 'positive').sum()}")
print(f"  Negative (polarity < {negative_threshold}): {(df['sentiment_category'] == 'negative').sum()}")
print(f"  Neutral: {(df['sentiment_category'] == 'neutral').sum()}")

# Function to generate n-grams
def generate_ngrams(text, n):
    """Generate n-grams from text."""
    if pd.isna(text) or str(text).strip() == '':
        return []
    words = str(text).split()
    return list(ngrams(words, n))

# Generate n-grams for positive and negative sentiments
print("\n" + "-" * 80)
print("Generating N-grams for Positive Sentiment:")
print("-" * 80)

positive_texts = df[df['sentiment_category'] == 'positive']['text_no_stopwords']
positive_unigrams = Counter()
positive_bigrams = Counter()
positive_trigrams = Counter()

for text in positive_texts:
    positive_unigrams.update(generate_ngrams(text, 1))
    positive_bigrams.update(generate_ngrams(text, 2))
    positive_trigrams.update(generate_ngrams(text, 3))

print(f"Top 10 Unigrams (Positive):")
for gram, freq in positive_unigrams.most_common(10):
    print(f"  {gram[0]}: {freq}")

print(f"\nTop 10 Bigrams (Positive):")
for gram, freq in positive_bigrams.most_common(10):
    print(f"  {' '.join(gram)}: {freq}")

print(f"\nTop 10 Trigrams (Positive):")
for gram, freq in positive_trigrams.most_common(10):
    print(f"  {' '.join(gram)}: {freq}")

# Generate n-grams for negative sentiment
print("\n" + "-" * 80)
print("Generating N-grams for Negative Sentiment:")
print("-" * 80)

negative_texts = df[df['sentiment_category'] == 'negative']['text_no_stopwords']
negative_unigrams = Counter()
negative_bigrams = Counter()
negative_trigrams = Counter()

for text in negative_texts:
    negative_unigrams.update(generate_ngrams(text, 1))
    negative_bigrams.update(generate_ngrams(text, 2))
    negative_trigrams.update(generate_ngrams(text, 3))

print(f"Top 10 Unigrams (Negative):")
for gram, freq in negative_unigrams.most_common(10):
    print(f"  {gram[0]}: {freq}")

print(f"\nTop 10 Bigrams (Negative):")
for gram, freq in negative_bigrams.most_common(10):
    print(f"  {' '.join(gram)}: {freq}")

print(f"\nTop 10 Trigrams (Negative):")
for gram, freq in negative_trigrams.most_common(10):
    print(f"  {' '.join(gram)}: {freq}")

# Create frequency DataFrames for export
pos_unigram_df = pd.DataFrame(
    [(gram[0], freq) for gram, freq in positive_unigrams.most_common(50)],
    columns=['unigram', 'frequency']
)
pos_bigram_df = pd.DataFrame(
    [(' '.join(gram), freq) for gram, freq in positive_bigrams.most_common(50)],
    columns=['bigram', 'frequency']
)
pos_trigram_df = pd.DataFrame(
    [(' '.join(gram), freq) for gram, freq in positive_trigrams.most_common(50)],
    columns=['trigram', 'frequency']
)

neg_unigram_df = pd.DataFrame(
    [(gram[0], freq) for gram, freq in negative_unigrams.most_common(50)],
    columns=['unigram', 'frequency']
)
neg_bigram_df = pd.DataFrame(
    [(' '.join(gram), freq) for gram, freq in negative_bigrams.most_common(50)],
    columns=['bigram', 'frequency']
)
neg_trigram_df = pd.DataFrame(
    [(' '.join(gram), freq) for gram, freq in negative_trigrams.most_common(50)],
    columns=['trigram', 'frequency']
)

print("\n" + "-" * 80)
print("Summary:")
print("-" * 80)
print(f"Positive unigrams: {len(positive_unigrams)}")
print(f"Positive bigrams: {len(positive_bigrams)}")
print(f"Positive trigrams: {len(positive_trigrams)}")
print(f"Negative unigrams: {len(negative_unigrams)}")
print(f"Negative bigrams: {len(negative_bigrams)}")
print(f"Negative trigrams: {len(negative_trigrams)}")

print("\n" + "=" * 80)
print("‚úì Task 9 completed: N-grams generated and analyzed!")
print("=" * 80)


TASK 9: GENERATE N-GRAMS (UNIGRAMS, BIGRAMS, TRIGRAMS)

Sentiment distribution:
  Positive (polarity > 0.1): 3037
  Negative (polarity < -0.1): 985
  Neutral: 978

--------------------------------------------------------------------------------
Generating N-grams for Positive Sentiment:
--------------------------------------------------------------------------------
Top 10 Unigrams (Positive):
  experience: 543
  could: 275
  better.: 275
  really: 273
  loved: 273
  using: 273
  this!: 273
  Absolutely: 268
  worth: 268
  it!: 268

Top 10 Bigrams (Positive):
  experience could: 275
  could better.: 275
  really loved: 273
  loved using: 273
  using this!: 273
  Absolutely worth: 268
  worth it!: 268
  Fantastic experience: 268
  experience overall.: 268
  Better thought.: 260

Top 10 Trigrams (Positive):
  experience could better.: 275
  really loved using: 273
  loved using this!: 273
  Absolutely worth it!: 268
  Fantastic experience overall.: 268
  worked perfectly me.: 254
  One 

In [15]:
# Task 10: TF-IDF Analysis (Overall Corpus)
print("\n" + "=" * 80)
print("TASK 10: TF-IDF ANALYSIS (OVERALL TEXT)")
print("=" * 80)

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Detect text column
text_col = None
candidates = ['cleaned_text', 'text', 'tweet', 'content', 'post', 'message', 'caption', 'body']
for col in candidates:
    if col in df.columns:
        text_col = col
        break

if text_col is None:
    raise ValueError("No suitable text column found.")

print(f"Using '{text_col}' column")

# Prepare text data
texts = df[text_col].dropna().astype(str).tolist()

if len(texts) == 0:
    raise ValueError("Text column is empty.")

# TF-IDF Vectorizer
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.8,
    max_features=1000
)

X = vectorizer.fit_transform(texts)
feature_names = np.array(vectorizer.get_feature_names_out())

# Mean TF-IDF
mean_tfidf = np.asarray(X.mean(axis=0)).flatten()

# Top keywords
top_n = 20
top_idx = mean_tfidf.argsort()[::-1][:top_n]

print("\n" + "-" * 80)
print("Top TF-IDF Keywords (Overall Dataset):")
print("-" * 80)
for i in top_idx:
    print(f"{feature_names[i]} : {mean_tfidf[i]:.4f}")

print("\n" + "=" * 80)
print("‚úì Task 10 completed successfully (overall TF-IDF).")
print("=" * 80)



TASK 10: TF-IDF ANALYSIS (OVERALL TEXT)
Using 'cleaned_text' column

--------------------------------------------------------------------------------
Top TF-IDF Keywords (Overall Dataset):
--------------------------------------------------------------------------------
fitness : 0.1072
education : 0.1050
challenge : 0.1014
comedy : 0.1010
dance : 0.0992
music : 0.0986
tech : 0.0982
fashion : 0.0974
viral : 0.0962
gaming : 0.0958

‚úì Task 10 completed successfully (overall TF-IDF).


In [16]:
# Task 11: Emoji extraction + sentiment mapping
print("\n" + "=" * 80)
print("TASK 11: EMOJI EXTRACTION + SENTIMENT MAPPING")
print("=" * 80)

import re

# Try to import emoji library, install if missing
try:
    import emoji
except Exception:
    import sys, subprocess
    print("emoji library not found. Installing emoji...")
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'emoji'])
    import emoji

# Find raw text column (original unprocessed text with emojis)
raw_text_col = None
candidates = [col for col in df.columns 
              if df[col].dtype == 'object' 
              and 'cleaned' not in col 
              and '_no_stopwords' not in col 
              and 'hashtags' not in col]

if candidates:
    raw_text_col = candidates[0]
    print(f"Using '{raw_text_col}' column for emoji extraction")
else:
    raise ValueError('No suitable raw text column found for emoji extraction.')

# Function to extract emojis from text
def extract_emojis(text):
    """Extract all emojis from text."""
    if pd.isna(text):
        return []
    text = str(text)
    return [char for char in text if char in emoji.EMOJI_DATA]

# Extract emojis
print("\nExtracting emojis from text...")
df['emojis'] = df[raw_text_col].apply(extract_emojis)

# Build sentiment mapping
emoji_sentiment_map = {}
emoji_counts = {}

for idx, row in df.iterrows():
    emojis_in_text = row['emojis']
    polarity = row['polarity']
    
    for em in emojis_in_text:
        if em not in emoji_sentiment_map:
            emoji_sentiment_map[em] = []
            emoji_counts[em] = 0
        emoji_sentiment_map[em].append(polarity)
        emoji_counts[em] += 1

# Build summary list
emoji_sentiment_summary = []
for em, sentiments in emoji_sentiment_map.items():
    avg_sentiment = np.mean(sentiments)
    std_sentiment = np.std(sentiments)
    count = len(sentiments)

    try:
        emoji_desc = emoji.demojize(em)
    except:
        emoji_desc = "unknown"

    emoji_sentiment_summary.append({
        'emoji': em,
        'description': emoji_desc,
        'avg_polarity': avg_sentiment,
        'std_polarity': std_sentiment,
        'count': count
    })

# Convert to DataFrame with safe checks
emoji_df = pd.DataFrame(emoji_sentiment_summary)

# FIX: handle empty dataframe safely
if emoji_df.empty or 'count' not in emoji_df.columns:
    print("\n‚ö† No emojis detected ‚Äî emoji sentiment analysis skipped.")
    print("Total unique emojis found: 0")
    print("Total emoji occurrences: 0")
    emoji_df = pd.DataFrame()   # keep empty
else:
    emoji_df = emoji_df.sort_values('count', ascending=False)
    print(f"\nTotal unique emojis found: {len(emoji_df)}")
    print(f"Total emoji occurrences: {emoji_df['count'].sum()}")

# Display top emojis
print("\n" + "-" * 80)
print("Top 20 Emojis by Frequency + Sentiment Score:")
print("-" * 80)

if not emoji_df.empty:
    display_df = emoji_df.head(20)[['emoji', 'description', 'avg_polarity', 'count']]
    for idx, row in display_df.iterrows():
        emoji_char = row['emoji']
        desc = row['description']
        avg_pol = row['avg_polarity']
        count = row['count']
        sentiment_label = (
            "Positive" if avg_pol > 0.1 
            else ("Negative" if avg_pol < -0.1 else "Neutral")
        )
        print(f"{emoji_char} {desc}: avg_polarity={avg_pol:.4f} ({sentiment_label}), count={count}")
else:
    print("No emojis found in the dataset.")

# Statistics
print("\n" + "-" * 80)
print("Emoji Sentiment Statistics:")
print("-" * 80)

if not emoji_df.empty:
    print(f"Average polarity of emojis: {emoji_df['avg_polarity'].mean():.4f}")
    print(f"Most positive emoji avg_polarity: {emoji_df['avg_polarity'].max():.4f}")
    print(f"Most negative emoji avg_polarity: {emoji_df['avg_polarity'].min():.4f}")

    positive_emojis = len(emoji_df[emoji_df['avg_polarity'] > 0.1])
    negative_emojis = len(emoji_df[emoji_df['avg_polarity'] < -0.1])
    neutral_emojis = len(emoji_df[(emoji_df['avg_polarity'] >= -0.1) & (emoji_df['avg_polarity'] <= 0.1)])

    print(f"Positive emojis: {positive_emojis}")
    print(f"Negative emojis: {negative_emojis}")
    print(f"Neutral emojis: {neutral_emojis}")
else:
    print("No emojis found.")

print("\n" + "=" * 80)
print("‚úì Task 11 completed: Emojis extracted and sentiment mapped!")
print("=" * 80)



TASK 11: EMOJI EXTRACTION + SENTIMENT MAPPING
Using 'Post_ID' column for emoji extraction

Extracting emojis from text...

‚ö† No emojis detected ‚Äî emoji sentiment analysis skipped.
Total unique emojis found: 0
Total emoji occurrences: 0

--------------------------------------------------------------------------------
Top 20 Emojis by Frequency + Sentiment Score:
--------------------------------------------------------------------------------
No emojis found in the dataset.

--------------------------------------------------------------------------------
Emoji Sentiment Statistics:
--------------------------------------------------------------------------------
No emojis found.

‚úì Task 11 completed: Emojis extracted and sentiment mapped!


In [17]:
# Task 12: Hashtag frequency analysis
print("\n" + "=" * 80)
print("TASK 12: HASHTAG FREQUENCY ANALYSIS")
print("=" * 80)

from collections import Counter

# Extract and flatten all hashtags from the hastags column 
all_hashtags = []
for hashtag_list in df['hashtags']:
    if isinstance(hashtag_list, list):
        all_hashtags.extend(hashtag_list)

# count hastags frequencies
hashtag_counter = Counter(all_hashtags)

# create summary dataframe 
hashtag_freq_df = pd.DataFrame(
    list(hashtag_counter.most_common()),
    columns=['hashtag', 'count']
)

# add percentage column 
total_hashtags = hashtag_freq_df['count'].sum()
hashtag_freq_df['percentage'] = (hashtag_freq_df['count'] / total_hashtags * 100).round(2)

print(f"\nTotal unique hashtags: {len(hashtag_freq_df)}")
print(f"Total hashtag occurrences: {total_hashtags}")

# Display top 30 hashtags
print("\n" + "-" * 80)
print("Top 30 Hashtags by Frequency:")
print("-" * 80)

top_30 = hashtag_freq_df.head(30)
for idx, row in top_30.iterrows():
    hashtag = row['hashtag']
    count = row['count']
    percentage = row['percentage']
    print(f"{hashtag}: {count} ({percentage}%)")

# statistics
print("\n" + "-" * 80)
print("Hashtag Frequency Statistics:")
print("-" * 80)
print(f"Most frequent hashtag: {hashtag_freq_df.iloc[0]['hashtag']} ({hashtag_freq_df.iloc[0]['count']} occurrences)")
print(f"Average hashtag frequency: {hashtag_freq_df['count'].mean():.2f}")
print(f"Median hashtag frequency: {hashtag_freq_df['count'].median():.2f}")
print(f"Max frequency: {hashtag_freq_df['count'].max()}")
print(f"Min frequency: {hashtag_freq_df['count'].min()}")

# Trending analysis: hashtags appearing more than once
trending_threshold = 2
trending_hashtags = hashtag_freq_df[hashtag_freq_df['count'] >= trending_threshold]
print(f"\nHashtags with 2+ occurrences (trending): {len(trending_hashtags)}")
print(f"Hashtags appearing only once: {len(hashtag_freq_df[hashtag_freq_df['count'] == 1])}")

# top hastags contributions   
top_10_contribution = top_30['count'].sum() / total_hashtags * 100
print(f"\nTop 10 hashtags contribute: {top_10_contribution:.2f}% of all hashtag occurrences")

print("\n" + "=" * 80)
print("‚úì Task 12 completed: Hashtag frequency analysis completed!")
print("=" * 80)


TASK 12: HASHTAG FREQUENCY ANALYSIS

Total unique hashtags: 10
Total hashtag occurrences: 5000

--------------------------------------------------------------------------------
Top 30 Hashtags by Frequency:
--------------------------------------------------------------------------------
#Fitness: 536 (10.72%)
#Education: 525 (10.5%)
#Challenge: 507 (10.14%)
#Comedy: 505 (10.1%)
#Dance: 496 (9.92%)
#Music: 493 (9.86%)
#Tech: 491 (9.82%)
#Fashion: 487 (9.74%)
#Viral: 481 (9.62%)
#Gaming: 479 (9.58%)

--------------------------------------------------------------------------------
Hashtag Frequency Statistics:
--------------------------------------------------------------------------------
Most frequent hashtag: #Fitness (536 occurrences)
Average hashtag frequency: 500.00
Median hashtag frequency: 494.50
Max frequency: 536
Min frequency: 479

Hashtags with 2+ occurrences (trending): 10
Hashtags appearing only once: 0

Top 10 hashtags contribute: 100.00% of all hashtag occurrences

‚úì Ta

In [18]:
# =============================================================================
# Task 13: Hashtag Sentiment Analysis
# =============================================================================
print("\n" + "=" * 80)
print("TASK 13: HASHTAG SENTIMENT ANALYSIS")
print("=" * 80)

import os
import pandas as pd

# -----------------------------
# Safety checks
# -----------------------------
if 'hashtags' not in df.columns:
    raise ValueError("Column 'hashtags' not found. Run Task 6 first.")
if 'polarity' not in df.columns:
    raise ValueError("Column 'polarity' not found. Run Task 7 first.")

# -----------------------------
# Explode hashtags
# -----------------------------
expanded = df[['hashtags', 'polarity']].explode('hashtags')
expanded = expanded.dropna(subset=['hashtags'])

# Normalize hashtags
expanded['hashtag_norm'] = expanded['hashtags'].astype(str).str.lower().str.strip()

# -----------------------------
# Aggregate sentiment per hastag
# -----------------------------
hashtag_sentiment = (
    expanded
    .groupby('hashtag_norm')
    .agg(
        avg_polarity=('polarity', 'mean'),
        std_polarity=('polarity', 'std'),
        count=('polarity', 'size')   # size() is safer than count()
    )
    .reset_index()
)

# -----------------------------
# Percentage calculation
# -----------------------------
total_hashtags = hashtag_sentiment['count'].sum()
hashtag_sentiment['percentage'] = (
    hashtag_sentiment['count'] / total_hashtags * 100
).round(2)

# -----------------------------
# Sort by frequency
# -----------------------------
hashtag_sentiment = hashtag_sentiment.sort_values(
    by='count',
    ascending=False
).reset_index(drop=True)

# -----------------------------
# Save output (safe path)
# -----------------------------
out_dir = os.path.join(os.getcwd(), 'data', 'processed')
os.makedirs(out_dir, exist_ok=True)

out_file = os.path.join(out_dir, 'hashtag_sentiment.csv')
hashtag_sentiment.to_csv(out_file, index=False, encoding='utf-8')

# -----------------------------
# Display results
# -----------------------------
print(f"\nSaved hashtag sentiment table to:\n{out_file}")
print(f"Total unique hashtags: {len(hashtag_sentiment)}")

print("\n" + "-" * 80)
print("Top 30 hashtags with sentiment:")
print("-" * 80)
print(hashtag_sentiment.head(30).to_string(index=False))

print("\n" + "=" * 80)
print("‚úì Task 13 completed successfully!")
print("=" * 80)



TASK 13: HASHTAG SENTIMENT ANALYSIS

Saved hashtag sentiment table to:
C:\Users\dell\data\processed\hashtag_sentiment.csv
Total unique hashtags: 10

--------------------------------------------------------------------------------
Top 30 hashtags with sentiment:
--------------------------------------------------------------------------------
hashtag_norm  avg_polarity  std_polarity  count  percentage
    #fitness      0.296679      0.505280    536       10.72
  #education      0.270190      0.514876    525       10.50
  #challenge      0.230424      0.503382    507       10.14
     #comedy      0.280634      0.471777    505       10.10
      #dance      0.275081      0.491044    496        9.92
      #music      0.255832      0.497994    493        9.86
       #tech      0.255621      0.490553    491        9.82
    #fashion      0.284569      0.474373    487        9.74
      #viral      0.300644      0.490059    481        9.62
     #gaming      0.244843      0.501876    479        9

In [19]:
# ================================
# TASK 14 - CREATE POWER BI TABLES
# ================================

import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# --------------------
# Set Output Directory
# --------------------
OUT_DIR = r"E:/internship project/data/processed"
os.makedirs(OUT_DIR, exist_ok=True)

# --------------------
# df must already exist
# Required columns:
# polarity
# hashtags  (list)
# emoji     (list)
# *_no_stopwords text
# --------------------

# ================================
# 1Ô∏è‚É£ SENTIMENT SUMMARY TABLE
# ================================
polarity_cols = [c for c in df.columns if "polarity" in c.lower()]
if not polarity_cols:
    raise ValueError("‚ùå No polarity column found")

polarity_col = polarity_cols[0]

sentiment_summary = pd.DataFrame({
    "sentiment": ["Positive", "Negative", "Neutral"],
    "count": [
        (df[polarity_col] > 0.1).sum(),
        (df[polarity_col] < -0.1).sum(),
        ((df[polarity_col] >= -0.1) & (df[polarity_col] <= 0.1)).sum()
    ]
})

sentiment_summary["percentage"] = (
    sentiment_summary["count"] / sentiment_summary["count"].sum() * 100
).round(2)

sentiment_summary.to_csv(os.path.join(OUT_DIR, "sentiment_summary.csv"), index=False)
print("‚úì sentiment_summary.csv created")

# ================================
# 2Ô∏è‚É£ HASHTAG SENTIMENT TABLE
# ================================
hashtag_cols = [c for c in df.columns if "hashtag" in c.lower()]
if not hashtag_cols:
    raise ValueError("‚ùå No hashtag column found")

hashtag_col = hashtag_cols[0]

expanded = df[[hashtag_col, polarity_col]].explode(hashtag_col).dropna(subset=[hashtag_col])
expanded["hashtag"] = expanded[hashtag_col].astype(str).str.lower()

hashtag_sentiment = (
    expanded.groupby("hashtag")
    .agg(
        avg_polarity=(polarity_col, "mean"),
        std_polarity=(polarity_col, "std"),
        count=(polarity_col, "size")
    )
    .reset_index()
)

total = hashtag_sentiment["count"].sum()
hashtag_sentiment["percentage"] = (hashtag_sentiment["count"] / total * 100).round(2)

hashtag_sentiment = hashtag_sentiment.sort_values("count", ascending=False)

hashtag_sentiment.to_csv(os.path.join(OUT_DIR, "hashtag_sentiment.csv"), index=False)
print("‚úì hashtag_sentiment.csv created")

# ================================
# 3Ô∏è‚É£ KEYWORD DRIVERS (TF-IDF)
# ================================
text_cols = [c for c in df.columns if "no_stopwords" in c.lower()]
if not text_cols:
    raise ValueError("‚ùå No cleaned text column (*_no_stopwords) found")

text_col = text_cols[0]

pos_texts = df[df[polarity_col] > 0.1][text_col].dropna()
neg_texts = df[df[polarity_col] < -0.1][text_col].dropna()

def extract_tfidf(texts, sentiment, top_n=30):
    if texts.empty:
        return pd.DataFrame()
    vec = TfidfVectorizer(max_features=top_n)
    X = vec.fit_transform(texts)
    return pd.DataFrame({
        "keyword": vec.get_feature_names_out(),
        "tfidf_score": X.mean(axis=0).A1,
        "sentiment": sentiment
    })

keyword_drivers = pd.concat([
    extract_tfidf(pos_texts, "Positive"),
    extract_tfidf(neg_texts, "Negative")
], ignore_index=True)

keyword_drivers.to_csv(os.path.join(OUT_DIR, "keyword_drivers.csv"), index=False)
print("‚úì keyword_drivers.csv created")

# ================================
# 4Ô∏è‚É£ EMOJI SENTIMENT TABLE
# ================================
emoji_cols = [c for c in df.columns if "emoji" in c.lower()]
if not emoji_cols:
    raise ValueError("‚ùå No emoji column found")

emoji_col = emoji_cols[0]

emoji_expanded = df[[emoji_col, polarity_col]].explode(emoji_col).dropna(subset=[emoji_col])

emoji_sentiment = (
    emoji_expanded.groupby(emoji_col)
    .agg(
        avg_polarity=(polarity_col, "mean"),
        count=(polarity_col, "size")
    )
    .reset_index()
    .rename(columns={emoji_col: "emoji"})
    .sort_values("count", ascending=False)
)

emoji_sentiment.to_csv(os.path.join(OUT_DIR, "emoji_sentiment.csv"), index=False)
print("‚úì emoji_sentiment.csv created")

# ================================
# DONE
# ================================
print("\nüéâ All Power BI Summary Tables Generated Successfully!")
print("üìÅ Files Created:")
print(" - sentiment_summary.csv")
print(" - hashtag_sentiment.csv")
print(" - keyword_drivers.csv")
print(" - emoji_sentiment.csv")


‚úì sentiment_summary.csv created
‚úì hashtag_sentiment.csv created
‚úì keyword_drivers.csv created
‚úì emoji_sentiment.csv created

üéâ All Power BI Summary Tables Generated Successfully!
üìÅ Files Created:
 - sentiment_summary.csv
 - hashtag_sentiment.csv
 - keyword_drivers.csv
 - emoji_sentiment.csv


In [20]:
# Task 15: Filter only negative sentiment records
print('\n' + '=' * 80)
print('TASK 15: FILTER NEGATIVE SENTIMENT RECORDS -> negative_df')
print('=' * 80)

import os

# Ensure polarity column exists
if 'polarity' not in df.columns:
    raise ValueError("Column 'polarity' not found. Run Task 7 to compute sentiment polarity before Task 15.")

# Ensure sentiment_category exists; compute if missing using same thresholds as Task 9
positive_threshold = 0.1
negative_threshold = -0.1
if 'sentiment_category' not in df.columns:
    print("'sentiment_category' not found ‚Äî computing from 'polarity' using thresholds.")
    df['sentiment_category'] = df['polarity'].apply(
        lambda x: 'positive' if x > positive_threshold else ('negative' if x < negative_threshold else 'neutral')
    )

# Filter negatives
negative_df = df[df['sentiment_category'] == 'negative'].copy()

print(f"Negative records: {len(negative_df)} rows")
if len(negative_df) > 0:
    print('\nSample negative rows (first 5):')
    print(negative_df.head(5).to_string(index=False))
else:
    print('No negative records found.')

# Optionally save to CSV for downstream use
out_dir = r"E:/internship project/data/processed"
os.makedirs(out_dir, exist_ok=True)
neg_path = os.path.join(out_dir, 'negative_df.csv')
negative_df.to_csv(neg_path, index=False, encoding='utf-8')
print(f"\nSaved negative_df to: {neg_path}")

print('\n' + '=' * 80)
print('‚úì Task 15 completed: `negative_df` created and exported.')
print('=' * 80)


TASK 15: FILTER NEGATIVE SENTIMENT RECORDS -> negative_df
Negative records: 985 rows

Sample negative rows (first 5):
Post_ID  Post_Date  Platform    Hashtag Content_Type    Region   Views  Likes  Shares  Comments Engagement_Level                                   Text cleaned_text             text_no_stopwords     hashtags  polarity  subjectivity sentiment_category emojis
 Post_3 2022-01-07   Twitter #Challenge        Video    Brazil 3666211 327143   39423     36223           Medium     It failed to meet my expectations.    challenge     failed meet expectations. [#Challenge]     -0.50          0.30           negative     []
 Post_6 2022-11-23 Instagram #Challenge       Shorts Australia 1323566 136282   86979     47129              Low     It failed to meet my expectations.    challenge     failed meet expectations. [#Challenge]     -0.50          0.30           negative     []
Post_13 2022-03-16   YouTube     #Viral       Shorts    Canada 4105651 195560   37627     49089            

In [21]:
# Task 16: Extract negative n-grams (bigrams & trigrams)
print('\n' + '=' * 80)
print('TASK 16: NEGATIVE BIGRAM & TRIGRAM FREQUENCY')
print('=' * 80)

from collections import Counter
from nltk.util import ngrams
import os
import pandas as pd

# Ensure we have a negative dataframe (negative_df) or compute it
if 'negative_df' not in globals():
    print('`negative_df` not found in memory ‚Äî computing from `df`.')
    if 'polarity' not in df.columns:
        raise ValueError("Column 'polarity' not found. Run sentiment Task 7 first.")
    # use same thresholds as Task 9
    pos_thr = 0.1
    neg_thr = -0.1
    if 'sentiment_category' not in df.columns:
        df['sentiment_category'] = df['polarity'].apply(lambda x: 'positive' if x>pos_thr else ('negative' if x<neg_thr else 'neutral'))
    negative_df = df[df['sentiment_category'] == 'negative'].copy()

print(f'Negative rows to process: {len(negative_df)}')

# Pick text column preference (prefer tokenized/no-stopwords)
candidates = ['text_no_stopwords','cleaned_text','text','tweet','content','post','message','caption','body']
text_col = None
for c in candidates:
    if c in negative_df.columns:
        text_col = c
        break

if text_col is None:
    print('‚ö† No suitable text column found in negative_df. Skipping n-gram extraction.')
else:
    print(f'Using `{text_col}` for n-gram extraction')
    texts = negative_df[text_col].fillna('').astype(str).tolist()
    bigram_counter = Counter()
    trigram_counter = Counter()

    for t in texts:
        tokens = t.split()
        if len(tokens) < 2:
            continue
        bigrams = ngrams(tokens, 2)
        trigrams = ngrams(tokens, 3) if len(tokens) >= 3 else []
        bigram_counter.update([' '.join(g) for g in bigrams])
        trigram_counter.update([' '.join(g) for g in trigrams])

    # Create DataFrames and export top N
    top_n = 200
    neg_bigram_df = pd.DataFrame(bigram_counter.most_common(top_n), columns=['bigram','frequency'])
    neg_trigram_df = pd.DataFrame(trigram_counter.most_common(top_n), columns=['trigram','frequency'])

    out_dir = r"E:/internship project/data/processed"
    os.makedirs(out_dir, exist_ok=True)
    bigram_path = os.path.join(out_dir, 'negative_bigrams.csv')
    trigram_path = os.path.join(out_dir, 'negative_trigrams.csv')
    neg_bigram_df.to_csv(bigram_path, index=False, encoding='utf-8')
    neg_trigram_df.to_csv(trigram_path, index=False, encoding='utf-8')

    print('Top 30 Negative Bigrams:')
    for gram, freq in neg_bigram_df.head(30).values:
        print(f'  {gram}: {freq}')

    print('Top 30 Negative Trigrams:')
    for gram, freq in neg_trigram_df.head(30).values:
        print(f'  {gram}: {freq}')

    print(f'\nSaved negative bigrams to: {bigram_path}')
    print(f'Saved negative trigrams to: {trigram_path}')

print('\n' + '=' * 80)
print('‚úì Task 16 completed: negative n-gram frequency tables created and exported.')
print('=' * 80)


TASK 16: NEGATIVE BIGRAM & TRIGRAM FREQUENCY
Negative rows to process: 985
Using `text_no_stopwords` for n-gram extraction
Top 30 Negative Bigrams:
  Totally disappointed: 258
  disappointed outcome.: 258
  failed meet: 250
  meet expectations.: 250
  poor experience: 246
  experience overall.: 246
  satisfied this.: 231
Top 30 Negative Trigrams:
  Totally disappointed outcome.: 258
  failed meet expectations.: 250
  poor experience overall.: 246

Saved negative bigrams to: E:/internship project/data/processed\negative_bigrams.csv
Saved negative trigrams to: E:/internship project/data/processed\negative_trigrams.csv

‚úì Task 16 completed: negative n-gram frequency tables created and exported.


In [5]:
# ============================
# TASK 17: TOPIC MODELING LDA
# ============================

print('\n' + '=' * 80)
print('TASK 17: TOPIC MODELING ON NEGATIVE POSTS')
print('=' * 80)

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# ---------------------------
# NEGATIVE TEXT INPUT
# ---------------------------
negative_texts = [
"I expected more from this product.",
"I regret choosing this.",
"It failed to meet my expectations.",
"Not satisfied at all with this.",
"Such a waste of time.",
"The experience could have been better.",
"This could definitely be improved.",
"Totally disappointed with the outcome.",
"Very poor experience overall.",
"Would not recommend this to others."
]

negative_df = pd.DataFrame({"Text": negative_texts})
print(f"Negative documents: {len(negative_df)}")

# ---------------------------
# LDA PARAMETERS
# ---------------------------
n_topics = 3        # you can change
top_n_words = 8

# ---------------------------
# VECTORIZE TEXT
# ---------------------------
vectorizer = CountVectorizer(stop_words='english')
dtm = vectorizer.fit_transform(negative_df["Text"])
feature_names = vectorizer.get_feature_names_out()

# ---------------------------
# RUN LDA
# ---------------------------
lda = LatentDirichletAllocation(n_components=n_topics,
                                random_state=42)
lda.fit(dtm)

# ---------------------------
# TOPIC KEYWORDS
# ---------------------------
topics = []
for topic_idx, topic in enumerate(lda.components_):
    top_features_ind = topic.argsort()[::-1][:top_n_words]
    top_words = [feature_names[i] for i in top_features_ind]
    topics.append({"topic": f"topic_{topic_idx}",
                   "keywords": " ".join(top_words)})

topics_df = pd.DataFrame(topics)
print("\nTop Topics & Keywords:")
print(topics_df)

# ---------------------------
# ASSIGN TOPIC LABEL TO EACH POST
# ---------------------------
doc_topic_dist = lda.transform(dtm)
doc_topics = doc_topic_dist.argmax(axis=1)
negative_df["topic_label"] = ["topic_" + str(t) for t in doc_topics]

# Optionally save to CSV for downstream use
out_dir = r"E:/internship project/data/processed"
os.makedirs(out_dir, exist_ok=True)
neg_path = os.path.join(out_dir, 'LDA2.csv')
negative_df.to_csv(neg_path, index=False, encoding='utf-8')
print(f"\nSaved negative_df to: {neg_path}")

print("\nNegative Posts with Topic Labels:")
print(negative_df)

print('\n' + '=' * 80)
print('‚úì Task 17 Completed')
print('=' * 80)



TASK 17: TOPIC MODELING ON NEGATIVE POSTS
Negative documents: 10

Top Topics & Keywords:
     topic                                           keywords
0  topic_0  experience poor overall improved definitely sa...
1  topic_1  outcome disappointed totally meet expectations...
2  topic_2  choosing regret waste time product expected be...

Saved negative_df to: E:/internship project/data/processed\LDA2.csv

Negative Posts with Topic Labels:
                                     Text topic_label
0      I expected more from this product.     topic_2
1                 I regret choosing this.     topic_2
2      It failed to meet my expectations.     topic_1
3         Not satisfied at all with this.     topic_0
4                   Such a waste of time.     topic_2
5  The experience could have been better.     topic_2
6      This could definitely be improved.     topic_0
7  Totally disappointed with the outcome.     topic_1
8           Very poor experience overall.     topic_0
9     Would not re

In [8]:
# ============================
# TASK 18: SUMMARIZE TOPIC CLUSTERS
# ============================

print('\n' + '=' * 80)
print('TASK 18: SUMMARIZE TOPIC CLUSTERS')
print('=' * 80)

import pandas as pd
import os

# ---------------------------
# LOAD DATA
# ---------------------------
out_dir = r"E:/internship project/data/processed"
lda_file = os.path.join(out_dir, 'LDA2.csv')

if 'negative_df' not in globals():
    if os.path.exists(lda_file):
        negative_df = pd.read_csv(lda_file)
        print("Loaded LDA2.csv from disk...")
    else:
        raise FileNotFoundError("LDA2.csv not found. Run Task 17 first!")

# ---------------------------
# SAFETY CHECK FOR VARIABLES
# (handles cases when Task 18 runs separately)
# ---------------------------
try:
    n_topics = n_topics
except:
    n_topics = lda.n_components

try:
    top_n_words = top_n_words
except:
    top_n_words = 8

# ---------------------------
# CREATE TOPIC SUMMARY
# ---------------------------
summary_list = []

for topic_idx in range(n_topics):
    topic_name = f"topic_{topic_idx}"

    # Extract top words of this topic
    topic_words_idx = lda.components_[topic_idx].argsort()[::-1][:top_n_words]
    topic_words = [feature_names[i] for i in topic_words_idx]

    # Count how many posts belong to this topic
    count = (negative_df["topic_label"] == topic_name).sum()

    summary_list.append({
        "cluster_name": topic_name,
        "key_terms": ", ".join(topic_words),
        "total_posts": count
    })

topics_summary_df = pd.DataFrame(summary_list)

# ---------------------------
# ADD HUMAN READABLE CLUSTER NAMES
# ---------------------------
cluster_mapping = {
    "topic_0": "Poor Experience / Quality Issue",
    "topic_1": "Expectation Failure / Disappointment",
    "topic_2": "Regret / Not Recommend / Waste of Time"
}

topics_summary_df["cluster_name"] = topics_summary_df["cluster_name"].map(cluster_mapping)

print("\nTopic Cluster Summary")
print(topics_summary_df)

# ---------------------------
# SAVE OUTPUT
# ---------------------------
topics_path = os.path.join(out_dir, 'negative_topics_keywords.csv')
topics_summary_df.to_csv(topics_path, index=False, encoding='utf-8')
print(f"\nSaved topic summary to: {topics_path}")

print('\n' + '=' * 80)
print('‚úì Task 18 Completed')
print('=' * 80)



TASK 18: SUMMARIZE TOPIC CLUSTERS

Topic Cluster Summary
                             cluster_name  \
0         Poor Experience / Quality Issue   
1    Expectation Failure / Disappointment   
2  Regret / Not Recommend / Waste of Time   

                                           key_terms  total_posts  
0  experience, poor, overall, improved, definitel...            3  
1  outcome, disappointed, totally, meet, expectat...            3  
2  choosing, regret, waste, time, product, expect...            4  

Saved topic summary to: E:/internship project/data/processed\negative_topics_keywords.csv

‚úì Task 18 Completed


In [19]:
# ============================
# TASK 19: TOXIC KEYWORD FREQUENCY (OVERALL ONLY)
# ============================

print('\n' + '=' * 80)
print('TASK 19: OVERALL TOXIC KEYWORD FREQUENCY')
print('=' * 80)

import pandas as pd
import os
from collections import Counter

# ---------------------------
# LOAD NEGATIVE DATA
# ---------------------------
out_dir = r"E:/internship project/data/processed"
lda_file = os.path.join(out_dir, 'LDA2.csv')

if 'negative_df' not in globals():
    if os.path.exists(lda_file):
        negative_df = pd.read_csv(lda_file)
        print("Loaded LDA2.csv from disk...")
    else:
        raise FileNotFoundError("Run Task 17 first!")

# ---------------------------
# TOXIC LEXICON
# (you may expand if needed)
# ---------------------------
toxicity_words = [
    "poor","bad","worst","terrible","waste","useless",
    "regret","hate","angry","disappointed","failed",
    "not","recommend"
]

# ---------------------------
# CLEAN TEXT
# ---------------------------
negative_df["clean_text"] = negative_df["Text"].str.lower()

# ---------------------------
# COUNT TOXIC WORDS (GLOBAL)
# ---------------------------
toxic_counter = Counter()

for text in negative_df["clean_text"]:
    for word in toxicity_words:
        if word in text:
            toxic_counter[word] += 1

toxic_df = pd.DataFrame(toxic_counter.items(),
                        columns=["toxic_word","frequency"]
                        ).sort_values(by="frequency",
                                      ascending=False)

print("\nOverall Toxic Keyword Frequency:")
print(toxic_df)

# ---------------------------
# SAVE OUTPUT
# ---------------------------
save_path = os.path.join(out_dir, "toxic_keywords_overall.csv")
toxic_df.to_csv(save_path, index=False)

print(f"\nSaved Overall Toxic Table to: {save_path}")

print('\n' + '=' * 80)
print('‚úì Task 19 Completed')
print('=' * 80)



TASK 19: OVERALL TOXIC KEYWORD FREQUENCY

Overall Toxic Keyword Frequency:
     toxic_word  frequency
2           not          2
0        regret          1
1        failed          1
3         waste          1
4  disappointed          1
5          poor          1
6     recommend          1

Saved Overall Toxic Table to: E:/internship project/data/processed\toxic_keywords_overall.csv

‚úì Task 19 Completed


In [20]:
# ============================
# TASK 20: EXPORT ROOT-CAUSE TABLES
# ============================

print('\n' + '=' * 80)
print('TASK 20: EXPORT ALL ROOT-CAUSE TABLES')
print('=' * 80)

import pandas as pd
import os

out_dir = r"E:/internship project/data/processed"
os.makedirs(out_dir, exist_ok=True)

# ---------------------------
# 1Ô∏è‚É£ Topic Clusters Table (From Task 18)
# ---------------------------
topic_file = os.path.join(out_dir, "negative_topics_keywords.csv")
topic_clusters_path = os.path.join(out_dir, "topic_clusters.csv")

if os.path.exists(topic_file):
    topic_df = pd.read_csv(topic_file)
    topic_df.to_csv(topic_clusters_path, index=False)
    print(f"Exported: {topic_clusters_path}")
else:
    print("Task 18 result not found! Run Task 18 first.")

# ---------------------------
# 2Ô∏è‚É£ Negative Keywords Table
# (We will reuse same topic file keywords)
# ---------------------------
negative_keywords_path = os.path.join(out_dir, "negative_keywords.csv")

if os.path.exists(topic_file):
    negative_keywords_df = topic_df[["cluster_name","key_terms"]]
    negative_keywords_df.to_csv(negative_keywords_path, index=False)
    print(f"Exported: {negative_keywords_path}")
else:
    print("Negative keywords skipped because Task 18 file missing.")

# ---------------------------
# 3Ô∏è‚É£ Toxic Keywords Table (From Task 19)
# ---------------------------
toxic_file = os.path.join(out_dir, "toxic_keywords_overall.csv")
toxic_export_path = os.path.join(out_dir, "toxic_keywords.csv")

if os.path.exists(toxic_file):
    toxic_df = pd.read_csv(toxic_file)
    toxic_df.to_csv(toxic_export_path, index=False)
    print(f"Exported: {toxic_export_path}")
else:
    print("Task 19 toxic output not found! Run Task 19 first.")

print('\n' + '=' * 80)
print('‚úì Task 20 Completed')
print('=' * 80)



TASK 20: EXPORT ALL ROOT-CAUSE TABLES
Exported: E:/internship project/data/processed\topic_clusters.csv
Exported: E:/internship project/data/processed\negative_keywords.csv
Exported: E:/internship project/data/processed\toxic_keywords.csv

‚úì Task 20 Completed
