# Instagram Comments Cleaner
This notebook cleans scraped Instagram comment data to prepare it for semantic analysis.

In [None]:
import pandas as pd
import re

# Load the scraped Instagram comments.
# The path is relative to the notebook's working directory. Later cells read the
# 'text' and 'username' columns, so the CSV must provide both.
dataframe = pd.read_csv('instagram_comments.csv')

In [None]:
# First look at the raw data: row count, column names and a preview of the
# leading rows (rendered as the cell output).
print("Original dataset info:")
print(f"Total comments: {dataframe.shape[0]}")
print(f"Columns: {dataframe.columns.tolist()}")
print("\nFirst few rows:")
dataframe.head()

In [None]:
# Show detailed statistics before cleaning.
# This is the baseline for the describe() output shown again after the cleaning
# steps, so it has to run before any rows are dropped.
print("Before cleaning:")
dataframe.describe()

In [None]:
# Print the first ten raw comment texts, numbered from 1, to get a feel for
# what needs cleaning.
print("Sample comment texts:")
for position, text in enumerate(dataframe['text'].head(10), start=1):
    print(f"{position}. {text}")

In [None]:
# Start cleaning process
print("🧹 Starting Instagram comment cleaning...")

# 1. Remove comments with empty or null text
# initial_count is reused by the summary cell to report the overall reduction.
initial_count = len(dataframe)
dataframe = dataframe[dataframe['text'].notna()]
dataframe = dataframe[dataframe['text'].str.strip() != '']
print(f"Removed {initial_count - len(dataframe)} empty/null comments")

# 2. Remove comments that consist solely of emoji and whitespace.
# The character class is assembled piecewise so each range is documented.
# Besides the pictograph blocks it includes the invisible joiner/selector code
# points that accompany emoji: without U+FE0F a comment such as "❤️❤️"
# (U+2764 U+FE0F, twice) is not recognised as emoji-only, and without U+200D
# multi-person / profession ZWJ sequences are missed as well.
emoji_pattern = (
    r'^['
    r'\U0001F600-\U0001F64F'  # Emoticons
    r'\U0001F300-\U0001F5FF'  # Miscellaneous Symbols and Pictographs (incl. skin tones)
    r'\U0001F680-\U0001F6FF'  # Transport and Map Symbols
    r'\U0001F1E0-\U0001F1FF'  # Regional indicator symbols (flags)
    r'\U00002600-\U000027BF'  # Miscellaneous Symbols + Dingbats
    r'\U0001F900-\U0001F9FF'  # Supplemental Symbols and Pictographs
    r'\U0001FA70-\U0001FAFF'  # Symbols and Pictographs Extended-A
    r'\u200D\uFE0F'           # Zero-width joiner, variation selector-16
    r'\s'
    r']*$'
)
before_emoji = len(dataframe)
dataframe = dataframe[~dataframe['text'].str.match(emoji_pattern, na=False)]
print(f"Removed {before_emoji - len(dataframe)} emoji-only comments")

# 3. Remove very short comments (less than 3 characters)
# Length is measured on the stripped text: surrounding whitespace is removed by
# the later normalisation step, so a padded "k  " must count as one character.
before_short = len(dataframe)
dataframe = dataframe[dataframe['text'].str.strip().str.len() >= 3]
print(f"Removed {before_short - len(dataframe)} very short comments")

In [None]:
# 4. Drop comments whose text matches one of the known spam templates.
# All templates are matched case-insensitively from the start of the text.
spam_patterns = [
    r'^(dm me|dm|follow me|follow back|f4f|l4l|like4like|follow4follow)$',
    r'^(check my.*|visit my.*|link in bio|see my profile).*',
    r'^(buy.*|sale.*|discount.*|promo.*|offer.*)$',
    r'^(@\w+\s*)+$',  # nothing but @mentions
    r'^(first|second|third|1st|2nd|3rd)$',  # "first!"-style ordinals
    r'^\.\.\.$',  # a bare ellipsis
]

before_spam = len(dataframe)
# A row is spam when ANY template matches, so the templates are OR-ed into a
# single alternation and applied in one pass.
any_spam_pattern = '|'.join(f'(?:{pattern})' for pattern in spam_patterns)
is_spam = dataframe['text'].str.match(any_spam_pattern, case=False, na=False)
dataframe = dataframe[~is_spam]
print(f"Removed {before_spam - len(dataframe)} spam-like comments")

# 5. Drop comments from accounts whose username looks automated (optional,
# and deliberately narrow): the name must START with one of the keywords and
# END with digits, because str.match anchors at the beginning of the string.
bot_patterns = r'(bot|spam|fake|auto|promo|sale|buy|shop).*\d+$'
before_bots = len(dataframe)
is_bot = dataframe['username'].str.match(bot_patterns, case=False, na=False)
dataframe = dataframe[~is_bot]
print(f"Removed {before_bots - len(dataframe)} potential bot comments")

In [None]:
# 6. Remove duplicate comments (same text from same user); the first
# occurrence of each (username, text) pair is kept.
before_duplicates = len(dataframe)
dataframe = dataframe.drop_duplicates(subset=['username', 'text'], keep='first')
print(f"Removed {before_duplicates - len(dataframe)} duplicate comments")

# 7. Remove comments that are just URLs or links
# The pattern requires the WHOLE comment to be one or more whitespace-separated
# http(s) URLs. A pattern like '^https?://.*$' would also discard genuine
# comments that merely begin with a link ("https://... loved this part!").
before_links = len(dataframe)
url_pattern = r'^\s*https?://\S+(?:\s+https?://\S+)*\s*$'
dataframe = dataframe[~dataframe['text'].str.match(url_pattern, na=False)]
print(f"Removed {before_links - len(dataframe)} URL-only comments")

# 8. Optional: Remove comments with excessive repeated characters (like "sooooo good")
def has_excessive_repeats(text):
    """Report whether ``text`` contains a run of five or more identical characters.

    The value is converted with ``str()`` first, so non-string inputs are
    checked against their string representation. Any character counts
    (letters, punctuation, spaces) except a newline, which ``.`` does not match.

    Returns:
        bool: True if such a run exists, otherwise False.
    """
    # One captured character followed by at least four copies of itself.
    repeated_run = re.search(r'(.)\1{4,}', str(text))
    return repeated_run is not None

# Flag every comment containing a long run of one character, then keep the rest.
before_repeats = len(dataframe)
repeat_mask = dataframe['text'].apply(has_excessive_repeats)
dataframe = dataframe[~repeat_mask]
print(f"Removed {before_repeats - len(dataframe)} comments with excessive character repetition")

In [None]:
# Clean up text content (normalize without removing)
def clean_text(text):
    """Collapse whitespace in a single comment.

    Every run of whitespace (spaces, tabs, newlines) becomes one space and
    leading/trailing whitespace is dropped. Missing values (anything
    ``pd.isna`` reports as NA) are returned unchanged.
    """
    if pd.isna(text):
        return text

    # split() with no arguments discards leading/trailing whitespace and splits
    # on whitespace runs, so re-joining the words yields the normalised text.
    words = text.split()
    return ' '.join(words).strip()

# Normalise every comment's whitespace.
# The frame at this point is the result of boolean filtering; assigning into
# one of its columns makes pandas emit SettingWithCopyWarning, so a new frame
# is built with .assign() instead (column order is unchanged).
dataframe = dataframe.assign(text=dataframe['text'].apply(clean_text))
print("✅ Normalized text content (removed extra whitespace)")

In [None]:
# Last pass: whitespace normalisation can leave a comment empty, so drop any
# row whose text now has zero length (missing text compares False and is
# dropped as well).
before_final = len(dataframe)
has_text = dataframe['text'].str.len().gt(0)
dataframe = dataframe[has_text]
print(f"Final cleanup: removed {before_final - len(dataframe)} empty entries")

# Renumber the rows 0..n-1 now that filtering is finished
dataframe = dataframe.reset_index(drop=True)

In [None]:
# Show cleaning results: final size, how many rows were removed relative to
# the count recorded at the start of cleaning, and summary statistics.
removed_count = initial_count - len(dataframe)
# An input file with zero rows would otherwise raise ZeroDivisionError here.
removed_pct = (removed_count / initial_count * 100) if initial_count else 0.0

print("\n📊 CLEANING SUMMARY:")
print(f"✅ Final dataset: {len(dataframe)} comments")
print(f"📉 Removed: {removed_count} comments ({removed_pct:.1f}%)")
print("\nAfter cleaning:")
dataframe.describe()

In [None]:
# Preview the first ten cleaned comments as "<n>. @user: text", truncating
# the text to 100 characters (with a trailing "..." when it was cut).
print("\n🔍 Sample of cleaned comments:")
preview = dataframe.head(10)
for i, username, text in zip(preview.index, preview['username'], preview['text']):
    print(f"{i+1}. @{username}: {text[:100]}{'...' if len(text) > 100 else ''}")

In [None]:
# Sanity metrics over the cleaned data. Each count is the number of True
# values in a boolean mask over the 'text' column.
text_lengths = dataframe['text'].str.len()
with_mentions = dataframe['text'].str.contains('@', na=False)
with_hashtags = dataframe['text'].str.contains('#', na=False)

print("\n🔍 Quality check:")
print(f"Comments with very long text (>500 chars): {(text_lengths > 500).sum()}")
print(f"Unique usernames: {dataframe['username'].nunique()}")
print(f"Average comment length: {text_lengths.mean():.1f} characters")
print(f"Comments with mentions (@): {with_mentions.sum()}")
print(f"Comments with hashtags (#): {with_hashtags.sum()}")

In [None]:
# Save the cleaned dataset (without the index column) next to the notebook.
# The confirmation message is built from the same path used for writing, so
# the two cannot drift apart; the stray ")" the message used to end with is gone.
output_path = 'cleaned_instagram_comments.csv'
dataframe.to_csv(output_path, index=False)
print(f"💾 Saved cleaned comments to '{output_path}'")