In [33]:
!pip install nltk pandas numpy scikit-learn beautifulsoup4 --quiet

import pandas as pd
import numpy as np
import nltk
import re
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder

nltk.download("stopwords")
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:


print("\n==================== TASK 1: SOCIAL MEDIA CLEANING ====================")

# Upload dataset in Google Colab
#from google.colab import files
#uploaded = files.upload()

social_df = pd.read_csv("/content/social_media.csv")

# ----- BEFORE SUMMARY -----
print("\n--- BEFORE CLEANING ---")
print(social_df.head())

# -------- REMOVE PUNCTUATION, STOPWORDS, SPECIAL SYMBOLS ----------
def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)       # remove punctuation/symbols
    words = [w for w in text.split() if w not in stop_words]
    return " ".join(words)

social_df["clean_post"] = social_df["post_text"].apply(clean_text)

# -------- HANDLE MISSING VALUES IN likes & shares ----------
social_df["likes"] = social_df["likes"].fillna(social_df["likes"].median())
social_df["shares"] = social_df["shares"].fillna(0)

# -------- TIMESTAMP PROCESSING ----------
social_df["timestamp"] = pd.to_datetime(social_df["timestamp"])
social_df["hour"] = social_df["timestamp"].dt.hour
social_df["weekday"] = social_df["timestamp"].dt.day_name()

# -------- REMOVE DUPLICATE / SPAM ----------
social_df.drop_duplicates(subset="clean_post", inplace=True)

# ----- AFTER SUMMARY -----
print("\n--- AFTER CLEANING ---")
print(social_df.head())

# -------- TEST CASES ----------
assert social_df["clean_post"].isna().sum() == 0
assert social_df["likes"].isna().sum() == 0
assert "hour" in social_df.columns

print("\nTask 1 Passed All Tests ✔")



--- BEFORE CLEANING ---
   post_id    user                      post_text  likes  shares  \
0        1  user_1  This is a sample POST!!! #fun   20.0     1.0   
1        2  user_2        <html>Great Day!</html>   20.0     3.0   
2        3  user_3  This is a sample POST!!! #fun   20.0     1.0   
3        4  user_4        <html>Great Day!</html>  100.0     NaN   
4        5  user_5  This is a sample POST!!! #fun   20.0     5.0   

             timestamp  
0  2025-01-01 00:00:00  
1  2025-01-01 06:00:00  
2  2025-01-01 12:00:00  
3  2025-01-01 18:00:00  
4  2025-01-02 00:00:00  

--- AFTER CLEANING ---
   post_id    user                      post_text  likes  shares  \
0        1  user_1  This is a sample POST!!! #fun   20.0     1.0   
1        2  user_2        <html>Great Day!</html>   20.0     3.0   

            timestamp         clean_post  hour    weekday  
0 2025-01-01 00:00:00    sample post fun     0  Wednesday  
1 2025-01-01 06:00:00  htmlgreat dayhtml     6  Wednesday  

Task 