In [1]:
# ============================================================
# TASK 1 : SOCIAL MEDIA DATA CLEANING (GOOGLE COLAB VERSION)
# ============================================================

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from google.colab import files

# Download the NLTK stopword corpus and keep the English list as a set so
# the per-word membership test in clean_text is a constant-time lookup.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

print("\n================= TASK 1: SOCIAL MEDIA CLEANING =================\n")

# ---------------- UPLOAD DATASET ----------------
print("Upload your CSV file")
uploaded = files.upload()

# Guard against an empty result (e.g. the upload dialog being dismissed):
# fail with a clear message instead of an IndexError on the name lookup.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the cell and choose a CSV file.")

# Only the first uploaded file is processed; any additional files are ignored.
file_name = next(iter(uploaded))
social_df = pd.read_csv(file_name)

# The cleaning steps index these columns directly, so report a wrong schema
# up front rather than as a KeyError part-way through the pipeline.
required_columns = ("post_text", "likes", "shares", "timestamp")
missing_columns = [col for col in required_columns if col not in social_df.columns]
if missing_columns:
    raise ValueError(
        "Uploaded CSV is missing required column(s): {}".format(", ".join(missing_columns))
    )

# ---------------- BEFORE CLEANING ----------------
print("\n--- BEFORE CLEANING ---")
print(social_df.head())

# ---------------- TEXT CLEANING FUNCTION ----------------
def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)      # remove punctuation & symbols
    words = [w for w in text.split() if w not in stop_words]
    return " ".join(words)

# Build the normalized text column from the raw post text
social_df["clean_post"] = social_df["post_text"].apply(clean_text)

# ---------------- HANDLE MISSING VALUES ----------------
# Fill gaps in each engagement metric with that column's own median.
for metric in ("likes", "shares"):
    column_median = social_df[metric].median()
    social_df[metric] = social_df[metric].fillna(column_median)

# ---------------- TIMESTAMP PROCESSING ----------------
# Values that cannot be parsed are coerced to NaT instead of raising.
parsed_ts = pd.to_datetime(social_df["timestamp"], errors="coerce")
social_df["timestamp"] = parsed_ts
social_df["hour"] = parsed_ts.dt.hour
social_df["weekday"] = parsed_ts.dt.day_name()

# ---------------- REMOVE DUPLICATES / SPAM ----------------
# Keep only the first row for each distinct cleaned text.
social_df = social_df.drop_duplicates(subset=["clean_post"])

# ---------------- AFTER CLEANING ----------------
print("\n--- AFTER CLEANING ---")
print(social_df.head())

# ---------------- TEST CASES ----------------
# Run the sanity checks before exporting, so a dataset that fails them is
# never written to disk or offered for download.
assert social_df["clean_post"].isna().sum() == 0, "clean_post contains missing values"
assert social_df["likes"].isna().sum() == 0, "likes still contains missing values"
assert "hour" in social_df.columns, "hour column was not created"

# ---------------- SAVE CLEANED DATASET ----------------
# index=False keeps the DataFrame's row index out of the exported CSV.
social_df.to_csv("cleaned_social_media.csv", index=False)
files.download("cleaned_social_media.csv")

print("\n Task 1 Passed All Tests")




Upload your CSV file


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Saving social_media (1).csv to social_media (1) (1).csv

--- BEFORE CLEANING ---
   post_id    user                      post_text  likes  shares  \
0        1  user_1  This is a sample POST!!! #fun   20.0     1.0   
1        2  user_2        <html>Great Day!</html>   20.0     3.0   
2        3  user_3  This is a sample POST!!! #fun   20.0     1.0   
3        4  user_4        <html>Great Day!</html>  100.0     NaN   
4        5  user_5  This is a sample POST!!! #fun   20.0     5.0   

             timestamp  
0  2025-01-01 00:00:00  
1  2025-01-01 06:00:00  
2  2025-01-01 12:00:00  
3  2025-01-01 18:00:00  
4  2025-01-02 00:00:00  

--- AFTER CLEANING ---
   post_id    user                      post_text  likes  shares  \
0        1  user_1  This is a sample POST!!! #fun   20.0     1.0   
1        2  user_2        <html>Great Day!</html>   20.0     3.0   

            timestamp         clean_post  hour    weekday  
0 2025-01-01 00:00:00    sample post fun     0  Wednesday  
1 2025-01-0

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


 Task 1 Passed All Tests
