In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd

# Ensure NLTK resources are downloaded
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
try:
    WordNetLemmatizer()
except LookupError:
    nltk.download('wordnet')

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()  # Lowercase
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) # Remove URLs
    text = re.sub(r'\S*@\S*\s?', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words] # Remove stopwords and lemmatize
    return ' '.join(tokens)

# Load the dataset (assuming 'social_media.csv' is in the same directory)
social_media_df = pd.read_csv('social_media.csv')

# Task 1.1: Remove stopwords, punctuation, and special symbols from 'post_text'
# Assuming the text column is named 'post_text'
if 'post_text' in social_media_df.columns:
    social_media_df['cleaned_post_text'] = social_media_df['post_text'].apply(clean_text)
else:
    print("Warning: 'post_text' column not found. Skipping text cleaning.")

# Task 1.2: Handle missing values in 'likes' and 'shares' columns
# Assuming 'likes' and 'shares' columns exist and are numeric
for col in ['likes', 'shares']:
    if col in social_media_df.columns:
        social_media_df[col] = pd.to_numeric(social_media_df[col], errors='coerce').fillna(0).astype(int)
    else:
        print(f"Warning: '{col}' column not found. Skipping missing value handling for '{col}'.")

# Task 1.3: Convert timestamp to datetime and extract features
# Assuming 'timestamp' column exists
if 'timestamp' in social_media_df.columns:
    social_media_df['timestamp'] = pd.to_datetime(social_media_df['timestamp'], errors='coerce')
    social_media_df['hour'] = social_media_df['timestamp'].dt.hour
    social_media_df['weekday'] = social_media_df['timestamp'].dt.day_name()
    # Drop original timestamp if desired, or keep both
    # social_media_df = social_media_df.drop('timestamp', axis=1)
else:
    print("Warning: 'timestamp' column not found. Skipping datetime conversion and feature extraction.")

# Task 1.4: Detect and remove spam/duplicate posts
# Remove exact duplicates based on the cleaned text to handle content-based duplicates
if 'cleaned_post_text' in social_media_df.columns:
    initial_rows = social_media_df.shape[0]
    social_media_df.drop_duplicates(subset=['cleaned_post_text'], inplace=True)
    print(f"Removed {initial_rows - social_media_df.shape[0]} duplicate posts.")
else:
    print("Warning: 'cleaned_post_text' column not found. Skipping duplicate post removal based on text.")

# Display the cleaned dataset head and info
social_media_df.head()
social_media_df.info()

Removed 18 duplicate posts.
<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, 0 to 1
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   post_id            2 non-null      int64         
 1   user               2 non-null      object        
 2   post_text          2 non-null      object        
 3   likes              2 non-null      int64         
 4   shares             2 non-null      int64         
 5   timestamp          2 non-null      datetime64[ns]
 6   cleaned_post_text  2 non-null      object        
 7   hour               2 non-null      int32         
 8   weekday            2 non-null      object        
dtypes: datetime64[ns](1), int32(1), int64(3), object(4)
memory usage: 152.0+ bytes
