In [1]:
# ========================================
# 1. Mount Google Drive
# ========================================
from google.colab import drive
drive.mount('/content/drive')

# ========================================
# 2. Import Required Libraries
# ========================================
import pandas as pd
import re

# ========================================
# 3. Load the Social Media Sentiment Dataset
# ========================================
dataset_path = "/content/drive/MyDrive/Project2/raw data/socialmedia sentiment.csv"
df_social = pd.read_csv(dataset_path, low_memory=False)

print("Initial DataFrame Shape:", df_social.shape)
print("Columns:", df_social.columns.tolist())
print(df_social.head())




Mounted at /content/drive
Initial DataFrame Shape: (732, 15)
Columns: ['Unnamed: 0.1', 'Unnamed: 0', 'Text', 'Sentiment', 'Timestamp', 'User', 'Platform', 'Hashtags', 'Retweets', 'Likes', 'Country', 'Year', 'Month', 'Day', 'Hour']
   Unnamed: 0.1  Unnamed: 0  \
0             0           0   
1             1           1   
2             2           2   
3             3           3   
4             4           4   

                                                Text    Sentiment  \
0   Enjoying a beautiful day at the park!        ...   Positive     
1   Traffic was terrible this morning.           ...   Negative     
2   Just finished an amazing workout! 💪          ...   Positive     
3   Excited about the upcoming weekend getaway!  ...   Positive     
4   Trying out a new recipe for dinner tonight.  ...   Neutral      

             Timestamp            User     Platform  \
0  2023-01-15 12:30:00   User123          Twitter     
1  2023-01-15 08:45:00   CommuterX        Twitter     
2 

In [2]:
# ========================================
# 4. Drop 'Unnamed: 0' Column if Present
# ========================================
if "Unnamed: 0" in df_social.columns:
    df_social.drop(columns=["Unnamed: 0"], inplace=True)

# ========================================
# 5. Define a Text Cleaning Function
# ========================================
def clean_text(text):
    """
    Basic cleaning: removes extra whitespace, converts to lowercase.
    Extend this function as needed for your project (e.g., removing URLs, punctuation).
    """
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()                  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)     # Normalize whitespace
    return text.strip()

# ========================================
# 6. Clean the 'Text' Column
# ========================================
if "Text" in df_social.columns:
    df_social["Text"] = df_social["Text"].fillna("").apply(clean_text)
else:
    print("Column 'Text' not found in the dataset.")

# ========================================
# 7. Convert 'Timestamp' to datetime
# ========================================
if "Timestamp" in df_social.columns:
    df_social["Timestamp"] = pd.to_datetime(df_social["Timestamp"], errors='coerce')

# ========================================
# 8. Standardize Categorical Columns
# ========================================
for col in ["User", "Platform", "Country"]:
    if col in df_social.columns:
        df_social[col] = df_social[col].astype(str).str.strip().str.lower()

# ========================================
# 9. Convert 'Retweets' and 'Likes' to Numeric and Fill Missing Values
# ========================================
for col in ["Retweets", "Likes"]:
    if col in df_social.columns:
        df_social[col] = pd.to_numeric(df_social[col], errors='coerce').fillna(0)

# ========================================
# 10. Inspect the Data After Preprocessing
# ========================================
print("\nDataFrame After Preprocessing:")
print(df_social.head())



DataFrame After Preprocessing:
   Unnamed: 0.1                                         Text    Sentiment  \
0             0        enjoying a beautiful day at the park!   Positive     
1             1           traffic was terrible this morning.   Negative     
2             2          just finished an amazing workout! 💪   Positive     
3             3  excited about the upcoming weekend getaway!   Positive     
4             4  trying out a new recipe for dinner tonight.   Neutral      

            Timestamp        User   Platform  \
0 2023-01-15 12:30:00     user123    twitter   
1 2023-01-15 08:45:00   commuterx    twitter   
2 2023-01-15 15:45:00  fitnessfan  instagram   
3 2023-01-15 18:20:00  adventurex   facebook   
4 2023-01-15 19:55:00    chefcook  instagram   

                                     Hashtags  Retweets  Likes    Country  \
0   #Nature #Park                                  15.0   30.0        usa   
1   #Traffic #Morning                               5.0   10.0

In [4]:
# ========================================
# 11. Save the Cleaned Dataset
# ========================================
cleaned_path = "/content/drive/MyDrive/Project2/clean_data/socialmedia_sentiment_cleaned.csv"
df_social.to_csv(cleaned_path, index=False)
print(f"\nCleaned dataset saved at: {cleaned_path}")



Cleaned dataset saved at: /content/drive/MyDrive/Project2/clean_data/socialmedia_sentiment_cleaned.csv
