In [1]:
# ========================================
# 1. Mount Google Drive
# ========================================
from google.colab import drive
# Attach Google Drive at /content/drive. The dataset and output paths used in
# this notebook live under /content/drive/MyDrive, so this cell has to run first.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
drive.mount('/content/drive')



Mounted at /content/drive


In [2]:
# ========================================
# 2. Import Required Libraries
# ========================================
import os
import re

import pandas as pd

# ========================================
# 3. Load the Twitter Training Dataset (No Header)
# ========================================
# Column labels are supplied here because the CSV has no header row.
column_names = ["ID", "Topic", "Sentiment", "Text"]
dataset_path = "/content/drive/MyDrive/Project2/raw data/twitter_training.csv"

# header=None makes pandas treat the first line as data rather than as labels.
df_twitter_train = pd.read_csv(dataset_path, names=column_names, header=None, low_memory=False)

# Quick look at what was loaded: dimensions, column labels, first rows.
print("Initial DataFrame Shape:", df_twitter_train.shape)
print("Columns:", list(df_twitter_train.columns))
print(df_twitter_train.head())

# ========================================
# 4. Define a Text Cleaning Function
# ========================================
def clean_text(text):
    """
    Basic cleaning: converts to lowercase and removes extra whitespace.

    Args:
        text: Value to clean. Non-string values are converted with str().
            Missing values (None or float NaN) are treated as empty text.

    Returns:
        str: The lowercased text with every run of whitespace collapsed to a
        single space and leading/trailing whitespace removed. Returns "" for
        missing values.

    You can extend this function to remove URLs, punctuation, etc.
    """
    # Without this guard, str(None) / str(nan) would produce the literal tokens
    # "none" / "nan", which are indistinguishable from real tweet text.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()                  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)     # Normalize whitespace (tabs/newlines included)
    return text.strip()

# ========================================


Initial DataFrame Shape: (74682, 4)
Columns: ['ID', 'Topic', 'Sentiment', 'Text']
     ID        Topic Sentiment  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

                                                Text  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  


In [3]:
# 5. Clean and Preprocess Key Columns
# ========================================

# A) Text Column
# Missing tweets are turned into empty strings first so that clean_text only
# ever receives str values.
if "Text" in df_twitter_train.columns:
    df_twitter_train["Text"] = df_twitter_train["Text"].fillna("").apply(clean_text)

# B) Convert Numeric Columns (e.g., 'ID')
if "ID" in df_twitter_train.columns:
    id_numeric = pd.to_numeric(df_twitter_train["ID"], errors='coerce')
    n_bad_ids = int(id_numeric.isna().sum())
    if n_bad_ids:
        # 0 is used as the placeholder for IDs that could not be parsed; report
        # how many there are instead of replacing them silently.
        print(f"Warning: {n_bad_ids} missing/non-numeric ID value(s) replaced with 0")
    df_twitter_train["ID"] = id_numeric.fillna(0).astype(int)

# C) Standardize String Columns (e.g., 'Topic', 'Sentiment')
string_cols = ["Topic", "Sentiment"]
for col in string_cols:
    if col in df_twitter_train.columns:
        raw_values = df_twitter_train[col]
        # astype(str) would turn missing labels into the literal string "nan",
        # hiding them from isna()/dropna(); .where() keeps them missing.
        df_twitter_train[col] = (
            raw_values.astype(str).str.strip().str.lower().where(raw_values.notna())
        )

# ========================================
# 6. Inspect the Data After Preprocessing
# ========================================
print("\nDataFrame After Preprocessing:")
print(df_twitter_train.head())
print("\nDataFrame Info:")
# info() prints its report itself and returns None, so wrapping it in print()
# would add a stray "None" line to the output.
df_twitter_train.info()

# ========================================



DataFrame After Preprocessing:
     ID        Topic Sentiment  \
0  2401  borderlands  positive   
1  2401  borderlands  positive   
2  2401  borderlands  positive   
3  2401  borderlands  positive   
4  2401  borderlands  positive   

                                                Text  
0  im getting on borderlands and i will murder yo...  
1  i am coming to the borders and i will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         74682 non-null  int64 
 1   Topic      74682 non-null  object
 2   Sentiment  74682 non-null  object
 3   Text       74682 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB
None


In [4]:
# 7. Save the Cleaned Dataset
# ========================================
cleaned_path = "/content/drive/MyDrive/Project2/clean_data/twitter_training_cleaned.csv"

# to_csv does not create missing folders, so make sure the output directory
# exists (no-op when it is already there).
os.makedirs(os.path.dirname(cleaned_path), exist_ok=True)

# NOTE: empty Text values are written as empty CSV fields; pandas.read_csv
# parses empty fields as NaN by default, so use keep_default_na=False (or
# fillna("")) when loading this file again.
df_twitter_train.to_csv(cleaned_path, index=False)
print(f"\nCleaned Twitter Training dataset saved at: {cleaned_path}")



Cleaned Twitter Training dataset saved at: /content/drive/MyDrive/Project2/clean_data/twitter_training_cleaned.csv
