In [1]:
# ========================================
# 1. Mount Google Drive
# ========================================
# google.colab is only importable inside a Colab runtime, so this cell is Colab-specific.
from google.colab import drive
# Mount Drive at /content/drive; the dataset and output paths used below live under this mount point.
drive.mount('/content/drive')

# ========================================


Mounted at /content/drive


In [2]:
# 2. Import Required Libraries
# ========================================
import pandas as pd
import re

# ========================================
# 3. Load the Twitter US Airline Dataset
# ========================================
# Raw CSV location on the mounted Drive.
dataset_path = "/content/drive/MyDrive/Project2/raw data/twitter - us airline.csv"
df_airline = pd.read_csv(filepath_or_buffer=dataset_path, low_memory=False)

# First look at what was loaded: dimensions, column names, and the leading rows.
print("Initial DataFrame Shape:", df_airline.shape)
print("Columns:", list(df_airline.columns))
print(df_airline.head(5))

# ========================================
# 4. Define a Text Cleaning Function
# ========================================
def clean_text(text):
    """
    Basic cleaning: lowercases the text and collapses whitespace.

    Parameters
    ----------
    text : Any
        Value to clean. Non-string values are converted with ``str()``.
        Missing values (``None`` or a float NaN) are treated as empty text
        instead of being stringified to "none" / "nan".

    Returns
    -------
    str
        Lowercased text in which every run of whitespace is a single space
        and leading/trailing whitespace is removed; "" for missing input.

    You can extend this function to remove URLs, punctuation, etc.
    """
    # NaN is the only float that is not equal to itself; this avoids an extra
    # import and also covers numpy float NaNs (they subclass float).
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()                  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)     # Normalize whitespace
    return text.strip()

# ========================================



Initial DataFrame Shape: (14640, 15)
Columns: ['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence', 'negativereason', 'negativereason_confidence', 'airline', 'airline_sentiment_gold', 'name', 'negativereason_gold', 'retweet_count', 'text', 'tweet_coord', 'tweet_created', 'tweet_location', 'user_timezone']
             tweet_id airline_sentiment  airline_sentiment_confidence  \
0  570306133677760513           neutral                        1.0000   
1  570301130888122368          positive                        0.3486   
2  570301083672813571           neutral                        0.6837   
3  570301031407624196          negative                        1.0000   
4  570300817074462722          negative                        1.0000   

  negativereason  negativereason_confidence         airline  \
0            NaN                        NaN  Virgin America   
1            NaN                     0.0000  Virgin America   
2            NaN                        NaN  Virgin A

In [3]:
# 5. Clean and Preprocess Key Columns
# ========================================

# A) Text Column (e.g., 'text')
# Missing tweets become "" before cleaning so they never turn into placeholder text.
if "text" in df_airline.columns:
    df_airline["text"] = df_airline["text"].fillna("").apply(clean_text)

# B) Convert Datetime Column (e.g., 'tweet_created')
# errors='coerce' turns unparseable timestamps into NaT instead of raising.
if "tweet_created" in df_airline.columns:
    df_airline["tweet_created"] = pd.to_datetime(df_airline["tweet_created"], errors='coerce')

# C) Convert Numeric Columns (e.g., 'retweet_count', 'airline_sentiment_confidence', 'negativereason_confidence')
# Unparseable or missing entries end up as 0 after coercion + fillna.
numeric_cols = ["retweet_count", "airline_sentiment_confidence", "negativereason_confidence"]
for col in numeric_cols:
    if col in df_airline.columns:
        df_airline[col] = pd.to_numeric(df_airline[col], errors='coerce').fillna(0)

# D) Standardize String Columns (e.g., 'airline', 'name', 'tweet_location', 'user_timezone')
string_cols = ["airline", "name", "tweet_location", "user_timezone"]
for col in string_cols:
    if col in df_airline.columns:
        raw_values = df_airline[col]
        normalized = raw_values.astype(str).str.strip().str.lower()
        # astype(str) stringifies NaN to the literal "nan"; put the missing
        # marker back so absent locations/timezones are not mistaken for data.
        df_airline[col] = normalized.where(raw_values.notna())

# E) Fill Missing Values in Categorical Columns (e.g., 'negativereason')
if "negativereason" in df_airline.columns:
    df_airline["negativereason"] = df_airline["negativereason"].fillna("none").str.lower()

# ========================================
# 6. Inspect the Data After Preprocessing
# ========================================
print("\nDataFrame After Preprocessing:")
print(df_airline.head())
print("\nDataFrame Info:")
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() only appended a stray "None" line to the output.
df_airline.info()



DataFrame After Preprocessing:
             tweet_id airline_sentiment  airline_sentiment_confidence  \
0  570306133677760513           neutral                        1.0000   
1  570301130888122368          positive                        0.3486   
2  570301083672813571           neutral                        0.6837   
3  570301031407624196          negative                        1.0000   
4  570300817074462722          negative                        1.0000   

  negativereason  negativereason_confidence         airline  \
0           none                     0.0000  virgin america   
1           none                     0.0000  virgin america   
2           none                     0.0000  virgin america   
3     bad flight                     0.7033  virgin america   
4     can't tell                     1.0000  virgin america   

  airline_sentiment_gold        name negativereason_gold  retweet_count  \
0                    NaN     cairdin                 NaN              0   


In [4]:
# ========================================
# 7. Save the Cleaned Dataset
# ========================================
# Destination for the preprocessed frame on the mounted Drive.
cleaned_path = "/content/drive/MyDrive/Project2/clean_data/twitter_us_airline_cleaned.csv"
# index=False keeps the positional row index out of the written CSV.
df_airline.to_csv(path_or_buf=cleaned_path, index=False)
print(f"\nCleaned Twitter US Airline dataset saved at: {cleaned_path}")



Cleaned Twitter US Airline dataset saved at: /content/drive/MyDrive/Project2/clean_data/twitter_us_airline_cleaned.csv
