# Data Formatting, Cleaning and Sizing

In [19]:
import pandas as pd
import sys
import os


# Put the notebook's parent directory on the import path so that
# project-local packages can be imported from inside this notebook.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

# Read the raw CSV and keep a fixed random sample of 10000 rows.
# random_state=42 makes the draw repeatable; the index is renumbered from 0.
data_path = '../data/raw/twcs/twcs.csv'
df = (
    pd.read_csv(data_path)
    .sample(n=10000, random_state=42)
    .reset_index(drop=True)
)

# Missing Value Check

In [20]:
# Count null entries per column, then display only the columns that have any.
missing_counts = df.isna().sum()
missing_counts.loc[missing_counts.gt(0)]

response_tweet_id          3772
in_response_to_tweet_id    2769
dtype: int64

# Category Distribution

In [21]:
# Print the per-value counts only when the frame has a 'category' column;
# when it does not, this cell produces no output.
if 'category' in df:
    print(df['category'].value_counts())

# Tweet Length Feature

In [22]:
# Character count of each tweet; values are cast to str before measuring.
df['tweet_length'] = df['text'].astype(str).str.len()

# Cleaning Tweets

In [23]:
from scripts.preprocess import clean_tweet_text, is_question, get_sentiment_score

# Derive every new column from the raw 'text' column. The question flag and
# the sentiment score are computed first, on the uncleaned text; the cleaned
# text is produced last. Order matches the original cell.
for column, transform in (
    ('is_question', is_question),
    ('sentiment_score', get_sentiment_score),
    ('cleaned_text', clean_tweet_text),
):
    df[column] = df['text'].apply(transform)

# Discard rows whose cleaned text is missing ...
df = df.dropna(subset=['cleaned_text'])
# ... or is empty once surrounding whitespace is stripped.
df = df[df['cleaned_text'].str.strip().ne('')]

# Create Cleaned Tweets CSV File

In [24]:
# Persist the cleaned sample as UTF-8 CSV without the index column.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first (e.g. on a fresh checkout where it has not been created).
output_path = "../data/processed/cleaned_tweets.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False, encoding='utf-8')
print("Cleaned tweets have been saved to data/processed/cleaned_tweets.csv")

Cleaned tweets have been saved to data/processed/cleaned_tweets.csv
