# Introduction
This script processes raw tweet data to prepare it for analysis by cleaning and standardizing the text. It removes unwanted elements such as hashtags, mentions, links, and non-English words, while also converting text to lowercase and eliminating duplicates. Using NLTK, it filters out stopwords and applies stemming to reduce words to their root forms for more effective analysis. The cleaned tweets are then organized into a Pandas DataFrame, with unnecessary columns removed, and exported as a CSV file (all_data.csv).


In [2]:
import json
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

# Download the NLTK stopwords corpus so stopwords.words() can read it.
nltk.download('stopwords')


# Shared resources used by process_text: a Porter stemmer and the set of
# English stopwords (a set gives constant-time membership checks).
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Patterns used by clean_text, compiled once when the module/cell is loaded.
_LINK_RE = re.compile(r'(?:https?://|www\.)\S+')  # URLs
_MENTION_RE = re.compile(r'@\S+')                  # @username mentions
_HASHTAG_RE = re.compile(r'#\w+')                  # #hashtags
_DISALLOWED_RE = re.compile(r'[^\w\s,.]')          # everything except word chars, whitespace, ',' and '.'
_WHITESPACE_RE = re.compile(r'\s+')                # runs of spaces, tabs, newlines


# Function to clean text
def clean_text(text):
    """Normalize a raw tweet/comment string for downstream token processing.

    Steps, in order:
      1. Lowercase the text.
      2. Remove links, @mentions and #hashtags.
      3. Remove every character that is not a word character, whitespace,
         a comma or a period (hyphens are deleted, so "well-known" becomes
         "wellknown").
      4. Collapse whitespace runs to a single space and trim both ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    # Lowercase first so that link detection is case-insensitive.
    text = text.lower()

    # Links, mentions and hashtags must be removed BEFORE special characters
    # are stripped: stripping deletes '@', '#', ':' and '/', after which these
    # patterns can no longer recognise their targets and the bare username or
    # tag would leak into the output.
    text = _LINK_RE.sub('', text)
    text = _MENTION_RE.sub('', text)
    text = _HASHTAG_RE.sub('', text)

    # Drop remaining punctuation/symbols (quotes and hyphens included).
    text = _DISALLOWED_RE.sub('', text)

    # Normalise whitespace last so the gaps left by the removals above
    # do not survive as double spaces or leading/trailing blanks.
    text = _WHITESPACE_RE.sub(' ', text).strip()
    return text

def remove_non_english_words(text):
    """Keep only tokens made entirely of ASCII letters, joined by single spaces.

    A token is a maximal run of a-z/A-Z bounded by word boundaries, so tokens
    that touch digits, underscores or non-ASCII letters (e.g. "abc123",
    "ai_ethics", "café") are dropped completely.

    Args:
        text: The string to filter.

    Returns:
        The surviving tokens separated by one space ('' if none survive).
    """
    matches = re.finditer(r'\b[a-zA-Z]+\b', text)
    return ' '.join(match.group() for match in matches)

def process_text(text):
    """Drop stopwords from *text* and stem the remaining words.

    The text is split on whitespace; tokens found in the module-level
    ``stop_words`` set are discarded (the lookup is case-sensitive), and each
    remaining token is passed through the module-level ``stemmer``.

    Args:
        text: The string to process.

    Returns:
        The stemmed, stopword-free tokens joined by single spaces.
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return ' '.join(map(stemmer.stem, kept_tokens))

# Load the raw tweets (a JSON file parsed into `data`)
data = []
with open('../data/data-raw/raw_tweets.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Run each tweet's text through the pipeline:
# clean -> keep alphabetic tokens -> remove stopwords and stem
for tweet in data:
    tweet['text'] = process_text(
        remove_non_english_words(
            clean_text(tweet['text'])
        )
    )

# Build the DataFrame, discard the 'created_at' column, and keep only the
# first row for each distinct cleaned text
df = (
    pd.DataFrame(data)
    .drop(columns=['created_at'])
    .drop_duplicates(subset=['text'])
)

# Write the result to CSV without the index column
df.to_csv('all_data.csv', index=False, encoding='utf-8')

print("Data cleaned and written to all_data.csv")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xunlei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load the training data
df = pd.read_csv('train.csv')

# Keep only rows whose comment text is present and non-empty
has_comment = df['comment_text'].notna() & (df['comment_text'] != '')
df = df[has_comment]

# Fill remaining missing values with 0 and make sure the comments are strings
df = df.fillna(0)
df['comment_text'] = df['comment_text'].astype(str)

# Apply the same three-stage pipeline used for the tweets, in order
for stage in (clean_text, remove_non_english_words, process_text):
    df['comment_text'] = df['comment_text'].apply(stage)

# Write the cleaned data to CSV without the index column
df.to_csv('cleaned_train.csv', index=False, encoding='utf-8')

print("Data cleaned and written to cleaned_train.csv")


Data cleaned and written to cleaned_train.csv
