In [None]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Fetch the NLTK data needed by the tokenizer, stop-word list and lemmatizer
# imported above. Recent NLTK releases make word_tokenize load 'punkt_tab'
# instead of the pickled 'punkt' models, so both are requested to keep the
# notebook runnable on old and new NLTK versions.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
# Load the raw data, drop the 'datetime' column, and keep only samples labelled
# 'depression' or 'neutral' (samples with any other label are removed).
# drop() is used without inplace=True: the in-place form returns None, so
# assigning its result to a variable only produces a misleading None binding.
data = pd.read_csv('data.csv').drop(columns='datetime')
desired_label = ['depression', 'neutral']
df = data[data['label'].isin(desired_label)]
df = df.dropna(subset=['content'])
# Keep only rows whose content is an actual str. The trailing .copy() makes df
# an independent frame, so adding columns to it in later cells does not operate
# on a slice of `data` (which can raise SettingWithCopyWarning).
df = df[df['content'].apply(lambda x: isinstance(x, str))].copy()

In [None]:
#text cleaning
# The stop-word set and the lemmatizer are created on first use and then
# reused; rebuilding them inside every call repeats identical work per row.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return the cached (stop_words, lemmatizer) pair, creating it on first call."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Clean a single text string.

    Steps, in order: lowercase, remove URLs, remove e-mail-like tokens, remove
    standalone numbers, replace non-word characters with spaces, tokenize,
    drop English stop words, lemmatize each remaining token, and join the
    tokens back together with single spaces.

    Args:
        text: The raw string to clean; it must provide ``.lower()``.

    Returns:
        The cleaned text as one space-separated string. It is empty when no
        token survives the filtering.
    """
    # Lowercase conversion
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove Email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove numerics and special characters.
    # \b\d+\b only matches digit runs that stand alone as a word, and \W does
    # not match underscores, so digits inside words and '_' are left in place.
    text = re.sub(r'\b\d+\b', '', text)
    text = re.sub(r'\W', ' ', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords and lemmatize the survivors in one pass
    stop_words, lemmatizer = _text_resources()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # White space trimming
    return ' '.join(tokens).strip()

In [None]:
# Run the text cleaner over every 'content' entry and store the result in a new column.
df['cleaned_content'] = df['content'].map(preprocess_text)

In [None]:
#encode label to be 1 and 0
def encode_label(label):
    """Return 1 when the label equals 'depression', and 0 for any other value."""
    return 1 if label == 'depression' else 0

In [None]:
# Convert every 'label' entry to its 1/0 code and store it in a new column.
df['encoded_label'] = df['label'].map(encode_label)

In [None]:
#save to csv files
# Each column goes to its own file, written without the index.
for column, path in (('cleaned_content', 'x.csv'), ('encoded_label', 'y.csv')):
    df[column].to_csv(path, index=False)

In [None]:
#Handle missing values
# Reload both files, place them side by side, drop every row that has a missing
# value in either column, then overwrite the two files with the remaining rows.
# NOTE(review): presumably the missing values are cleaned texts that came out
# empty and were read back as NaN — confirm.
x, y = (pd.read_csv(source) for source in ('x.csv', 'y.csv'))
df = pd.concat([x, y], axis=1).dropna()
for path, column in {'x.csv': 'cleaned_content', 'y.csv': 'encoded_label'}.items():
    df[column].to_csv(path, index=False)