In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import minimize
import re
import nltk
from nltk.tokenize import word_tokenize
import contractions

# Download the 'punkt_tab' NLTK resource (no-op when already up to date).
nltk.download('punkt_tab')

# Read the raw dataset from a CSV file located next to the notebook.
url = './kaggle_sentiment_data.csv'
data = pd.read_csv(url)

# Remove the first column
data = data.drop(data.columns[0], axis=1)
# Discard rows where either the text or its label is missing.
data = data.dropna(subset=['statement', 'status'])

print(data.head())
# Take an explicit copy: selecting columns yields a frame derived from `data`,
# and later cells assign to processed_data["statement"]. Without .copy() those
# assignments raise SettingWithCopyWarning and their effect on the selection
# is not guaranteed.
processed_data = data[["statement", "status"]].copy()

print(processed_data.head())

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/kaylee_bae/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


                                           statement   status
0                                         oh my gosh  Anxiety
1  trouble sleeping, confused mind, restless hear...  Anxiety
2  All wrong, back off dear, forward doubt. Stay ...  Anxiety
3  I've shifted my focus to something else but I'...  Anxiety
4  I'm restless and restless, it's been a month n...  Anxiety
                                           statement   status
0                                         oh my gosh  Anxiety
1  trouble sleeping, confused mind, restless hear...  Anxiety
2  All wrong, back off dear, forward doubt. Stay ...  Anxiety
3  I've shifted my focus to something else but I'...  Anxiety
4  I'm restless and restless, it's been a month n...  Anxiety


In [4]:
import re
import nltk
import pandas as pd
import contractions
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Fetch each NLTK resource requested by this cell, in the original order.
for _resource in (
    'averaged_perceptron_tagger',
    'wordnet',
    'punkt',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_resource)

# Single WordNet lemmatizer instance shared by lemmatize_tokens below.
lemmatizer = WordNetLemmatizer()

# Coerce every "statement" entry to str so the text functions can be applied.
processed_data["statement"] = processed_data["statement"].astype(str)

# Pre-compiled pattern matching http:// and https:// URLs up to the next whitespace.
url_pattern = re.compile(r'https?://\S+')

# Text normalization applied to each raw statement before tokenizing
def clean_text(text):
    """Return a lowercase copy of ``text`` with noise stripped out.

    Steps, in order: expand contractions via ``contractions.fix``, delete
    matches of the module-level ``url_pattern``, delete every character that
    is neither a word character nor whitespace, delete digits, lowercase.

    Args:
        text: The raw statement string.

    Returns:
        The cleaned, lowercased string.
    """
    expanded = contractions.fix(text)
    stripped = url_pattern.sub('', expanded)
    # First drop punctuation/symbols, then digits (same order as before).
    for unwanted in (r'[^\w\s]', r'\d'):
        stripped = re.sub(unwanted, '', stripped)
    return stripped.lower()

# Define function to lemmatize tokens
def lemmatize_tokens(tokens):
    """Lemmatize a tokenized statement using POS-aware WordNet lookups.

    The whole token sequence is tagged with a single ``nltk.pos_tag`` call so
    the tagger sees each word in its sentence context. Tagging words one at a
    time discards that context and costs one tagger call per token.

    Args:
        tokens: Iterable of token strings (e.g. the output of
            ``nltk.word_tokenize``).

    Returns:
        A list of lemmas, one per input token, in the same order.
    """
    # Maps the first letter of the tag returned by nltk.pos_tag to the
    # corresponding WordNet part-of-speech constant.
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    # Convert POS tag to WordNet format
    def get_wordnet_pos(tag):
        # tag[:1] (rather than tag[0]) tolerates an empty tag string;
        # anything unmapped falls back to NOUN, as before.
        return tag_dict.get(tag[:1].upper(), wordnet.NOUN)

    # Materialize once so any iterable (not just a list) can be tagged.
    tokens = list(tokens)

    # Tag the full sequence in one pass, then lemmatize each token with its tag.
    tagged = nltk.pos_tag(tokens)
    lemmas = [lemmatizer.lemmatize(token, get_wordnet_pos(tag)) for token, tag in tagged]
    return lemmas

# Stage 1: normalize each statement, then split it into word tokens.
cleaned_statements = processed_data["statement"].apply(clean_text)
tokenized_statements = cleaned_statements.apply(nltk.word_tokenize)
processed_data["statement"] = tokenized_statements

# Stage 2: replace each token list with its lemmatized counterpart.
lemmatized_statements = processed_data["statement"].apply(lemmatize_tokens)
processed_data["statement"] = lemmatized_statements

# Persist the result (without the index column) and preview the first rows.
processed_data.to_csv('processed_data.csv', index=False)

print(processed_data.head())


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kaylee_bae/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kaylee_bae/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kaylee_bae/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/kaylee_bae/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


                                           statement   status
0                                     [oh, my, gosh]  Anxiety
1  [trouble, sleep, confuse, mind, restless, hear...  Anxiety
2  [wrong, back, off, dear, forward, doubt, stay,...  Anxiety
3  [i, have, shift, my, focus, to, something, els...  Anxiety
4  [i, restless, and, restless, it, be, be, a, mo...  Anxiety
