In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
from scipy.stats import ttest_ind
import spacy

import re

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [4]:
# Loading the dataset
df = pd.read_csv('../data/clean/df_0.csv', sep=',')

# Data Preprocessing with spaCy

**Clean the text**

In [8]:
# Preprocess the dataset: removing URLs, mentions and hashtags im 'text' column and convert to lowercase

# Define a function to clean the text
def clean_text(text):
    # Remove URLs, mentions, hashtags, and punctuation
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)     # Remove mentions
    text = re.sub(r'#\w+', '', text)     # Remove hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.lower()  # Convert to lowercase

# Apply the text cleaning function
df['cleaned_text'] = df['text'].apply(clean_text)

# Inspect the cleaned text column
df[['text', 'cleaned_text']].head()

Unnamed: 0,text,cleaned_text
0,"Cooking microwave pizzas, yummy",cooking microwave pizzas yummy
1,Any plans of allowing sub tasks to show up in ...,any plans of allowing sub tasks to show up in ...
2,"I love the humor, I just reworded it. Like sa...",i love the humor i just reworded it like sayi...
3,naw idk what ur talkin about,naw idk what ur talkin about
4,That sucks to hear. I hate days like that,that sucks to hear i hate days like that


**Tokenization and Lemmatization with spaCy**
 
 This step will break the text into individual words (tokens) and convert each word to its base form (lemma)

In [9]:
# Load spaCy's small English model
nlp = spacy.load('en_core_web_sm')

# Function to tokenize and lemmatize text
def tokenize_and_lemmatize(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc if not token.is_stop]  # Lemmatize and remove stopwords

# Apply the tokenization and lemmatization function
df['tokens'] = df['cleaned_text'].apply(tokenize_and_lemmatize)

# Inspect the tokenized and lemmatized text
print(df[['cleaned_text', 'tokens']].head())

                                        cleaned_text  \
0                     cooking microwave pizzas yummy   
1  any plans of allowing sub tasks to show up in ...   
2   i love the humor i just reworded it like sayi...   
3                       naw idk what ur talkin about   
4           that sucks to hear i hate days like that   

                                              tokens  
0                    [cook, microwave, pizza, yummy]  
1                   [plan, allow, sub, task, widget]  
2  [ , love, humor, reword, like, say, group, the...  
3                          [ , naw, idk, ur, talkin]  
4                   [ , suck, hear, hate, day, like]  


In [10]:
# Save the cleaned and tokenized data to a new CSV file
df.to_csv('preprocessed_data.csv', index=False)