In [2]:
import pandas as pd 

In [3]:
# Raw sample sentences. They deliberately mix capitalisation, punctuation,
# digits, an emoji and bracketed citations so the cleaning steps have work to do.
data = [
    "When life gives you lemons, make lemonade! 🙂",
    "She bought 2 lemons for $1 at Maven Market.",
    "A dozen lemons will make a gallon of lemonade. [AllRecipes]",
    "lemon, lemon, lemons, lemon, lemon, lemons",
    "He's running to the market to get a lemon — there's a great sale today.",
    "Does Maven Market carry Eureka lemons or Meyer lemons?",
    "An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]",
    "iced tea is my favorite",
]

In [4]:

# Wrap the raw sentences in a one-column DataFrame (column name: 'sentence')

data_df = pd.DataFrame({'sentence': data})


In [5]:

# Show every cell's text in full: None removes the column-width truncation limit

pd.options.display.max_colwidth = None


In [6]:

# Build the working frame for spaCy processing. assign() returns a new
# DataFrame, so data_df keeps only the raw text, while spacy_df gains a
# lowercased version of each sentence in 'clean_sentence'.

spacy_df = data_df.assign(clean_sentence=data_df['sentence'].str.lower())


In [7]:

# Remove specific citations
# regex=False makes '[wikipedia]' a literal substring. Without it, pandas
# versions whose default is regex=True read the brackets as a character class
# and delete every w/i/k/p/e/d/a character instead of the citation itself.

spacy_df['clean_sentence'] = spacy_df['clean_sentence'].str.replace('[wikipedia]', '', regex=False)



# Advanced cleaning with regex
# Alternatives, tried left to right: http(s) URLs, www. links, <...> tags,
# e-mail addresses, @mentions, #hashtags, then any single character that is
# not an ASCII letter, digit or whitespace. Every match becomes one space.

combined = r'https?://\S+|www\.\S+|<.*?>|\S+@\S+\.\S+|@\w+|#\w+|[^A-Za-z0-9\s]'

spacy_df['clean_sentence'] = spacy_df['clean_sentence'].str.replace(combined, ' ', regex=True)

# The substitutions above leave runs of spaces behind: collapse each run to a
# single space and trim both ends.

spacy_df['clean_sentence'] = spacy_df['clean_sentence'].str.replace(r'\s+', ' ', regex=True).str.strip()


  spacy_df['clean_sentence'] = spacy_df['clean_sentence'].str.replace('[wikipedia]', '')


In [9]:

import spacy



# Download and install English language model

!python -m spacy download en_core_web_sm



# Load the pre-trained pipeline

nlp = spacy.load('en_core_web_sm')



# Process a sample sentence

phrase = spacy_df.clean_sentence[0] # "when life gives you lemons make lemonade"

doc = nlp(phrase)


ModuleNotFoundError: No module named 'spacy'

In [None]:

# Extract tokens as text strings
# display() is needed here: a notebook cell only echoes its LAST bare
# expression, so without it this list would be built and silently discarded.

display([token.text for token in doc])

# Output: ['when', 'life', 'gives', 'you', 'lemons', 'make', 'lemonade']



# Extract tokens as spaCy objects (with linguistic attributes)
# Iterating a Doc yields its Token objects; this is the cell's last
# expression, so it is echoed automatically.

list(doc)

# Output: [when, life, gives, you, lemons, make, lemonade]


In [None]:

# Lemma (base form) of every token in the processed sentence

[tok.lemma_ for tok in doc]

# Output: ['when', 'life', 'give', 'you', 'lemon', 'make', 'lemonade']


In [None]:

# Preview the English stop words in spaCy
# Only the last bare expression of a cell is echoed, so every intermediate
# result in this cell is shown explicitly with display().
# A sorted preview is displayed instead of the full collection to keep the
# output short; drop the slice to see every entry.

display(sorted(nlp.Defaults.stop_words)[:20])

print(f"Total stop words: {len(nlp.Defaults.stop_words)}") # 326 stop words



# Remove stop words

display([token for token in doc if not token.is_stop])

# Output: [life, gives, lemons, lemonade]



# Combine lemmatization and stop word removal
# Computed once and reused below for the joined sentence.

norm = [token.lemma_ for token in doc if not token.is_stop]

display(norm)

# Output: ['life', 'give', 'lemon', 'lemonade']



# Convert back to sentence format

' '.join(norm) # Output: 'life give lemon lemonade'


In [None]:

# Function for lemmatization and stop word removal

def token_lemma_stopw(text):
    """Lemmatize ``text`` and drop stop words, returning one normalized string.

    Relies on the notebook-level spaCy pipeline ``nlp`` being loaded.

    Parameters
    ----------
    text : str
        The sentence to process.

    Returns
    -------
    str
        Lemmas of the kept tokens, joined by single spaces.
    """
    doc = nlp(text)

    # Whitespace tokens (spaCy emits them for runs of spaces) are not stop
    # words, so they are excluded explicitly to keep stray blanks out of the
    # joined result.
    output = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_space)
    ]

    return ' '.join(output)



# Run the lemmatize + stop-word step on every cleaned sentence

spacy_df['clean_sentence'].apply(token_lemma_stopw)


In [None]:

def lower_replace(series):
    """Lowercase a Series of strings and strip noise with a regex.

    Parameters
    ----------
    series : pandas.Series
        Text values; accessed through the ``.str`` accessor.

    Returns
    -------
    pandas.Series
        Lowercased text in which URLs, www. links, <...> tags, e-mail
        addresses, @mentions, #hashtags and any character that is not an
        ASCII letter, digit or whitespace have been replaced by a space,
        with whitespace runs collapsed and both ends trimmed.
    """
    output = series.str.lower()

    combined = r'https?://\S+|www\.\S+|<.*?>|\S+@\S+\.\S+|@\w+|#\w+|[^A-Za-z0-9\s]'

    output = output.str.replace(combined, ' ', regex=True)

    # Each substitution leaves a space behind; collapse the resulting runs and
    # trim the ends so downstream tokenization never sees blank tokens.
    output = output.str.replace(r'\s+', ' ', regex=True).str.strip()

    return output



def nlp_pipeline(series):
    """Run the full text-preprocessing pipeline on a Series of raw sentences.

    Applies ``lower_replace`` to the whole Series, then ``token_lemma_stopw``
    to each element.

    Parameters
    ----------
    series : pandas.Series
        Raw text values.

    Returns
    -------
    pandas.Series
        One processed string per input row.
    """
    output = lower_replace(series)

    output = output.apply(token_lemma_stopw)

    return output



# Run the whole preprocessing pipeline on the raw sentences

cleaned_text = nlp_pipeline(data_df['sentence'])



# Persist the processed Series so later steps can reload it

cleaned_text.to_pickle('preprocessed_text.pkl')


In [None]:

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer



# Reload the preprocessed text saved by the pipeline step.
# NOTE: unpickling executes code from the file, so only load pickles you trust.

series = pd.read_pickle('preprocessed_text.pkl')



# Bag-of-words with default settings: fit the vocabulary and count terms per document

cv = CountVectorizer()

bow = cv.fit_transform(series)



# Densify the count matrix and label the columns with the learned vocabulary

pd.DataFrame(data=bow.toarray(), columns=cv.get_feature_names_out())


In [None]:

# Bag-of-words again, this time with vocabulary filtering

cv1 = CountVectorizer(
    stop_words='english',  # drop scikit-learn's built-in English stop words
    ngram_range=(1, 1),    # single words only (unigrams)
    min_df=2,              # keep terms found in at least 2 documents
)



bow1 = cv1.fit_transform(series)

bow1_df = pd.DataFrame(data=bow1.toarray(), columns=cv1.get_feature_names_out())



# Total count of each kept term across all documents (column-wise sum)

term_freq = bow1_df.sum(axis=0)


In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer



# TF-IDF with default settings

tv = TfidfVectorizer()

tvidf = tv.fit_transform(series)

tvidf_df = pd.DataFrame(data=tvidf.toarray(), columns=tv.get_feature_names_out())



# TF-IDF restricted to terms that occur in at least 2 documents

tv1 = TfidfVectorizer(min_df=2)

tvidf1 = tv1.fit_transform(series)

tvidf1_df = pd.DataFrame(data=tvidf1.toarray(), columns=tv1.get_feature_names_out())


In [None]:

# TF-IDF over unigrams AND bigrams: ngram_range=(1, 2) keeps single words
# as well as pairs of consecutive words

tv2 = TfidfVectorizer(ngram_range=(1, 2))

tvidf2 = tv2.fit_transform(series)

tvidf2_df = pd.DataFrame(data=tvidf2.toarray(), columns=tv2.get_feature_names_out())



# Rank features by their summed TF-IDF weight across all documents, highest first

tvidf2_df.sum(axis=0).sort_values(ascending=False)
