In [34]:
import pandas as pd

# Columns the rest of the notebook works with
columns_to_keep = ["title", "description", "features", "main_category"]

# Load the sampled data and keep only those columns, in that order
df = pd.read_csv("sampled_data.csv").loc[:, columns_to_keep]

# Show the first rows of the trimmed DataFrame
print(df.head())

                                               title  \
0  Easyouth Sew in Weft Hair Extensions Human Hai...   
1  NICPOO No Heat Rollers Hair Curlers for Long H...   
2  Cherry Vanilla Float GFHS 2020 Gentle Foaming ...   
3  Anna Belen Girls"Lila" Large Grosgrain Bow Cli...   
4  Snsowed 2 Pack Professional Curved Vented Styl...   

                                         description features main_category  
0                                                 []       []    All Beauty  
1                                                 []       []    All Beauty  
2  ['gentle foaming hand soap with cherry vanilla...       []    All Beauty  
3                                                 []       []    All Beauty  
4                                                 []       []    All Beauty  


In [35]:
# Number of rows loaded
len(df)

8000

In [36]:
# Inspect the dtype of each selected column
df.dtypes

title            object
description      object
features         object
main_category    object
dtype: object

In [37]:
# Cast the three text columns from object to pandas' dedicated string dtype
for text_column in ('description', 'title', 'features'):
    df[text_column] = df[text_column].astype('string')

In [38]:
# Flatten the stringified Python lists (e.g. "['a', 'b']") into plain text.
import ast


def _flatten_list_repr(text):
    """Convert the string form of a Python list into comma-separated text.

    The CSV stores list-valued fields as their ``repr`` (for example
    ``"['soft', \"women's\"]"``). Stripping brackets and single quotes by
    character leaves stray double quotes behind and deletes real apostrophes,
    so the value is parsed as a literal instead.

    Args:
        text: A cell value; a string or a missing value.

    Returns:
        The list items joined with ", " when ``text`` is a list literal
        (an empty list yields ""). Missing values are returned unchanged.
        Any other text falls back to the previous behaviour: surrounding
        "[" / "]" characters are stripped and single quotes are removed.
    """
    if pd.isna(text):
        return text
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a valid Python literal (e.g. an ordinary title).
        parsed = None
    if isinstance(parsed, list):
        return ", ".join(str(item) for item in parsed)
    return text.strip("[]").replace("'", "")


for list_column in ('description', 'title', 'features'):
    # apply() yields an object Series; cast back so the column keeps the string dtype.
    df[list_column] = df[list_column].apply(_flatten_list_repr).astype('string')

In [39]:
# Preview the first rows of 'description' now that the list brackets and quotes are removed
df['description'].head()

0                                                     
1                                                     
2    gentle foaming hand soap with cherry vanilla s...
3                                                     
4                                                     
Name: description, dtype: string

In [40]:
# Keep only rows whose 'description' still has text once surrounding whitespace is trimmed
has_description = df['description'].str.strip() != ""
df = df[has_description]

In [41]:
# Keep only rows whose 'features' text is not an empty string
df = df[(df['features'] != "")]

In [42]:
# Number of rows left after dropping empty descriptions and features
len(df)

2456

In [43]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# Fetch the NLTK resources used below (stopword list, tokenizer, WordNet, POS tagger)
import nltk
for resource_name in ('stopwords', 'punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource_name)

# Map NLTK POS tags to WordNet POS tags
def get_wordnet_pos(tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    Only the first character of the tag is considered: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag (including an
    empty one) is treated as a noun.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Unknown prefixes fall back to noun
    return first_letter_to_pos.get(tag[:1], wordnet.NOUN)

def clean_text_column_with_lemmatization(df, column_name):
    """
    Clean the text in one DataFrame column and lemmatize it using POS tags.

    Each value goes through these steps, in this order:
    - Convert to lowercase
    - Remove URLs
    - Remove HTML tags
    - Remove punctuation
    - Remove numbers
    - Tokenize text
    - Remove English stopwords
    - POS-tag the remaining tokens and lemmatize them
    - Join the lemmas back together with single spaces

    Missing values (None / NaN / pd.NA) are cleaned to an empty string
    rather than being stringified into tokens such as "nan" or "none".

    Args:
    - df (pd.DataFrame): The DataFrame containing the text column. It is
      modified in place.
    - column_name (str): The name of the column to clean.

    Returns:
    - pd.DataFrame: The same DataFrame object, with the column replaced by
      its cleaned text.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # Translation table that deletes punctuation; built once instead of per row.
    punctuation_table = str.maketrans("", "", string.punctuation)

    def clean_text(text):
        # Convert to lowercase
        text = text.lower()
        # Remove URLs
        text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
        # Remove HTML tags
        text = re.sub(r"<.*?>", "", text)
        # Remove punctuation (characters are deleted, not replaced by spaces)
        text = text.translate(punctuation_table)
        # Remove numbers
        text = re.sub(r"\d+", "", text)
        # Tokenize text
        tokens = word_tokenize(text)
        # Remove stopwords
        tokens = [word for word in tokens if word not in stop_words]
        # POS-tag the remaining tokens so each can be lemmatized with the right POS
        pos_tags = pos_tag(tokens)
        tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
        # Join tokens back into a string
        return " ".join(tokens)

    # Replace missing values with "" before stringifying, so that NaN / None
    # do not end up as the words "nan" / "none" in the cleaned text.
    texts = df[column_name].astype(object)
    texts = texts.where(texts.notna(), "")

    # Apply cleaning to the specified column
    df[column_name] = texts.astype(str).apply(clean_text)
    return df

[nltk_data] Downloading package stopwords to /Users/prers/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/prers/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/prers/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/prers/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [44]:
# Inspect 'description' again; the rows that survived filtering keep their original index labels
df['description'].head()

8     The Perfect Woman Toning & Firming Complex is ...
11    Large butterfly clamps which is perfect for se...
18    The new ultimate29 premium professional are to...
23    Grand Parfums Perfume Oils, Body and Incense o...
29    "Design Essentials Natural coconut & Monoid de...
Name: description, dtype: string

In [45]:
# Inspect the first rows of 'features' for the rows that survived filtering
df['features'].head()

8     100% Natural advanced enzyme technology, Lift ...
11    Perfect for separating long or short hair, 12 ...
18    This professional line offers a great mixture ...
23    FIRST QUALITY SCENT & AROMA - PHTHALATE FREE: ...
29    Provides maximum hydration for all day moistur...
Name: features, dtype: string

In [46]:
import pandas as pd

# Count unusable values in 'title' and 'features' before they are concatenated.
# A value counts as empty when it is missing (NA) or blank: the bracket-stripping
# step turns empty lists into "" rather than NA, so isna() alone would miss them.
def _count_empty(series):
    """Return how many entries of `series` are missing or whitespace-only."""
    return int(series.fillna("").astype(str).str.strip().eq("").sum())


empty_title_count = _count_empty(df['title'])
empty_features_count = _count_empty(df['features'])

print(f"Number of empty values in 'title': {empty_title_count}")
print(f"Number of empty values in 'features': {empty_features_count}")

Number of empty values in 'title': 0
Number of empty values in 'features': 0


In [47]:
# Build a single 'content' field: title, features and description joined by " | "
separator = " | "
df['content'] = df['title'] + separator + df['features'] + separator + df['description']

In [48]:
# Clean and lemmatize the combined 'content' column
df = clean_text_column_with_lemmatization(df, 'content')

In [49]:
# Preview the cleaned, lemmatized 'content' text
df['content'].head()

8     bottle perfect woman tone firm complex recomme...
11    softn style large butterfly clamp count perfec...
18    x u dove grey bleach safe salon towel pack pro...
23    grand parfums premium burn fragrance oil egypt...
29    design essential natural coconut monoi deep mo...
Name: content, dtype: object

In [50]:
# Write the cleaned DataFrame to 'cleaned_data.csv' in the working directory.
# NOTE(review): to_csv writes the index by default, so the original row labels
# become an unnamed first column; pass index=False if downstream readers do not need them.
df.to_csv('cleaned_data.csv')