In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
import spacy

### Data

In [4]:
# Load data

# Build the path from the current user's home directory instead of hard-coding
# one machine's absolute path, so the notebook runs under any account that
# keeps the same Documents/BC_Folder/NLP/Data layout.
DATA_PATH = Path.home() / "Documents" / "BC_Folder" / "NLP" / "Data" / "Sneaker_Buy_Suggestions_WSJ.txt"

# NOTE: read_csv treats the first line of the file as the header, so the
# article title becomes the column name instead of a data row.
shoes_df = pd.read_csv(DATA_PATH, sep="\t", encoding="mac_roman")

In [5]:
# Data dimensions as (rows, columns)

shoes_df.shape

(13, 1)

In [6]:
# Display the raw dataframe; its only column is still named after the article title

shoes_df

Unnamed: 0,"The Best Fashion Sneakers to Upgrade Your Wardrobe, According to Stylists"
0,"Step up your style with these luxe lace-ups, s..."
1,Written By Karen I. Chen Edited By Madeline Di...
2,Gone are the days of sneakers as footwear sole...
3,The Samba OGs have been around for decades. In...
4,"Washington, D.C.-based fashion stylist Naina S..."
5,A classic white low-top never goes out of styl...
6,“If you're a fan of classic white sneakers but...
7,If you're looking for a chunkier look to contr...
8,Another pair that Silberstein reports is the “...
9,The Reebok Club C 85s are another vintage-insp...


### Data Processing

In [8]:
# Rename the single column to 'text' (it was loaded with the article title as its name)

shoes_df.columns = ['text']

In [9]:
# Load the SpaCy pre-trained model (small).
# `nlp` is a module-level name: extract_entities() reads it at call time, and
# the medium/large model cells rebind it to switch pipelines.

nlp = spacy.load("en_core_web_sm")

# Function to extract named entities

def extract_entities(text, model=None):
    """Collect the PERSON, ORG, PRODUCT and MONEY entities found in ``text``.

    Parameters
    ----------
    text : str
        Text to analyse. ``None`` and float NaN (what pandas uses for a
        missing cell) are treated as "no text" and yield empty lists.
    model : callable, optional
        Pipeline to run on ``text``; it must return an object whose ``ents``
        items expose ``label_`` and ``text``. When omitted, the module-level
        ``nlp`` bound at call time is used, which is the original behaviour.

    Returns
    -------
    dict
        ``{"PERSON": [...], "ORG": [...], "PRODUCT": [...], "MONEY": [...]}``
        holding entity texts in document order; duplicates are kept.
    """
    # Individual, Organization/Company, Product, Monetary amount
    entities = {"PERSON": [], "ORG": [], "PRODUCT": [], "MONEY": []}

    # Missing cells: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return entities

    pipeline = nlp if model is None else model
    doc = pipeline(text)

    for ent in doc.ents:
        # Only the labels pre-seeded above are collected; all others are skipped.
        if ent.label_ in entities:
            entities[ent.label_].append(ent.text)

    return entities

#### Initialize Dataframes For Different Model Sizes

In [11]:
# Give each model size its own working frame holding just the 'text' column.
# Every frame is an independent copy, so adding an 'entities' column to one
# does not touch shoes_df or the other two frames.

shoes_df_sm_model = shoes_df[['text']].copy()

shoes_df_md_model = shoes_df[['text']].copy()

shoes_df_lg_model = shoes_df[['text']].copy()

### Models
#### Small Model

In [13]:
# Apply entity extraction function (small model)

# Bind the small pipeline in this cell rather than relying on whichever model
# `nlp` currently points to: the medium and large cells rebind `nlp`, so
# re-running this cell after them would otherwise silently use the wrong model.

nlp = spacy.load("en_core_web_sm")

shoes_df_sm_model['entities'] = shoes_df_sm_model['text'].apply(extract_entities)

# Expand the entities into separate columns for each type

entities_df = pd.json_normalize(shoes_df_sm_model['entities'])

# Results table: entity columns only (the original text is not included).
# Copy so that later edits to result_df cannot alter entities_df.

result_df = entities_df.copy()

result_df

Unnamed: 0,PERSON,ORG,PRODUCT,MONEY
0,[],[],[],[]
1,"[Karen I. Chen, Madeline Diamond]",[],[],[]
2,[Lifestyle],[],[],[]
3,"[Caroline Baudino, Sambas, Sambas, Baudino, Sa...",[],[],[100]
4,"[Naina Singla, Sambas, Baudino, Handball Spezi...",[Original Handball Spezials],[],[110]
5,[Cariuma OCA Lows],[Cariuma],[],[89]
6,"[Singla, Singla]",[Super Team Suede Sneakers],[],[110]
7,"[Baudino, Raina Silberstein]",[],[530s],[100]
8,[],[the New Balance],[],[150]
9,[Patrick Kenger],"[Reebok Club, Reebok]",[],[90]


#### Medium Model

In [15]:
# Switch the shared pipeline to the medium English model; extract_entities()
# reads it through the module-level `nlp`.

nlp = spacy.load("en_core_web_md")

# Tag every row of text with the medium model

shoes_df_md_model['entities'] = shoes_df_md_model['text'].apply(extract_entities)

# One column per entity type (PERSON, ORG, PRODUCT, MONEY)

entities_df = pd.json_normalize(shoes_df_md_model['entities'])

# Results table: an independent copy of the entity columns only

result_df = entities_df.copy()

result_df

Unnamed: 0,PERSON,ORG,PRODUCT,MONEY
0,[],[],[],[]
1,"[Karen I. Chen, Madeline Diamond]",[],[],[]
2,[],[],[],[]
3,"[Caroline Baudino, Sambas, Baudino]",[Samba OG Shoes],[],[100]
4,"[Naina Singla, Adidas, Sambas, Baudino, Spezia...",[],[],[110]
5,"[Singla, Singla]",[Cariuma],[],[89]
6,"[Singla, Singla]",[],[],[110]
7,"[Baudino, Raina Silberstein]",[the unisex New Balance],[],[100]
8,[],[Silberstein],[],[150]
9,[Patrick Kenger],[Reebok],[],[90]


#### Large Model

In [17]:
# Switch the shared pipeline to the large English model; extract_entities()
# reads it through the module-level `nlp`.

nlp = spacy.load("en_core_web_lg")

# Tag every row of text with the large model

shoes_df_lg_model['entities'] = shoes_df_lg_model['text'].apply(extract_entities)

# One column per entity type (PERSON, ORG, PRODUCT, MONEY)

entities_df = pd.json_normalize(shoes_df_lg_model['entities'])

# Results table: an independent copy of the entity columns only

result_df = entities_df.copy()

result_df

Unnamed: 0,PERSON,ORG,PRODUCT,MONEY
0,[],[],[],[]
1,"[Karen I. Chen, Madeline Diamond]",[],[],[]
2,[],[],[],[]
3,"[Caroline Baudino, Sambas, Sambas, Baudino]","[Samba OG Shoes, Adidas]",[],[100]
4,"[Naina Singla, Sambas, Baudino]","[Adidas, Original Handball Spezials, Handball ...",[],[110]
5,"[Singla, Cariuma]",[],[],[89]
6,"[Singla, Singla]","[Puma Super Teams, Super Team Suede Sneakers, ...",[],[110]
7,"[Baudino, Raina Silberstein]",[],[530s],[100]
8,[Silberstein],[],[],"[9060 Sneakers, 150]"
9,[Patrick Kenger],"[Reebok Club, EVA, Reebok]",[],[90]


Among the different model sizes tested, the medium-sized model performed best for my chosen task. Compared to the small model, it was significantly better at recognizing companies without mistakenly categorizing product names under "ORG." It also excelled at identifying the stylists quoted in the article for purchase suggestions, though it occasionally misclassified brand names and product names (e.g., "Adidas" and "Sambas") as stylists.

While the large model showed some improvement in recognizing stylist names, it introduced errors by misplacing product names in the "MONEY" column and adding redundant information to the company column. These mistakes outweighed its slight advantage in name recognition. Given this tradeoff, the medium-sized model provided the best balance of accuracy and reliability.