In [1]:
import fasttext
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# Load your training and testing datasets
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Function for text preprocessing
def preprocess_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', ' ', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Create a copy of the original text for later use
train_data['original_text'] = train_data['text']
test_data['original_text'] = test_data['text']

# Preprocess the text data
train_data['text'] = train_data['text'].apply(preprocess_text)
test_data['text'] = test_data['text'].apply(preprocess_text)

# Remove stop words
stop_words = set(stopwords.words('english'))
train_data['text'] = train_data['text'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))
test_data['text'] = test_data['text'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

# Optional: Label Mapping
label_mapping = {
    0: 'Bad', 
    1: 'Bad', 
    2: 'Okay', 
    3: 'Good', 
    4: 'Good'
}

# Map labels if needed
train_data['label'] = train_data['label'].map(label_mapping)
test_data['label'] = test_data['label'].map(label_mapping)

# Function to format data for fastText
def format_data(df):
    formatted_data = []
    for _, row in df.iterrows():
        formatted_data.append(f"__label__{row['label']} {row['text']}")
    return "\n".join(formatted_data)

# Format the training and test data
train_text = format_data(train_data)
test_text = format_data(test_data)

# Save formatted data to text files
with open("train.txt", "w") as f:
    f.write(train_text)

with open("test.txt", "w") as f:
    f.write(test_text)

# Train the FastText model with hierarchical softmax
model = fasttext.train_supervised(input="train.txt", epoch=20, lr=0.5, wordNgrams=2, dim=10, loss='hs')

# Evaluate the model on the test dataset
result = model.test("test.txt")

# Print performance metrics
print(f"Number of examples: {result[0]}")
print(f"Precision: {result[1]}")

# Save the model for future use
model.save_model("model_fasttext.bin")


Number of examples: 50000
Precision: 0.76742


In [2]:
# Show examples from the test dataset with their predicted sentiments
sample_size = 5  # Number of samples to show
sample_data = test_data.sample(sample_size)  # Randomly sample from the test set

# Prepare a list to store results
results = []

for index, row in sample_data.iterrows():
    predicted_label = model.predict(row['text'])[0][0].replace("__label__", "")  # Predict on preprocessed text
    results.append((row['original_text'], predicted_label))  # Keep original text for output

# Print results
for original_text, predicted_sentiment in results:
    print(f"Original Text: {original_text}\nPredicted Sentiment: {predicted_sentiment}\n")


Original Text: 1 star - just because I love the NYC location and the frozen hot chocolate is no other\n\n1 star - because our waiter was very helpful\n\n1 start - the food was tasty\n\nI should really give minus 10 stars for our moody, rude hostess..  Ok look, my coworker isn't that concious of customer service and yes he gave you the glare down - but you were taking FOREVER to seat us.  She said that we were \"next\" and then seated 4 other couples before us. Not to mention that the table we ended up sitting at was empty for no joke 15 minutes before we were sat there.  She obviously was doing it on purpose and the owners should fire her.  \n\nIt's your job to put on a smile and give the customers great service and she failed.  As for the food, my coworker got a foot long chili hot dog and liked it.  I got the spaghetti and meatballs and the meatballs are made with veal, pork and beef and are huge.  Very tasty.  Thank you to our waiter for his great service.
Predicted Sentiment: Okay


In [3]:
# Load your training and testing datasets
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Function for text preprocessing
def preprocess_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', ' ', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Create a copy of the original text for later use
train_data['original_text'] = train_data['text']
test_data['original_text'] = test_data['text']

# Preprocess the text data
train_data['text'] = train_data['text'].apply(preprocess_text)
test_data['text'] = test_data['text'].apply(preprocess_text)

# Remove stop words
stop_words = set(stopwords.words('english'))
train_data['text'] = train_data['text'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))
test_data['text'] = test_data['text'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

# Optional: Label Mapping
label_mapping = {
    0: 'Bad', 
    1: 'Bad', 
    2: 'Okay', 
    3: 'Good', 
    4: 'Good'
}

# Map labels if needed
train_data['label'] = train_data['label'].map(label_mapping)
test_data['label'] = test_data['label'].map(label_mapping)

# Function to format data for fastText
def format_data(df):
    formatted_data = []
    for _, row in df.iterrows():
        formatted_data.append(f"__label__{row['label']} {row['text']}")
    return "\n".join(formatted_data)

# Format the training and test data
train_text = format_data(train_data)
test_text = format_data(test_data)

# Save formatted data to text files
with open("train.txt", "w") as f:
    f.write(train_text)

with open("test.txt", "w") as f:
    f.write(test_text)

# Train the FastText model with hierarchical softmax
model = fasttext.train_supervised(input="train.txt", epoch=20, lr=0.5, wordNgrams=1, dim=10, loss='hs')

# Evaluate the model on the test dataset
result = model.test("test.txt")

# Print performance metrics
print(f"Number of examples: {result[0]}")
print(f"Precision: {result[1]}")

# Save the model for future use
model.save_model("model_fasttext.bin")


Number of examples: 50000
Precision: 0.7664


In [4]:
# Show examples from the test dataset with their predicted sentiments
sample_size = 5  # Number of samples to show
sample_data = test_data.sample(sample_size)  # Randomly sample from the test set

# Prepare a list to store results
results = []

for index, row in sample_data.iterrows():
    predicted_label = model.predict(row['text'])[0][0].replace("__label__", "")  # Predict on preprocessed text
    results.append((row['original_text'], predicted_label))  # Keep original text for output

# Print results
for original_text, predicted_sentiment in results:
    print(f"Original Text: {original_text}\nPredicted Sentiment: {predicted_sentiment}\n")

Original Text: Think this was another instance where raving yelp reviews set my expectations too high. I think its a cute place, I have no problem w it being in a strip mall. I think its really odd to have a bar AND a bartender when you only serve beer and wine. Our server was nice, but nothing exceptional. We were waiting for another person to join us, so after getting our bread and drinks we were basically ignored until she showed up. I tried to wave down the waiter a couple times to order an appetizer but he just flew by. \nThe freshly baked bread was great but like someone else mentioned, it was in sections and some were toasted to sh-t on the outside - basically big croutons. We started w the mussels which were just ok, definitely not worth $15 for 6 mussels w panko on top. It was an excessive wait until we got our salads. I went w the Caesar - awesome dressing w a kick to it.  I got the rigatoni w meatball. Huge portion of pasta and a big meatball for an additional $6. Great dish