In [5]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import fasttext
import nltk
from sklearn.model_selection import train_test_split

In [6]:
# Download required NLTK resources.
# - 'punkt' / 'punkt_tab': tokenizer data used by word_tokenize. Newer NLTK
#   releases look up 'punkt_tab' rather than 'punkt', so both are requested to
#   keep the notebook runnable across NLTK versions. A release whose index does
#   not list one of them is expected to report the failed download and return
#   False rather than raise.
# - 'stopwords': the English stop-word list used during preprocessing.
# 'stopwords' stays last so the cell's displayed value is that call's result.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/pranjakt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pranjakt/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
# Read the raw Sentiment140 CSV. The file is read without a header row
# (header=None) and decoded as latin-1.
df = pd.read_csv(
    "sentiment140.csv",
    header=None,
    encoding='latin-1',
)

In [9]:
# Give the six positional columns their names from the dataset description.
SENTIMENT140_COLUMNS = ['target', 'id', 'date', 'flag', 'user', 'text']
df.columns = SENTIMENT140_COLUMNS

In [10]:
# Text preprocessing helpers.
# English stop words as a set, so membership checks are cheap per token.
stop_words = set(stopwords.words('english'))

# Matches every character that is neither a lowercase ASCII letter nor whitespace.
_NON_LETTER = re.compile(r'[^a-z\s]')


def preprocess_text(text):
    """Normalise one text string for fastText.

    Steps, in order: lowercase the text, delete every character that is not
    a-z or whitespace (punctuation, digits, symbols), tokenize with NLTK's
    word_tokenize, drop tokens found in `stop_words`, and join the remaining
    tokens with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, space-separated string (empty if every token is removed).
    """
    letters_only = _NON_LETTER.sub('', text.lower())
    return ' '.join(
        token for token in word_tokenize(letters_only) if token not in stop_words
    )

In [11]:
# Clean every entry of the 'text' column and store the result alongside it.
raw_text = df['text']
df['cleaned_content'] = raw_text.map(preprocess_text)

In [13]:
# Format the labels for FastText
def format_labels(label):
    """Translate a raw target value into a fastText label string.

    Args:
        label: The raw target value; 4, 0 and 2 are recognised.

    Returns:
        "__label__1" for 4 (positive), "__label__0" for 0 (negative),
        "__label__2" for 2 (neutral), and "__label__-1" for anything else.
    """
    # Checked in this order with ==, so the first matching entry wins.
    known_targets = (
        (4, "__label__1"),  # Positive
        (0, "__label__0"),  # Negative
        (2, "__label__2"),  # Neutral
    )
    for raw_value, fasttext_label in known_targets:
        if label == raw_value:
            return fasttext_label
    return "__label__-1"  # If any other label, just in case

# Convert every raw target value into its fastText label string.
raw_targets = df['target']
df['formatted_labels'] = raw_targets.map(format_labels)

In [14]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed seed makes the split reproducible between runs.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
train_df, test_df = train_test_split(
    df,
    random_state=SPLIT_SEED,
    test_size=TEST_FRACTION,
)

In [15]:
# Write the training split for fastText: one example per line, with the
# label string first, a space, then the cleaned text.
with open("train_sentiment140.txt", "w") as f:
    train_rows = zip(train_df['formatted_labels'], train_df['cleaned_content'])
    f.writelines(f"{label} {content}\n" for label, content in train_rows)

In [16]:
# Write the test split for fastText: one example per line, with the
# label string first, a space, then the cleaned text.
with open("test_sentiment140.txt", "w") as f:
    test_rows = zip(test_df['formatted_labels'], test_df['cleaned_content'])
    f.writelines(f"{label} {content}\n" for label, content in test_rows)

### fasttext with bigrams

In [35]:
# Train the FastText model with bigrams
model1 = fasttext.train_supervised(
    input="train_sentiment140.txt",
    dim=10,           # Set embedding dimension to 10
    wordNgrams=2      # Include bigrams
)

# Save the model
model1.save_model("sentiment140_model1.bin")

# Test the FastText model.
# Evaluate the model trained in this cell (model1), not a leftover `model`
# variable from kernel state: on a fresh kernel `model` is undefined, and on a
# stale one it would score a different model than the one saved above.
# The result is indexed as (number of examples, precision, recall).
result = model1.test("test_sentiment140.txt")
print(f"Number of examples: {result[0]}")
print(f"Precision (Accuracy): {result[1]}")
print(f"Recall: {result[2]}")

Read 12M words
Number of words:  677136
Number of labels: 2
Progress: 100.0% words/sec/thread: 5718836 lr:  0.000000 avg.loss:  0.336820 ETA:   0h 0m 0s


Number of examples: 320000
Precision (Accuracy): 0.77985
Recall: 0.77985


### fastText with tuned hyperparameters (dim=300, lr=0.1, epoch=25; `wordNgrams` is not set, so no trigrams are used)

In [36]:
# Train a second FastText model with a larger embedding and more epochs.
# NOTE(review): wordNgrams is not passed here, so fastText's default applies
# (presumably unigrams only) -- confirm whether n-grams were intended.
model2_params = {
    "dim": 300,    # Higher embedding dimension
    "lr": 0.1,     # Learning rate
    "epoch": 25,   # Number of passes over the training file
}
model2 = fasttext.train_supervised(
    input="train_sentiment140.txt",
    **model2_params,
)

# Persist the trained model to disk
model2.save_model("sentiment140_model2.bin")

# Evaluate on the held-out file; the result is indexed as
# (number of examples, precision, recall).
result = model2.test("test_sentiment140.txt")
report_lines = (
    f"Number of examples: {result[0]}",
    f"Precision (Accuracy): {result[1]}",
    f"Recall: {result[2]}",
)
for report_line in report_lines:
    print(report_line)

Read 12M words
Number of words:  677136
Number of labels: 2
Progress: 100.0% words/sec/thread: 2474867 lr:  0.000000 avg.loss:  0.363379 ETA:   0h 0m 0s


Number of examples: 320000
Precision (Accuracy): 0.769878125
Recall: 0.769878125
