In [1]:
import pickle
import re
from pathlib import Path

# nltk
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# sklearn
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from tqdm import tqdm

In [2]:
# Get the dataset here: https://www.kaggle.com/datasets/kazan...
DATASET_COLUMNS = ["sentiment", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"

# The CSV has no header row, so column names are supplied explicitly.
# Only the label and the tweet body are used afterwards; `usecols` skips
# parsing the other four columns instead of loading and then discarding them.
# The trailing selection pins the column order.
dataset = pd.read_csv(
    "data/training.1600000.processed.noemoticon.csv",
    encoding=DATASET_ENCODING,
    names=DATASET_COLUMNS,
    usecols=["sentiment", "text"],
)[["sentiment", "text"]]

# Replacing the values to ease understanding: the label 4 becomes 1, which is
# reported as "Positive" later on. `assign` builds a new frame rather than
# writing into a column of a frame derived from another one.
dataset = dataset.assign(sentiment=dataset["sentiment"].replace(4, 1))

# Storing data in lists
text, sentiment = list(dataset["text"]), list(dataset["sentiment"])

In [3]:
# Lookup table from ASCII emoticon to the word describing it.
# Insertion order is kept exactly as listed because the mapping is iterated
# in order when the emoticons are substituted during preprocessing.
emojis = dict(
    [
        (":)", "smile"),
        (":-)", "smile"),
        (";d", "wink"),
        (":-E", "vampire"),
        (":(", "sad"),
        (":-(", "sad"),
        (":-<", "sad"),
        (":P", "raspberry"),
        (":O", "surprised"),
        (":-@", "shocked"),
        (":@", "shocked"),
        (":-$", "confused"),
        (":\\", "annoyed"),
        (":#", "mute"),
        (":X", "mute"),
        (":^)", "smile"),
        (":-&", "confused"),
        ("$_$", "greedy"),
        ("@@", "eyeroll"),
        (":-!", "confused"),
        (":-D", "smile"),
        (":-0", "yell"),
        ("O.o", "confused"),
        ("<(-_-)>", "robot"),
        ("d[-_-]b", "dj"),
        (":'-)", "sadsmile"),
        (";)", "wink"),
        (";-)", "wink"),
        ("O:-)", "angel"),
        ("O*-)", "angel"),
        ("(:-D", "gossip"),
        ("=^.^=", "cat"),
    ]
)

# Hand-maintained list of English stop words (a list, in the order given).
# Contractions appear without their apostrophe (e.g. "youre", "shouldve").
stopwords_manual = """
    a about above after again ain all am an and
    any are as at be because been before being below
    between both by can d did do does doing down
    during each few for from further had has have having
    he her here hers herself him himself his how i
    if in into is it its itself just ll m
    ma me more most my myself now o of on
    once only or other our ours ourselves out own re
    s same she shes should shouldve so some such t
    than that thatll the their theirs them themselves then there
    these they this those through to too under until up
    ve very was we were what when where which while
    who whom why will with won y you youd youll
    youre youve your yours yourself yourselves
""".split()


# Report how many entries each hand-written lookup holds.
print(f"emojis: {len(emojis)}")
print(f"stopwords (manually defined): {len(stopwords_manual)}")

emojis: 32
stopwords (manually defined): 136


In [4]:
# Load NLTK's English stop words. On a machine where the corpus has not been
# downloaded yet the lookup raises LookupError, so fetch it once and retry
# instead of requiring a manual, commented-out download step.
try:
    stopwords_nltk = stopwords.words("english")
except LookupError:
    nltk.download("stopwords")
    stopwords_nltk = stopwords.words("english")

print("stopwords (nltk):", len(stopwords_nltk))

stopwords (nltk): 179


In [5]:
# Union of the hand-written and the NLTK stop words; words present in both
# sources are kept once. The resulting list has no meaningful order.
all_stopwords = list({*stopwords_manual, *stopwords_nltk})

print(f"stopwords (nltk + manually defined): {len(all_stopwords)}")

stopwords (nltk + manually defined): 186


In [6]:
# Fetch the "omw-1.4" and "wordnet" NLTK data packages; presumably both are
# needed by the WordNetLemmatizer created in the next cell -- TODO confirm
# whether omw-1.4 is still required with the installed NLTK version.
# When a package is already present only a status message is logged (see the
# output below). The value shown for this cell is the return value of the
# last call.
nltk.download("omw-1.4")
nltk.download("wordnet")

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/wilsvenleong/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/wilsvenleong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
lemmatizer = WordNetLemmatizer()
# NOTE: lemmatize() is called below without a part-of-speech argument, so
# NLTK applies its default (noun) rules, e.g. plural -> singular. Mapping an
# adjective such as "better" to "good" would require passing pos="a".


def preprocess(text_data: list[str]) -> list[str]:
    """Normalise raw tweets into space-separated, lemmatised tokens.

    Each tweet goes through these steps, in this order:

    1. lower-casing;
    2. URLs are replaced by the token ``URL``;
    3. emoticons listed in ``emojis`` are replaced by ``EMOJI<meaning>``;
    4. ``@mentions`` are replaced by the token ``USER``;
    5. every character that is not an ASCII letter or digit becomes a space;
    6. any character repeated three or more times is squeezed to two;
    7. one-character tokens and stop words are dropped, and the remaining
       tokens are lemmatised.

    Args:
        text_data: The tweets to clean. A single ``str`` is treated as one
            tweet rather than being iterated character by character.

    Returns:
        One cleaned string per input tweet, in the same order.
    """
    if isinstance(text_data, str):
        text_data = [text_data]

    # Compile once; every pattern is applied to each of the input tweets.
    url_pattern = re.compile(r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)")
    user_pattern = re.compile(r"@[^\s]+")
    alpha_pattern = re.compile(r"[^a-zA-Z0-9]")
    sequence_pattern = re.compile(r"(.)\1\1+")
    seq_replace_pattern = r"\1\1"

    # Tweets are lower-cased before the emoticons are substituted, so keys
    # containing capitals (":P", ":-D", "O.o", ...) could never match; compare
    # against lower-cased keys instead. Longer emoticons go first so that a
    # short one (":-)") does not consume part of a longer one ("o:-)").
    # sorted() is stable, so equally long keys keep their dictionary order.
    emoji_tokens = sorted(
        ((emoji.lower(), "EMOJI" + meaning) for emoji, meaning in emojis.items()),
        key=lambda pair: len(pair[0]),
        reverse=True,
    )

    # Set membership is O(1); the global stop-word collection is a list.
    stopword_set = set(all_stopwords)

    processed_text = []
    for tweet in tqdm(text_data):
        tweet = tweet.lower()

        # Replace all URLs with 'URL'
        tweet = url_pattern.sub(" URL", tweet)
        # Replace all emoticons
        for emoji, token in emoji_tokens:
            tweet = tweet.replace(emoji, token)
        # Replace @USERNAME with 'USER'
        tweet = user_pattern.sub(" USER", tweet)
        # Replace everything that is not a letter or digit with a space
        tweet = alpha_pattern.sub(" ", tweet)
        # Squeeze runs of 3+ identical characters down to 2
        tweet = sequence_pattern.sub(seq_replace_pattern, tweet)

        processed_text.append(
            " ".join(
                lemmatizer.lemmatize(word)
                for word in tweet.split()
                if len(word) > 1 and word not in stopword_set
            )
        )

    return processed_text

In [8]:
# Clean every tweet in the corpus (progress is reported by tqdm).
processed_text = preprocess(text_data=text)

# Show the first five cleaned tweets as the cell output
processed_text[0:5]

100%|██████████| 1600000/1600000 [01:10<00:00, 22710.96it/s]


['USER URL aww bummer shoulda got david carr third day EMOJIwink',
 'upset update facebook texting might cry result school today also blah',
 'USER dived many time ball managed save 50 rest go bound',
 'whole body feel itchy like fire',
 'USER behaving mad see']

In [9]:
# Keep 5% of the cleaned tweets aside for evaluation; the fixed seed makes
# the partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    processed_text,
    sentiment,
    random_state=0,
    test_size=0.05,
)

In [10]:
# TF-IDF over unigrams and bigrams, with the vocabulary capped at 500,000 terms.
vectorizer = TfidfVectorizer(
    max_features=500000,
    ngram_range=(1, 2),
)

In [11]:
# Fit the vectorizer on the training tweets only, then apply the fitted
# transform to the held-out tweets. The tuple is evaluated left to right, so
# fitting happens before the test split is transformed.
# NOTE: this rebinds X_train / X_test from lists of text to the transformed
# feature matrices.
X_train, X_test = vectorizer.fit_transform(X_train), vectorizer.transform(X_test)

In [12]:
def model_evaluate(model, X=None, y=None):
    """Print a classification report for ``model``.

    Args:
        model: Any fitted object exposing ``predict``.
        X: Inputs to predict on. Defaults to the notebook-level ``X_test`` as
            it is bound at call time (that name is rebound between cells, so
            pass ``X`` explicitly when evaluating on something else).
        y: True labels for ``X``. Defaults to the notebook-level ``y_test``.

    Returns:
        None. The report is written to standard output.
    """
    features = X_test if X is None else X
    labels = y_test if y is None else y

    y_pred = model.predict(features)
    print(classification_report(labels, y_pred))

In [13]:
# Bernoulli Naive Bayes with alpha=2, trained on the vectorized training
# split and scored on the held-out split. fit() returns the estimator itself.
bnb = BernoulliNB(alpha=2).fit(X_train, y_train)
model_evaluate(bnb)

              precision    recall  f1-score   support

           0       0.79      0.77      0.78     39989
           1       0.78      0.80      0.79     40011

    accuracy                           0.78     80000
   macro avg       0.78      0.78      0.78     80000
weighted avg       0.78      0.78      0.78     80000



In [14]:
# Linear support-vector classifier; dual="auto" leaves the choice of
# formulation to scikit-learn. fit() returns the estimator itself.
svc = LinearSVC(dual="auto").fit(X_train, y_train)
model_evaluate(svc)

              precision    recall  f1-score   support

           0       0.79      0.77      0.78     39989
           1       0.78      0.80      0.79     40011

    accuracy                           0.78     80000
   macro avg       0.78      0.78      0.78     80000
weighted avg       0.78      0.78      0.78     80000



In [15]:
# Logistic regression with C=2, up to 1000 solver iterations, and n_jobs=-1.
# fit() returns the estimator itself.
lr = LogisticRegression(
    C=2,
    max_iter=1000,
    n_jobs=-1,
).fit(X_train, y_train)
model_evaluate(lr)

              precision    recall  f1-score   support

           0       0.81      0.78      0.79     39989
           1       0.79      0.81      0.80     40011

    accuracy                           0.80     80000
   macro avg       0.80      0.80      0.80     80000
weighted avg       0.80      0.80      0.80     80000



In [16]:
# Pipeline: bundle vectorization and classification into one estimator so
# cleaned text can be passed straight to fit() / predict().
# (Pipeline and clone are imported with the other imports at the top.)

# X_train / X_test were rebound to vectorized matrices earlier, so rebuild
# the text split; the same arguments and seed give the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    processed_text, sentiment, test_size=0.05, random_state=0
)

# clone() returns unfitted copies carrying the same hyper-parameters, so
# fitting the pipeline does not refit (and mutate) the `vectorizer` and `bnb`
# instances trained by the earlier cells.
pipe = Pipeline([("vectorizer", clone(vectorizer)), ("bnb", clone(bnb))])
pipe.fit(X_train, y_train)

model_evaluate(pipe)

              precision    recall  f1-score   support

           0       0.79      0.77      0.78     39989
           1       0.78      0.80      0.79     40011

    accuracy                           0.78     80000
   macro avg       0.78      0.78      0.78     80000
weighted avg       0.78      0.78      0.78     80000



In [17]:
# Persist the fitted pipeline and read it back to check the round trip.
MODEL_PATH = Path("api/models/pipeline.pkl")
# open(..., "wb") does not create missing folders; make sure the target exists.
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

# Save
with MODEL_PATH.open("wb") as f:
    pickle.dump(pipe, f)

# Load
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# come from a trusted source (here: the file written just above).
with MODEL_PATH.open("rb") as f:
    loaded_pipe = pickle.load(f)

model_evaluate(loaded_pipe)

              precision    recall  f1-score   support

           0       0.79      0.77      0.78     39989
           1       0.78      0.80      0.79     40011

    accuracy                           0.78     80000
   macro avg       0.78      0.78      0.78     80000
weighted avg       0.78      0.78      0.78     80000



In [18]:
def predict(model, text):
    """Classify the sentiment of one or more raw texts.

    Args:
        model: Fitted object whose ``predict`` accepts the output of
            ``preprocess`` and returns one 0/1 label per text.
        text: A single string or an iterable of strings. A bare string is
            treated as one text (it would otherwise be split into characters).

    Returns:
        A list of ``(original_text, label, label_name)`` tuples in input
        order, where ``label`` is a plain ``int`` (0 or 1) and ``label_name``
        is ``"Negative"`` or ``"Positive"``. Empty input yields ``[]``.
    """
    texts = [text] if isinstance(text, str) else list(text)
    if not texts:
        # Nothing to classify; skip the model call for an empty batch.
        return []

    pred_to_label = {0: "Negative", 1: "Positive"}

    # Predict the sentiment
    predictions = model.predict(preprocess(texts))

    # Pair every text with its prediction. int() turns numpy integer scalars
    # into plain Python ints.
    data = []
    for raw_text, pred in zip(texts, predictions):
        label = int(pred)
        data.append((raw_text, label, pred_to_label[label]))

    return data


if __name__ == "__main__":
    # Text to classify should be in a list.
    # This guard is true when the cell runs in the notebook, and its body
    # executes at module scope. Use a dedicated name so the demo inputs do
    # not overwrite the global `text` that holds the training tweets.
    sample_texts = [
        "I hate twitter",
        "May the Force be with you.",
        "Mr. Stark, I don't feel so good",
    ]

    predictions = predict(loaded_pipe, sample_texts)
    print(predictions)

100%|██████████| 3/3 [00:00<00:00, 228.54it/s]

[('I hate twitter', 0, 'Negative'), ('May the Force be with you.', 1, 'Positive'), ("Mr. Stark, I don't feel so good", 1, 'Positive')]



