In [40]:
import numpy as np
import re
import nltk
from nltk import download

from nltk.corpus import twitter_samples
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tag import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score


download(['twitter_samples'])

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [41]:
# prepare set of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# putting all tweets together
all_tweets = all_positive_tweets + all_negative_tweets

In [42]:
print(len(all_positive_tweets))
print(len(all_negative_tweets))

5000
5000


In [43]:
# so classes are balanced

In [44]:
# creating y (labels)

# assigning 1 to positive tweets
# assigning 0 to nagative tweets
y = [ 1 for _ in range(len(all_positive_tweets)) ] + [ 0 for _ in range(len(all_negative_tweets)) ]



In [45]:
# creating X (texts)

In [46]:
# step1 : preprocessing the tweets

In [47]:
!pip install contractions



In [48]:
# expand contractions (don't -> do not)

import contractions

def expand_contractions(text):
    expanded_text = contractions.fix(text)
    return expanded_text


In [49]:
# this function replaces online slang abbreviations that are used on social media with their original expansion

def replace_abbreviations(tweet, slang_dict):
    # Split the tweet into words
    words = tweet.split()

    # Replace abbreviations with their meanings
    replaced_tweet = [
        slang_dict[word] if word in slang_dict else word
        for word in words
    ]

    # Join the words back into a single string
    return ' '.join(replaced_tweet)



In [50]:

slang_dict = {
    "lol": "laugh out loud",
    "brb": "be right back",
    "gtg": "got to go",
    "ttyl": "talk to you later",
    "omg": "oh my god",
    "idk": "i don’t know",
    "bff": "best friends forever",
    "fyi": "for your information",
    "tmi": "too much information",
    "smh": "shaking my head",
    "imo": "in my opinion",
    "imho": "in my humble opinion",
    "lmao": "laughing my ass off",
    "yolo": "you only live once",
    "fomo": "fear of missing out",
    "dm": "direct message",
    "nsfw": "not safe for work",
    "tldr": "too long; didn’t read",
    "bae": "before anyone else",
    "wtf": "what the f***",
    "ppl": "people",
    "bffl": "best friends for life",
    "jk": "just kidding",
    "tbt": "throwback thursday",
    "rofl": "rolling on the floor laughing",
    "wbu": "what about you?",
    "hbu": "how about you?",
    "imo": "in my opinion",
    "icymi": "in case you missed it",
    "bffae": "best friends forever and ever",
    "dms": "direct messages",
    "sfw": "safe for work",
    "fomo": "fear of missing out",
    "yoyo": "you're on your own",
    "l8r": "later",
    "cya": "see you",
    "xoxo": "hugs and kisses",
    "wyd": "what you doing?",
    "nbd": "no big deal",
    "btt": "back to topic",
    "b4": "before",
    "tldr": "too long; didn't read",
    "fml": "f*** my life",
    "iirc": "if i recall correctly",
    "dm": "direct message",
    "smdh": "shaking my damn head",
    "bruh": "bro (used to express disbelief)",
    "sus": "suspicious",
    "vibes": "good feelings or atmosphere",
    "k": "okay",
    "bae": "before anyone else",
    "h8": "hate",
    "tbf": "to be fair",
    "qotd": "quote of the day",
    "ootd": "outfit of the day",
    "rip": "rest in peace",
    "bop": "a good song",
    "fomo": "fear of missing out",
    "simp": "someone who shows excessive sympathy",
    "lit": "exciting or excellent",
    "goat": "greatest of all time",
    "binge": "to consume excessively",
    "clt": "can't live without",
    "dnd": "do not disturb",
    "litaf": "lit as f***",
    "nvm": "never mind",
    "so": "shout out",
    "bff": "best friend forever",
    "tmi": "too much information",
    "lqtm": "laughing quietly to myself",
    "fyp": "for you page (tiktok)",
    "wya": "where you at?",
    "slay": "to succeed or look great",
    "hml": "hit me up",
    "wis": "what i said",
    "pov": "point of view",
    "rip": "rest in peace",
    "ngl": "not gonna lie",
    "ymmv": "your mileage may vary",
    "cba": "can’t be arsed",
    "hth": "hope this helps",
    "tldr": "too long; didn't read",
    "pita": "pain in the a**",
    "vaf": "very annoying friend",
    "wth": "what the heck",
    "kms": "kill myself (used humorously)",
    "mfw": "my face when",
    "fubar": "f***ed up beyond all recognition",
    "snafu": "situation normal: all f***ed up",
    "tba": "to be announced",
    "eta": "estimated time of arrival",
    "hbd": "happy birthday",
    "lqtm": "laughing quietly to myself",
    "rsvp": "répondez s'il vous plaît (please respond)",
    "grwm": "get ready with me",
    "sis": "sister (used as a term of endearment)",
    "fam": "family or close friends",
    "g2g": "got to go",
    "wbu": "what about you?",
    "fomo": "fear of missing out",
    "goat": "greatest of all time",
    "cya": "see you",
    "so": "shout out",
    "fomo": "fear of missing out",
    "hbu": "how about you?",
    "tldr": "too long; didn't read",
    "simp": "someone who shows excessive sympathy",
    "lfg": "looking for group (gaming)",
    "nbd": "no big deal",
    "bff": "best friends forever",
    "ppl": "people",
    "lfg": "looking for group",
    "sis": "sister (used informally)",
    "rip": "rest in peace",
    "fomo": "fear of missing out",
    "wya": "where you at?",
    "btw": "by the way",
    "ftw": "for the win",
    "tldr": "too long; didn't read",
    "bda": "big deal alert",
    "srs": "serious",
    "fomo": "fear of missing out",
    "mia": "missing in action",
    "bff": "best friends forever",
    "bffl": "best friends for life",
    "cya": "see you",
    "dm": "direct message",
    "mfw": "my face when",
    "tmi": "too much information",
    "sop": "standard operating procedure",
    "nbd": "no big deal",
    "so": "shout out",
    "p2p": "peer to peer",
    "fomo": "fear of missing out",
    "yolo": "you only live once",
    "tldr": "too long; didn't read",
    "accnt" : "account",
    "rqst" : "request",
    "fb" : "facebook",
    "lwwf" : "love wins we fight",
    ":)": "smile",
    ":(": "frown",
    ":D": "laughing",
    ":P": "playful or teasing",
    ":O": "surprised",
    ":|": "neutral",
    ":S": "confused",
    ";)": "wink",
    ":/": "unsure or skeptical",
    ":*": "kiss",
    "B)": "cool",
    ":'(": "crying",
    "XD": "laughing hard",
    "^-^": "happy",
    "^_^": "smiling",
    "o_O": "bewildered",
    ">_<": "frustrated",
    "<3": "heart",
    "</3": "broken heart",
    ">:(": "angry",
    "T_T": "crying",
    ":-)": "smile",
    ":-(": "frown",
    ":-D": "laughing",
    ":-P": "playful or teasing",
    ":-O": "surprised",
    ":-|": "neutral",
    ":-S": "confused",
    ";-)": "wink",
    ":-/": "unsure or skeptical",
    ":-*": "kiss",
    "B-)": "cool",
    ":'-(": "crying",
    "X-D": "laughing hard",
    "^-^": "happy",
    "^_^": "smiling",
    "o_O": "bewildered",
    ">_<": "frustrated",
    "<3": "heart",
    "</3": "broken heart",
    ">:(" : "angry",
    "T_T": "crying",
    "O_O": "surprised",
    "0_o": "confused",
    ":-X": "sealed lips",
    ":3": "cute",
    ":v": "peace",
    ":-]": "happy",
    ":-[": "sad",
    ":?": "questioning",
    ":-c": "disappointed",
    ":-b": "playful",
    "D:": "shocked",
    "D8": "disgusted",
    "X_x": "dead or tired",
    ":-@": "screaming",
    "O.O": "shocked",
    ":-)": "happy",
    ":-(": "sad",
    ":-/": "confused",
    ":-|": "meh",
    ":-*": "kiss",
    ":^)": "smirking",
    ":<": "sad",
    ":-]": "happy",
    ":-<": "sad",
    ":}": "smirk",
    ":)": "smile",
    ":(": "frown",
    ":D": "grin",
    ":P": "teasing",
    ";)": "wink",
    ":|": "neutral",
    ":o": "surprise",
    ":S": "confusion",
    ":X": "sealed lips",
}

In [52]:
# normalizing

def normalizer(text):

    # # Convert to lowercase
    # normalized_text = text.lower()

    # # Remove links, tags, and hashtags using regular expressions
    # normalized_text = re.sub(r"http\S+|www\S+|https\S+|@\w+|#\w+", "", normalized_text)

    # Remove special characters and punctuations
    normalized_text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove extra whitespaces
    normalized_text = re.sub(r'\s+', ' ', normalized_text).strip()

    return normalized_text



In [53]:
# checking the normalizer function

test_text = normalizer(all_tweets[0])
print('original: ' , all_tweets[0])
print('after normalizing: ',test_text)

original:  #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
after normalizing:  FollowFriday FranceInte PKuchly MilipolParis for being top engaged members in my community this week


In [54]:
# downloading stop words (the words that carry little meaningful information for specific tasks)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [55]:
# customizing stop words
default_stopwords = set(stopwords.words('english'))
customized_stopwords = default_stopwords - set(['no', 'not', 'ain','aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't",'weren',"weren't",'won',"won't",'wouldn',"wouldn't"])


In [56]:
#The Punkt tokenizer is a pre-trained unsupervised machine learning model used for sentence splitting and word tokenization.
nltk.download('punkt_tab')
# The Averaged Perceptron Tagger is a pre-trained model used for part-of-speech (POS) tagging.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [57]:
# this is a general function containing all steps of preprocessing

def preprocessor(all_tweets):

    processed_tweets = []

    for tweet in all_tweets:

        # Convert to lowercase
        tweet = tweet.lower()
        #print('lower: ')
        #print(tweet)
        #print('------------------------------------------------')

        # Remove links, tags, and hashtags using regular expressions
        tweet = re.sub(r"http\S+|www\S+|https\S+|@\w+|#\w+", "", tweet)
        #print('remove tags:')
        #print(tweet)
        #print('------------------------------------------------')

        # expanding contractions
        tweet = expand_contractions(tweet)
        #print('expanded : ')
        #print(tweet)
        #print('------------------------------------------------')

        # expanding online abbreviations
        tweet = replace_abbreviations(tweet , slang_dict)
        #print('abbr: ')
        #print(tweet)
        #print('------------------------------------------------')

        # normalizing
        normalized_tweet = normalizer(tweet)
        #print('normalized:')
        #print(normalized_tweet)
        #print('------------------------------------------------')

        # tokenize into words
        tokenized_tweet = word_tokenize(normalized_tweet)
        #print('tokenize')
        #print(tokenized_tweet)
        #print('------------------------------------------------')

        # deleting stop words
        filtered_tokens = [token for token in tokenized_tweet if token not in customized_stopwords]
        #print('removing stop words: ')
        #print(filtered_tokens)
        #print('------------------------------------------------')

        # pos tagging
        tagged_tokens = nltk.pos_tag(filtered_tokens)
        #print('pos tagging: ')
        #print(tagged_tokens)
        #print('------------------------------------------------')

        # finding roots of words
        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()

        root_tokens = []
        for token in tagged_tokens:
            if token[1] == 'JJ':
                root = stemmer.stem(token[0])
            else:
                root = lemmatizer.lemmatize(token[0])

            root_tokens.append(root)

        #print('root tokens:')
        #print(root_tokens)
        #print('------------------------------------------------')
        # joining tokens together
        joined_tweet = ' '.join(root_tokens)
        #print('joined tokens: ')
        #print(joined_tweet)
        #print('------------------------------------------------')
        processed_tweets.append(joined_tweet)

    return processed_tweets


In [58]:
processed_tweets = preprocessor(all_tweets)

In [59]:
for idx in range(10):
  print('original tweet:')
  print(all_tweets[idx])
  print('processed tweet:')
  print(processed_tweets[idx])
  print('----------------------------------------------------')

original tweet:
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
processed tweet:
top engaged member community week smile
----------------------------------------------------
original tweet:
@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!
processed tweet:
hey james odd unsur skeptic please call contact centre abl assist smile mani thanks
----------------------------------------------------
original tweet:
@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!
processed tweet:
listen last night smile bleed amazing track scotland
----------------------------------------------------
original tweet:
@97sides CONGRATS :)
processed tweet:
congrats smile
----------------------------------------------------
original tweet:
yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on m

In [60]:
# Step2: convert the text data to numerical type (TF_IDF)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(processed_tweets)

In [61]:
print(X)
print(X.shape)
print(type(X))

  (0, 9135)	0.42847524637642237
  (0, 2741)	0.5279066400733814
  (0, 5589)	0.4593422200086518
  (0, 1761)	0.4359033601146696
  (0, 9762)	0.3425431208483619
  (0, 8178)	0.13921472094152296
  (1, 8178)	0.0822377318060899
  (1, 3959)	0.2142193622412416
  (1, 4523)	0.296297291132831
  (1, 6272)	0.3384323535186648
  (1, 9438)	0.3032898673574324
  (1, 8117)	0.2932274142380037
  (1, 6843)	0.1707298000103171
  (1, 1323)	0.2408036267690537
  (1, 1838)	0.2932274142380037
  (1, 1444)	0.3384323535186648
  (1, 30)	0.25867955993522845
  (1, 539)	0.36501661804647695
  (1, 5428)	0.23521514362735854
  (1, 8920)	0.15465827475283603
  (2, 8178)	0.10991697654672804
  (2, 5137)	0.3649523008293712
  (2, 4981)	0.2792872928413901
  (2, 6106)	0.2819858990849044
  (2, 1015)	0.46708761924648545
  :	:
  (9994, 3300)	0.12768143181595187
  (9994, 9653)	0.6088159899324703
  (9995, 1476)	0.4399763088825327
  (9995, 9698)	0.2778233627185309
  (9995, 597)	0.5589352743960707
  (9995, 3300)	0.1325173103868571
  (9995, 94

In [62]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)



In [63]:
# # Train the logistic regression model
# model = LogisticRegression()
# model.fit(X_train, y_train)
# # predicting the test set
# y_pred = model.predict(X_test)

# # Evaluating the model
# print(f1_score(y_test, y_pred))


In [64]:
# creating the model

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score

# Define the parameter grids for each model
param_grid_lr = {
    'C': [0.1, 1, 10],  # Regularization strength
    'penalty': ['l1', 'l2'],  # Regularization type
    'solver': ['liblinear']  # Solver for optimization
}

param_grid_nb = {
    'alpha': [0.1, 0.5, 1.0],  # Smoothing parameter
    'fit_prior': [True, False]  # Whether to learn class prior probabilities
}

param_grid_svm = {
    'C': [0.1, 1, 10],  # Regularization parameter
    'kernel': ['linear', 'rbf'],  # Kernel type
    'gamma': ['scale', 'auto']  # Kernel coefficient for 'rbf'
}

# Initialize the models
log_reg = LogisticRegression()
naive_bayes = MultinomialNB()
svm = SVC()

# Perform Grid Search for Logistic Regression
grid_search_lr = GridSearchCV(log_reg, param_grid_lr, cv=5, scoring='f1', verbose=1)
grid_search_lr.fit(X_train, y_train)

# Perform Grid Search for Naive Bayes
grid_search_nb = GridSearchCV(naive_bayes, param_grid_nb, cv=5, scoring='f1', verbose=1)
grid_search_nb.fit(X_train, y_train)

# Perform Grid Search for SVM
grid_search_svm = GridSearchCV(svm, param_grid_svm, cv=5, scoring='f1', verbose=1)
grid_search_svm.fit(X_train, y_train)

# Get the best models
best_lr = grid_search_lr.best_estimator_
best_nb = grid_search_nb.best_estimator_
best_svm = grid_search_svm.best_estimator_

# Evaluate the best models on the test set
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")

print("Logistic Regression Results:")
evaluate_model(best_lr, X_test, y_test)

print("Naive Bayes Results:")
evaluate_model(best_nb, X_test, y_test)

print("SVM Results:")
evaluate_model(best_svm, X_test, y_test)

# Print the best hyperparameters
print("Best Logistic Regression Parameters:", grid_search_lr.best_params_)
print("Best Naive Bayes Parameters:", grid_search_nb.best_params_)
print("Best SVM Parameters:", grid_search_svm.best_params_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits




Fitting 5 folds for each of 6 candidates, totalling 30 fits
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.96      0.92      0.94       508
           1       0.92      0.96      0.94       492

    accuracy                           0.94      1000
   macro avg       0.94      0.94      0.94      1000
weighted avg       0.94      0.94      0.94      1000

F1 Score: 0.9414
Naive Bayes Results:
              precision    recall  f1-score   support

           0       0.88      0.93      0.91       508
           1       0.92      0.87      0.90       492

    accuracy                           0.90      1000
   macro avg       0.90      0.90      0.90      1000
weighted avg       0.90      0.90      0.90      1000

F1 Score: 0.8966
SVM Results:
              precision    recall  f1-score   support

           0       0.95      0.93      0.94       508
           1       