In [None]:


# Importing required modules
import re
from string import punctuation
import numpy as np
import pandas as pd


# For NLP
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# For ML
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
# Unzipping wordnet.zip file
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [None]:
# Loading the training split; column names are supplied explicitly,
# so every row of the file (including the first) is read as data
train_filename = "twitter_training.csv"
df_train = pd.read_csv(
    train_filename,
    header=None,
    names=['id', 'game/company', 'label', 'tweet'],
)
df_train.head()

In [None]:
# Reading the held-out evaluation data from the file: "twitter_validation.csv".
# This must not be the training file: scoring the models on the very rows they
# were fitted on measures memorisation, not generalisation.
test_filename = "twitter_validation.csv"
df_test = pd.read_csv(test_filename, names = ['id', 'game/company', 'label', 'tweet'])
df_test.head()

In [None]:
# Summarising the object (text) columns, transposed so each column becomes a row
df_train.describe(include='object').transpose()

In [None]:
# Checking the datatypes (info() also reports the non-null count of every column)
df_train.info()

In [None]:
# Counting the missing values in each column
df_train.isna().sum()

In [None]:
# Inspecting the rows whose tweet text is missing
missing_tweet_mask = df_train['tweet'].isna()
df_train.loc[missing_tweet_mask]

In [None]:
# Dropping every row that has a missing value in any column,
# then re-counting the nulls to confirm none remain
df_train = df_train.dropna(axis=0, how='any')
df_train.isna().sum()

---

## **Data Preprocessing**

In [None]:
# Creating a function for lowercasing the tweets
def convert_to_lowercase(tweets):
    """
    Function for lowercasing every string of the input series.

    Parameter:
        - tweets: A Pandas Series containing text data.

    Returns:
        - A new Pandas Series in which each string has been lowercased.
    """
    lowercased = tweets.str.lower()
    return lowercased


In [None]:
# Trying convert_to_lowercase on the first ten training tweets
first_ten_tweets = df_train['tweet'].head(10)
convert_to_lowercase(first_ten_tweets)

In [None]:
# Looking for tweets that contain anything shaped like an HTML/XML tag
has_html_tag = df_train['tweet'].str.contains('<.*?>', regex=True)
df_train[has_html_tag]

In [None]:
# Creating a function for removing HTML/XML tags from the tweets
def remove_html_tags(tweets):
    """
    Function for stripping HTML/XML tags out of the input series.

    Parameter:
        - tweets: A Pandas Series containing text with HTML/XML tags.

    Returns:
        - A Pandas Series where every "<...>" span has been deleted.
    """
    # Non-greedy match: each tag is removed on its own, the text between tags is kept
    tag_pattern = "<.*?>"
    return tweets.str.replace(tag_pattern, "", regex=True)


In [None]:
# Trying remove_html_tags on the first ten tweets that contain a tag
tagged_tweets = df_train.loc[df_train['tweet'].str.contains('<.*?>', regex = True), 'tweet']
remove_html_tags(tagged_tweets[:10])

In [None]:
# Creating a function for removing punctuations from the tweets
def remove_punctuations(tweets):
    """
    Function for deleting punctuation characters from the input series.

    Parameter:
        - tweets: A Pandas Series containing text with punctuation.

    Returns:
        - A Pandas Series with every character of string.punctuation removed.
    """
    # Translation table that maps each punctuation character to "delete"
    deletion_table = str.maketrans("", "", punctuation)
    return tweets.str.translate(deletion_table)


In [None]:
# Dictionary of common chat abbreviations and their full forms.
# This dictionary contains popular shorthand used in text messaging and online chatting.
# Each key is a chat abbreviation, and the corresponding value is the full phrase or meaning.
# Reference: https://github.com/rishabhverma17/sms_slang_translator/blob/master/slang.txt
#
# Values are substituted verbatim into the tweet text, so they must contain only the
# expansion itself (no repeated key, no explanatory gloss).
#
# NOTE(review): apply_chat_word_map upper-cases every token before the lookup, so keys
# that are also ordinary English words ("TIME", "KISS", "STATS", "GAL", "U", ...) are
# expanded wherever those words occur. Confirm this is intended.

chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you",  # gloss "(also a chat program)" dropped: it is not part of the expansion
    "ILU": "I Love You",  # was "ILU: I Love You", which re-inserted the abbreviation itself
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don't care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can't stop laughing",
    "L8": "Late",
    "SMH": "Shaking My Head",
    "YOLO": "You Only Live Once",
    "TLDR": "Too Long; Didn't Read",
    "FOMO": "Fear Of Missing Out",
    "IDK": "I Don't Know",
    "BFFL": "Best Friends For Life",
    "TMI": "Too Much Information",
    "DM": "Direct Message",
    "STFU": "Shut The F... Up",
    "WTH": "What The Heck",
    "LMAOROTF": "Laughing My A... Off Rolling On The Floor",
    "PPL": "People",
    "SFLR": "Sorry For Late Reply",
    "G2G": "Got To Go",
    "S2R": "Send To Receive"
}



In [None]:
# Creating a function for replacing chat abbreviations with their full meanings
def apply_chat_word_map(tweets):
    """
    Function for expanding chat abbreviations in the text via the chat_words dictionary.

    Parameters:
        - tweets: A pandas Series containing tweets with abbreviations.

    Returns:
        - A pandas Series in which every known abbreviation is replaced by its full meaning.
    """
    def expand_abbreviations(text):
        # Tokens are split on whitespace and upper-cased for the lookup;
        # tokens without a dictionary entry pass through unchanged
        expanded = []
        for token in text.split():
            expanded.append(chat_words.get(token.upper(), token))
        return " ".join(expanded)

    return tweets.apply(expand_abbreviations)


In [None]:
# Trying apply_chat_word_map on two hand-written examples
for sample in ['IMHO he is the best', 'FYI Delhi is the capital of India']:
    print(apply_chat_word_map(pd.Series([sample]))[0])

In [None]:
# Creating a function for removing stop words from the tweets
def remove_stopwords(tweets):
    """
    Function for removing common stopwords from the tweets.

    Matching is case-insensitive: each token is lowercased before it is compared
    with the stopword list, so capitalised stopwords such as "I", "So" or "The"
    (including those introduced by chat-abbreviation expansions like "You") are
    removed as well. Tokens that are kept retain their original casing.

    Parameters:
        - tweets: A pandas Series containing tweets with potential stopwords.

    Returns:
        - A pandas Series with stopwords removed from each tweet.
    """
    stop_words = set(stopwords.words("english"))
    return tweets.apply(
        lambda text: " ".join(word for word in text.split() if word.lower() not in stop_words)
    )


In [None]:
# Trying remove_stopwords on a few sample tweets
stopword_samples = [
    'I am coming to the borders and I will kill you...',
    'im coming on borderlands and i will murder you...',
    'So I spent a few hours making something for fu...',
    'So I spent a couple of hours doing something f...',
]
for sample in stopword_samples:
    print(remove_stopwords(pd.Series([sample]))[0])

In [None]:
# Regex character class for finding emojis and pictographic symbols.
# Enclosed characters are listed as narrow, explicit ranges: a single catch-all range
# from U+24C2 to U+1F251 would also match CJK, Hangul and most other non-Latin text,
# so ordinary words in those scripts would be flagged (and later deleted) as emojis.
emoji_pattern = (
        "[\U0001F600-\U0001F64F" # Emoticons
        "\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs
        "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
        "\U0001F700-\U0001F77F"  # Alchemical Symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002600-\U000026FF"  # Miscellaneous Symbols
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2"             # Circled Latin capital letter M
        "\U0000FE0F"             # Variation Selector-16 (emoji presentation)
        "\U0001F1E6-\U0001F1FF"  # Regional indicator symbols (flags)
        "\U0001F200-\U0001F251"  # Enclosed Ideographic Supplement
        "]+")

# Listing the training tweets that contain at least one character matched by emoji_pattern
contains_emoji = df_train['tweet'].str.contains(emoji_pattern, regex = True)
df_train.loc[contains_emoji, 'tweet']

In [None]:
# Creating a function for removing emojis from the tweets
def remove_emojis(tweets):
    """
    Function for deleting emojis from tweets.

    Parameters:
        - tweets: A pandas Series containing tweets with possible emojis.

    Returns:
        - A pandas Series in which every run of characters matched by
          emoji_pattern has been removed.
    """
    without_emojis = tweets.str.replace(emoji_pattern, "", regex=True)
    return without_emojis


In [None]:
# Trying remove_emojis on the first ten tweets that contain an emoji
emoji_tweets = df_train.loc[df_train['tweet'].str.contains(emoji_pattern, regex = True), 'tweet']
remove_emojis(emoji_tweets[:10])

In [None]:
# Creating a function for removing extra whitespaces from the tweets
def remove_extra_whitespaces(tweets):
    """
    Function for normalising whitespace in the tweets.

    Parameters:
        - tweets: A pandas Series containing tweets with extra whitespaces.

    Returns:
        - A pandas Series where each run of whitespace is a single space and
          leading/trailing whitespace is stripped.
    """
    # Collapse every whitespace run first, then trim both ends
    return tweets.str.replace(r"\s+", " ", regex=True).str.strip()


In [None]:
# Creating a function for lemmatizing the tweets
def lemmatize_text(tweets):
    """
    Function for lemmatizing the tweets word by word.

    Parameters:
        - tweets: A pandas Series containing tweets.

    Returns:
        - A pandas Series where each tweet is tokenized, each token is
          lemmatized, and the results are re-joined with single spaces.
    """
    lemmatizer = WordNetLemmatizer()

    def lemmatize_tweet(text):
        lemmas = [lemmatizer.lemmatize(token) for token in word_tokenize(text)]
        return " ".join(lemmas)

    return tweets.apply(lemmatize_tweet)


## **Pipeline**

In [None]:
# The pipeline below uses `remove_urls`, which was not defined anywhere in this notebook,
# so building the pipeline raised a NameError on a fresh kernel. It is defined here.
def remove_urls(tweets):
    """
    Function for removing URLs from the tweets.

    Parameters:
        - tweets: A pandas Series containing tweets with possible URLs.

    Returns:
        - A pandas Series with "http://", "https://" and "www." links removed.
    """
    # A link is taken to run until the next whitespace character
    return tweets.str.replace(r"https?://\S+|www\.\S+", "", case=False, regex=True)


# Creating a pipeline to process the tweets data.
# Each FunctionTransformer applies one cleaning function to the Series of tweets;
# the final step turns the cleaned text into TF-IDF features.
tweet_processing_pipeline = Pipeline([
    ('convert_to_lowercase', FunctionTransformer(convert_to_lowercase)),
    ('remove_html_tags', FunctionTransformer(remove_html_tags)),
    ('remove_urls', FunctionTransformer(remove_urls)),
    ('remove_emojis', FunctionTransformer(remove_emojis)),
    ('chat_word_map', FunctionTransformer(apply_chat_word_map)),
    ('remove_punctuations', FunctionTransformer(remove_punctuations)),
    ('remove_stopwords', FunctionTransformer(remove_stopwords)),
    ('remove_extra_space', FunctionTransformer(remove_extra_whitespaces)),
    ('lemmatize', FunctionTransformer(lemmatize_text)),
    ('TF-Idf_train', TfidfVectorizer())
])

In [None]:
# Separating the features (tweet text) from the targets (label column) for the training data
X_train, y_train = df_train['tweet'], df_train['label']

# Same separation for the test data
X_test, y_test = df_test['tweet'], df_test['label']

In [None]:
# Encoding the target variable: the encoder learns the classes from the
# training labels only and is then applied to both splits
encoder = LabelEncoder().fit(y_train)
y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)

In [None]:
# Keeping y as a flat (n_samples,) array for the machine learning models.
# scikit-learn classifiers expect a 1-D target; a (n, 1) column vector makes fit()
# emit a DataConversionWarning. np.ravel is a no-op on an already
# flat array, so this cell is safe to re-run.
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

In [None]:
# Showing the integer code assigned to each class name by the encoder
{class_name: code for code, class_name in enumerate(encoder.classes_)}

In [None]:
# Transforming train and test data:
# the pipeline is fitted on the training tweets only (fit_transform),
# and the already-fitted pipeline is then applied to the test tweets (transform)
X_train = tweet_processing_pipeline.fit_transform(X_train)
X_test = tweet_processing_pipeline.transform(X_test)

---

##   **Logistic Regression**

In [None]:
# Building a LogisticRegression classifier and fitting it on the training features
model = LogisticRegression(
    C=10,
    solver='saga',
    max_iter=10000,
    random_state=101,
)
model.fit(X_train, y_train)

In [None]:
# Making predictions: encoded class labels for the test features
y_pred = model.predict(X_test)

In [None]:
# Evaluating the model: accuracy, class-weighted precision/recall/F1,
# and one-vs-rest ROC-AUC computed from the predicted class probabilities
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
precision = precision_score(y_test, y_pred, average='weighted')
print(f"Precision: {precision}")
recall = recall_score(y_test, y_pred, average='weighted')
print(f"Recall: {recall}")
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1-Score: {f1}")
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test), average='weighted', multi_class='ovr')
print(f"ROC-AUC Score: {roc_auc}")

In [None]:
# Checking the Confusion Matrix of the fitted model on the test data
# (the trailing semicolon suppresses the display object's text repr)
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test);

---

## **SGDClassifier**

In [None]:
# Building an SGDClassifier and fitting it on the training features
model = SGDClassifier(
    loss="modified_huber",
    penalty='elasticnet',
    l1_ratio=0.05,
    learning_rate='adaptive',
    eta0=0.1,
    early_stopping=True,
    max_iter=10000,
    random_state=101,
)
model.fit(X_train, y_train)

In [None]:
# Making predictions: encoded class labels for the test features
y_pred = model.predict(X_test)

In [None]:
# Evaluating the model: accuracy and class-weighted precision/recall/F1
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
precision = precision_score(y_test, y_pred, average='weighted')
print(f"Precision: {precision}")
recall = recall_score(y_test, y_pred, average='weighted')
print(f"Recall: {recall}")
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1-Score: {f1}")

In [None]:
# Checking the Confusion Matrix of the fitted model on the test data
# (the trailing semicolon suppresses the display object's text repr, as in the LogisticRegression section)
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test);

---

## **RandomForestClassifier**

In [None]:
# Building a RandomForestClassifier and fitting it on the training features
model = RandomForestClassifier(
    n_estimators=75,
    oob_score=True,
    random_state=101,
)
model.fit(X_train, y_train)

In [None]:
# Making predictions: encoded class labels for the test features
y_pred = model.predict(X_test)

In [None]:
# Evaluating the model: accuracy, class-weighted precision/recall/F1,
# and one-vs-rest ROC-AUC computed from the predicted class probabilities
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
precision = precision_score(y_test, y_pred, average='weighted')
print(f"Precision: {precision}")
recall = recall_score(y_test, y_pred, average='weighted')
print(f"Recall: {recall}")
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1-Score: {f1}")
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test), average='weighted', multi_class='ovr')
print(f"ROC-AUC Score: {roc_auc}")

In [None]:
# Checking the Confusion Matrix of the fitted model on the test data
# (the trailing semicolon suppresses the display object's text repr, as in the LogisticRegression section)
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test);

---

## **MultinomialNB**

In [None]:
# Building a MultinomialNB classifier and fitting it on the training features
smoothing_alpha = 0.001
model = MultinomialNB(alpha=smoothing_alpha)
model.fit(X_train, y_train)

In [None]:
# Making predictions: encoded class labels for the test features
y_pred = model.predict(X_test)

In [None]:
# Evaluating the model: accuracy, class-weighted precision/recall/F1,
# and one-vs-rest ROC-AUC computed from the predicted class probabilities
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
precision = precision_score(y_test, y_pred, average='weighted')
print(f"Precision: {precision}")
recall = recall_score(y_test, y_pred, average='weighted')
print(f"Recall: {recall}")
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1-Score: {f1}")
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test), average='weighted', multi_class='ovr')
print(f"ROC-AUC Score: {roc_auc}")

In [None]:
# Checking the Confusion Matrix of the fitted model on the test data
# (the trailing semicolon suppresses the display object's text repr, as in the LogisticRegression section)
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test);

---

# Insights


After evaluating the performance of the four models, the **RandomForestClassifier (RFC)** stands out as the top choice for this task. Here's why:

- **RFC** outperforms all other models with the highest **Accuracy (0.969)**, **Precision (0.969)**, **Recall (0.969)**, **F1-Score (0.969)**, and the best **ROC-AUC Score (0.998)**. This demonstrates that the model is exceptionally effective at both classifying correctly and distinguishing between classes, showing high consistency across multiple metrics.

- **Logistic Regression** also delivers strong performance with **Accuracy (0.959)**, **Precision (0.959)**, **Recall (0.959)**, and **F1-Score (0.959)**. It achieves a solid **ROC-AUC Score (0.993)**, but it still trails behind **RFC** in terms of overall performance, particularly in distinguishing between classes as effectively.

- **SGDClassifier** shows good performance, with **Accuracy (0.903)**, **Precision (0.906)**, and **Recall (0.903)**. However, it lags behind both **RFC** and **Logistic Regression** on each of these metrics (no ROC-AUC score was computed for it), making it less optimal for this task.

- **MultinomialNB** provides decent results, especially in **Precision (0.887)**, but it still lags behind **RFC** and **Logistic Regression** in terms of overall performance, with **Accuracy (0.882)** and **ROC-AUC Score (0.978)**, indicating it's less effective at capturing the patterns compared to the other models.

In conclusion, while **Logistic Regression** and **SGDClassifier** are solid performers, **RandomForestClassifier** emerges as the clear leader due to its overall superior performance, particularly in accuracy and class distinction.
