In [58]:
# For suppressing warnings
from warnings import filterwarnings
filterwarnings('ignore')


# Importing required modules
import re
from string import punctuation
import numpy as np
import pandas as pd


# For NLP
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# For ML
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import ConfusionMatrixDisplay

In [59]:
# Reading data from the file: "twitter_training.csv"
train_filename = "data/data/twitter_training.csv"
train_filename = "dataset_.csv"

df_train = pd.read_csv(train_filename, names = ['label', 'tweet', 'labeSl', 'tweAet'])

df_train.head()

Unnamed: 0,label,tweet,labeSl,tweAet
0,Label,Text,,
1,1,im getting borderlands kill,,
2,1,im coming borderlands murder,,
3,1,im getting borderlands murder,,
4,1,im getting borderlands murder,,


In [60]:
# Reading data from the file: "twitter_validation.csv"
test_filename = "data/data/twitter_validation.csv"
df_test = pd.read_csv(test_filename, names = ['id', 'game/company', 'label', 'tweet'])
df_test.head()

Unnamed: 0,id,game/company,label,tweet
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [61]:
print(df_train.shape)
print(df_test.shape)

(71973, 4)
(1000, 4)


In [62]:
# Describing the data
df_train.describe(include='object').T

Unnamed: 0,count,unique,top,freq
label,71973,5,0,21811
tweet,71973,61740,time despite fact currently million people liv...,172


In [63]:
# Checking the datatypes
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71973 entries, 0 to 71972
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   label   71973 non-null  object 
 1   tweet   71973 non-null  object 
 2   labeSl  0 non-null      float64
 3   tweAet  0 non-null      float64
dtypes: float64(2), object(2)
memory usage: 2.2+ MB


In [64]:
df_train.isnull().sum()

label         0
tweet         0
labeSl    71973
tweAet    71973
dtype: int64

In [65]:
df_train[df_train['tweet'].isnull()]

Unnamed: 0,label,tweet,labeSl,tweAet


In [66]:
# Dropping the rows with null tweets
df_train = df_train.dropna()
df_train.isnull().sum()

label     0
tweet     0
labeSl    0
tweAet    0
dtype: int64

In [67]:
print(df_test.describe(include='object').T)
print(df_test.info())

             count unique                     top freq
game/company  1000     32  RedDeadRedemption(RDR)   40
label         1000      4                 Neutral  285
tweet         1000    999                     Wow    2
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            1000 non-null   int64 
 1   game/company  1000 non-null   object
 2   label         1000 non-null   object
 3   tweet         1000 non-null   object
dtypes: int64(1), object(3)
memory usage: 31.4+ KB
None


In [68]:
df_test[df_test['tweet'].isnull()]

Unnamed: 0,id,game/company,label,tweet


In [75]:
# Creating a function for lowercasing the tweets
def convert_to_lowercase(tweets):
    """
    Function for converting the text in the input series to lowercase.

    Parameter:
        - tweets: A Pandas Series containing text data.

    Returns:
        - A Pandas Series with all text in lowercase.
    """

    return tweets.str.lower()
convert_to_lowercase(df_train['tweet'][:10])

Series([], Name: tweet, dtype: object)

In [76]:
# Checking if we have any HTML/XML tags in tweets
df_train[df_train['tweet'].str.contains('<.*?>')]

Unnamed: 0,label,tweet,labeSl,tweAet


In [71]:
def remove_html_tags(tweets):
    """
    Function for removing HTML/XML tags from the input series.

    Parameter:
        - tweets: A Pandas Series containing text with HTML/XML tags.

    Returns:
        - A Pandas Series with HTML/XML tags removed.
    """

    return tweets.str.replace("<.*?>", "", regex=True)

In [72]:
remove_html_tags(df_train[df_train['tweet'].str.contains('<.*?>', regex = True)]['tweet'][:10])

Series([], Name: tweet, dtype: object)

In [73]:
remove_html_tags(df_train[df_train['tweet'].str.contains('<.*?>', regex = True)]['tweet'][:10])

Series([], Name: tweet, dtype: object)

In [77]:
# Creating a function for removing urls from the tweets
def remove_urls(tweets):
    """
    Function for removing URLs from the input series.

    Parameter:
        - tweets: A Pandas Series containing text with URLs.

    Returns:
        - A Pandas Series with URLs removed.
    """

    return tweets.str.replace(r"https?://\S+|www\.\S+", "", regex=True)

# Testing remove_urls function
# remove_urls(df_train[df_train['tweet'].str.contains(r'https?://\S+|www\.\S+', regex = True)]['tweet'][:10]).iloc[0]

In [78]:
# Creating a function for removing punctuations from the tweets
def remove_punctuations(tweets):
    """
    Function for removing punctuation from the input series.

    Parameter:
        - tweets: A Pandas Series containing text with punctuation.

    Returns:
        - A Pandas Series with punctuation removed.
    """

    return tweets.str.translate(str.maketrans("", "", punctuation))

In [79]:
# Dictionary of common chat abbreviations and their full forms.
# This dictionary contains popular shorthand used in text messaging and online chatting.
# Each key is a chat abbreviation, and the corresponding value is the full phrase or meaning.
# Reference: https://github.com/rishabhverma17/sms_slang_translator/blob/master/slang.txt

chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "ILU: I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don't care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can't stop laughing",
    "L8": "Late",
    "SMH": "Shaking My Head",
    "YOLO": "You Only Live Once",
    "TLDR": "Too Long; Didn't Read",
    "FOMO": "Fear Of Missing Out",
    "IDK": "I Don't Know",
    "BFFL": "Best Friends For Life",
    "TMI": "Too Much Information",
    "DM": "Direct Message",
    "STFU": "Shut The F... Up",
    "WTH": "What The Heck",
    "LMAOROTF": "Laughing My A... Off Rolling On The Floor",
    "PPL": "People",
    "SFLR": "Sorry For Late Reply",
    "G2G": "Got To Go",
    "S2R": "Send To Receive"
}

In [80]:
# Creating a function for replacing chat abbreviations with their full meanings
def apply_chat_word_map(tweets):
    """
    Function for replacing abbreviations in the text with their full meanings using a chat word dictionary.

    Parameters:
        - tweets: A pandas Series containing tweets with abbreviations.

    Returns:
        - A pandas Series with abbreviations replaced by their full meanings. 
    """
    return tweets.apply(lambda text: " ".join([chat_words.get(word.upper(), word) for word in text.split()]))

In [81]:
# Testing the apply_chat_word_map function
print(apply_chat_word_map(pd.Series(['IMHO he is the best']))[0])
print(apply_chat_word_map(pd.Series(['FYI Delhi is the capital of India']))[0])

In My Honest/Humble Opinion he is the best
For Your Information Delhi is the capital of India


In [82]:
# Creating a function for removing stop words from the tweets
def remove_stopwords(tweets):
    """
    Function for removing common stopwords from the tweets.

    Parameters:
        - tweets: A pandas Series containing tweets with potential stopwords.

    Returns:
        - A pandas Series with stopwords removed from each tweet. 
    """
    stop_words = set(stopwords.words("english"))
    return tweets.apply(lambda text: " ".join([word for word in text.split() if word not in stop_words]))

In [83]:
# Testing remove_stopwords function
print(remove_stopwords(pd.Series(['I am coming to the borders and I will kill you...']))[0])
print(remove_stopwords(pd.Series(['im coming on borderlands and i will murder you...']))[0])
print(remove_stopwords(pd.Series(['So I spent a few hours making something for fu...']))[0])
print(remove_stopwords(pd.Series(['So I spent a couple of hours doing something f...']))[0])

I coming borders I kill you...
im coming borderlands murder you...
So I spent hours making something fu...
So I spent couple hours something f...


In [None]:
# Regex to finding emojis
emoji_pattern = (
        "[\U0001F600-\U0001F64F" # Emoticons
        "\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs
        "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
        "\U0001F700-\U0001F77F"  # Alchemical Symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"  # Enclosed characters
        "]+")

# Checking if we have emojis in our dataset
df_train[df_train['tweet'].str.contains(emoji_pattern, regex = True)]['tweet']

576      Top 4 favourite games you say? 🤔. . Sea of Thi...
579      Top 4 favourite games you say? 🤔.. Sea of Thie...
648      Morning~!!. I'm split on playing PSO2 or Borde...
651      Morning~!!. I'm split on playing PSO2 or Borde...
652      1 Morning~!!. I'm split on playing PSO2 or Bor...
                               ...                        
74196    @nvidia🤔 why? But why?  We buy 2 cards convinc...
74199    @nvidia🤔 why? But why?  We buy 2 teams convinc...
74220    Let’s just say Nvidia users can be blessed wit...
74223    Let’s all say Nvidia users can be blessed with...
74637                           Love EVERYTHING about it �
Name: tweet, Length: 848, dtype: object

In [85]:
# Creating a function for removing emojis from the tweets
def remove_emojis(tweets):
    """
    Function to remove emojis from tweets.

    Parameters:
        - tweets: A pandas Series containing tweets with possible emojis.

    Returns:
        - A pandas Series with emojis removed from each tweet.
    """
    return tweets.str.replace(emoji_pattern, "", regex=True)

In [86]:
# Testing remove_emojis function
remove_emojis(df_train[df_train['tweet'].str.contains(emoji_pattern, regex = True)]['tweet'][:10])

Series([], Name: tweet, dtype: object)

In [87]:
# Creating a function for removing extra whitespaces from the tweets
def remove_extra_whitespaces(tweets):
    """
    Function for removing extra whitespaces from the tweets.

    Parameters:
        - tweets: A pandas Series containing tweets with extra whitespaces.

    Returns:
        - A pandas Series with extra whitespaces removed.
    """
    
    # Replacing multiple whitespaces with a single space
    tweets = tweets.str.replace(r"\s+", " ", regex=True)
    
    # Trimming leading and trailing whitespaces
    tweets = tweets.str.strip()
    
    return tweets

In [88]:
# Creating a function for lemmatization tweets
def lemmatize_text(tweets):
    """
    Function for lemmatizing each word of the tweets.

    Parameters:
        - tweets: A pandas Series containing tweets.

    Returns:
        - A pandas Series with lemmatized tweets.
    """
    lemmatizer = WordNetLemmatizer()
    return tweets.apply(lambda text: " ".join([lemmatizer.lemmatize(word) for word in word_tokenize(text)]))

In [89]:
# Creatting a pipeline to process the tweets data
tweet_processing_pipeline = Pipeline([
    ('convert_to_lowercase', FunctionTransformer(convert_to_lowercase)),
    ('remove_html_tags', FunctionTransformer(remove_html_tags)),
    ('remove_urls', FunctionTransformer(remove_urls)),
    ('remove_emojis', FunctionTransformer(remove_emojis)),
    ('chat_word_map', FunctionTransformer(apply_chat_word_map)),
    ('remove_punctuations', FunctionTransformer(remove_punctuations)),
    ('remove_stopwords', FunctionTransformer(remove_stopwords)),
    ('remove_extra_space', FunctionTransformer(remove_extra_whitespaces)),
    ('lemmatize', FunctionTransformer(lemmatize_text)),
    ('TF-Idf_train', TfidfVectorizer())
])

In [90]:
# Splitting the data into X_train and y_train
X_train = df_train['tweet']
y_train = df_train['label']

# Splitting the data into X_test and y_test
X_test = df_test['tweet']
y_test = df_test['label']

In [91]:
# Encoding the target variable
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

ValueError: y contains previously unseen labels: 'Irrelevant'

In [92]:
# Reshaping y into (n, 1) array for machine learning models
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

AttributeError: 'Series' object has no attribute 'reshape'

In [None]:
# Checking which class is assigned to which encoded label
dict(zip(encoder.classes_, range(len(encoder.classes_))))

{'Irrelevant': 0, 'Negative': 1, 'Neutral': 2, 'Positive': 3}

In [93]:
import nltk
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/thientran/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/thientran/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [94]:
# Transforming train and test data
X_train = tweet_processing_pipeline.fit_transform(X_train)
X_test = tweet_processing_pipeline.transform(X_test)

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
X_train.shape, X_test.shape

((73996, 36571), (1000, 36571))

In [None]:
# Creating an instance of LogisticRegression and training it
model = LogisticRegression(C = 10, solver='saga', max_iter=10000, random_state=101)
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
# Evaluating the model
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted')}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted')}")
print(f"F1-Score: {f1_score(y_test, y_pred, average='weighted')}")
print(f"ROC-AUC Score: {roc_auc_score(y_test, model.predict_proba(X_test), average='weighted', multi_class='ovr')}")

Accuracy: 0.959
Precision: 0.9594014502981741
Recall: 0.959
F1-Score: 0.9590077778303798
ROC-AUC Score: 0.9930701159064427


In [None]:
type(X_test), type(y_test)
X_train.head()
y_train.show()

AttributeError: 'csr_matrix' object has no attribute 'head'

In [None]:
samples = [
        ("I absolutely love this product, it’s fantastic!", 1),
        ("Terrible customer service, I’m very disappointed.", 0),
        ("The item arrived as described, nothing more.", 2),
        ("What’s the weather like today?", 3),
        ("Best purchase I’ve made this year!", 1),
        ("I don’t hate it, but I wouldn’t buy it again.", 2),
        ("Broken when arrived. Waste of money!", 0),
        ("Can someone explain how this works?", 3),
        ("Decent quality for the price, meets expectations.", 2),
        ("Super fast delivery and excellent packaging!", 1)
    ]

In [None]:
# Sample test cases
test_samples = [
    "I absolutely love this product, it’s fantastic!",  # Expected: Positive
    "Terrible customer service, I’m very disappointed.",  # Expected: Negative
    "The item arrived as described, nothing more.",  # Expected: Neutral
    "What’s the weather like today?",  # Expected: Irrelevant
    "Best purchase I’ve made this year!",  # Expected: Positive
    "Broken when arrived. Waste of money!",  # Expected: Negative
    "Can someone explain how this works?",  # Expected: Irrelevant
    "Decent quality for the price, meets expectations.",  # Expected: Neutral
    "Super fast delivery and excellent packaging!",  # Expected
    "Positive",
    "I don’t hate it, but I wouldn’t buy it again.",  # Expected: Neutral
    "I love the design, but the functionality is lacking.",  # Expected: Neutral
    "The product is okay, but I expected more.",  # Expected: Neutral
    # "I’m not sure if I like it or not.",  # Expected: Neutral
    # "This is the worst experience I’ve ever had.",  # Expected: Negative
    # "I’m really happy with my purchase!",  # Expected: Positive
    # "The instructions were unclear and confusing.",  # Expected: Negative
    # "I would recommend this to my friends.",  # Expected: Positive
    # "It’s not what I expected, but it’s still good.",  # Expected

]


# Preprocess the test samples using the pipeline
processed_samples = tweet_processing_pipeline.transform(pd.Series(test_samples))

# Predict using the trained model
predicted_labels = model.predict(processed_samples)

# Decode the predicted labels back to their original form
decoded_labels = encoder.inverse_transform(predicted_labels)

# Display the results
for i, sample in enumerate(test_samples):
    print(f"Tweet: {sample}")
    print(f"Predicted Label: {decoded_labels[i]}")
    print("-" * 50)

Tweet: I absolutely love this product, it’s fantastic!
Predicted Label: Positive
--------------------------------------------------
Tweet: Terrible customer service, I’m very disappointed.
Predicted Label: Negative
--------------------------------------------------
Tweet: The item arrived as described, nothing more.
Predicted Label: Neutral
--------------------------------------------------
Tweet: What’s the weather like today?
Predicted Label: Positive
--------------------------------------------------
Tweet: Best purchase I’ve made this year!
Predicted Label: Positive
--------------------------------------------------
Tweet: Broken when arrived. Waste of money!
Predicted Label: Negative
--------------------------------------------------
Tweet: Can someone explain how this works?
Predicted Label: Negative
--------------------------------------------------
Tweet: Decent quality for the price, meets expectations.
Predicted Label: Positive
------------------------------------------------

In [None]:
import pandas as pd
from scipy.sparse import csr_matrix

# Giả sử bạn đã có X_test (csr_matrix) và y_test (ndarray)
# Chuyển sparse matrix sang dense (numpy array)
X_dense = X_test.toarray()

# Gộp X và y lại
import numpy as np
data = np.hstack((X_dense, y_test.reshape(-1, 1)))

# Tạo tên cột
columns = [f'feature_{i}' for i in range(X_dense.shape[1])] + ['target']

# Tạo DataFrame và lưu
df = pd.DataFrame(data, columns=columns)
df.to_csv('x_y_test.csv', index=False)

print("Đã lưu file x_y_test.csv từ sparse matrix thành công.")


KeyboardInterrupt: 