***Define the Problem Goal #1***


In [None]:
#Import Libraries
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import matplotlib.pyplot as plt


In [None]:
# Download the NLTK resources used by the text-cleaning step below:
#   - 'stopwords': lists of very common words (e.g. "the", "is") that are filtered out
#   - 'wordnet': the English lexical database that WordNetLemmatizer looks words up in
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

***Data Collection & Preparation for the model - Goal #2***

In [None]:
# Load the dataset; the relative path means the CSV must be in the notebook's working directory.
df = pd.read_csv('sentimentdataset.csv')

In [None]:
# List the column names to see which fields the dataset provides.
df.columns


Index(['Unnamed: 0.1', 'Unnamed: 0', 'Text', 'Sentiment', 'Timestamp', 'User',
       'Platform', 'Hashtags', 'Retweets', 'Likes', 'Country', 'Year', 'Month',
       'Day', 'Hour'],
      dtype='object')

In [None]:
# Preview the first rows of the dataset (head() shows 5 rows by default).
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,0,Enjoying a beautiful day at the park!,Positive,15/01/2023 12:30,User123,Twitter,#Nature #Park,15,30,USA,2023,1,15,12
1,1,1,Traffic was terrible this morning.,Negative,15/01/2023 8:45,CommuterX,Twitter,#Traffic #Morning,5,10,Canada,2023,1,15,8
2,2,2,Just finished an amazing workout! ðŸ’ª,Positive,15/01/2023 15:45,FitnessFan,Instagram,#Fitness #Workout,20,40,USA,2023,1,15,15
3,3,3,Excited about the upcoming weekend getaway!,Positive,15/01/2023 18:20,AdventureX,Facebook,#Travel #Adventure,8,15,UK,2023,1,15,18
4,4,4,Trying out a new recipe for dinner tonight.,Neutral,15/01/2023 19:55,ChefCook,Instagram,#Cooking #Food,12,25,Australia,2023,1,15,19


In [None]:
# Check the number of rows and columns as a (rows, columns) tuple.
df.shape

(732, 15)

In [None]:
# Summarize each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0.1  732 non-null    int64 
 1   Unnamed: 0    732 non-null    int64 
 2   Text          732 non-null    object
 3   Sentiment     732 non-null    object
 4   Timestamp     732 non-null    object
 5   User          732 non-null    object
 6   Platform      732 non-null    object
 7   Hashtags      732 non-null    object
 8   Retweets      732 non-null    int64 
 9   Likes         732 non-null    int64 
 10  Country       732 non-null    object
 11  Year          732 non-null    int64 
 12  Month         732 non-null    int64 
 13  Day           732 non-null    int64 
 14  Hour          732 non-null    int64 
dtypes: int64(8), object(7)
memory usage: 85.9+ KB


***Text Processing - Clean and normalize the text, then lemmatize the words - Goal #3***

In [None]:
# English stopwords from NLTK, stored in a set for fast membership tests in clean_text.
# The lemmatizer reduces each word to its base (dictionary) form.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [None]:
def clean_text(text):
    """Normalize one raw post into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, drop non-ASCII characters (e.g. emojis), strip
    punctuation, strip digits, split on whitespace, remove English stopwords,
    lemmatize each remaining word.

    Relies on the notebook-level ``stop_words`` set and ``lemmatizer`` object
    being defined before this function is called.

    Args:
        text: The raw text. Any non-string value (e.g. ``None`` or a NaN from
            a missing cell) is treated as empty.

    Returns:
        The cleaned text, or ``""`` when the input is not a string or nothing
        is left after cleaning.

    Raises:
        NameError, LookupError, ...: problems with the environment (undefined
            ``stop_words``/``lemmatizer``, missing NLTK data) now propagate.
            A blanket ``except`` would hide them and silently turn every row
            into ``""``.
    """
    # Non-string input is the only case mapped to "" on purpose.
    if not isinstance(text, str):
        return ""

    text = text.lower()  # Convert text to lower case
    text = text.encode("ascii", "ignore").decode()  # Remove non-ASCII characters (e.g., emojis, foreign characters)
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation using a translation table
    text = re.sub(r"\d+", "", text)  # Remove digits

    # split() with no arguments already collapses runs of whitespace and ignores
    # leading/trailing whitespace, so no separate whitespace normalization is needed.
    words = text.split()

    # NOTE(review): punctuation is stripped before this filter, so contractions
    # such as "don't" reach it as "dont" and may not match the stopword list -- confirm.
    words = [word for word in words if word not in stop_words]  # Remove stopwords (common words like "the", "is", etc.)
    words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatize each word (convert to base form, e.g., "cars" -> "car")
    return " ".join(words)

In [None]:
# Create the Cleaned_Text column by running clean_text on every value of Text.
df["Cleaned_Text"] = df["Text"].apply(clean_text)


In [None]:
# The raw Text column is dropped now that Cleaned_Text exists.
# errors="ignore" keeps this cell safe to re-run after the column is already gone.
df = df.drop(columns=["Text"], errors="ignore")
# Preview a few rows instead of printing the entire frame.
df.head()

 Unnamed: 0.1  Unnamed: 0              Sentiment        Timestamp                                    User    Platform                                          Hashtags  Retweets  Likes              Country  Year  Month  Day  Hour                                                                                                                      Cleaned_Text
            0           0             Positive   15/01/2023 12:30                           User123         Twitter           #Nature #Park                                    15     30            USA        2023      1   15    12                                                                                                       enjoying beautiful day park
            1           1             Negative    15/01/2023 8:45                           CommuterX       Twitter           #Traffic #Morning                                 5     10            Canada     2023      1   15     8                                                   

In [None]:
# Shape after adding Cleaned_Text and dropping Text (one column in, one column out).
df.shape

(732, 15)

In [None]:
# Split the column names by dtype:
#   cat_col -> columns stored as 'object' (treated as categorical / text)
#   num_col -> every column whose dtype is not 'object'
cat_col = [name for name, kind in df.dtypes.items() if kind == 'object']
print('Categorical columns :', cat_col)
num_col = [name for name, kind in df.dtypes.items() if kind != 'object']
print('Numerical columns :', num_col)

Categorical columns : ['Sentiment', 'Timestamp', 'User', 'Platform', 'Hashtags', 'Country', 'Cleaned_Text']
Numerical columns : ['Unnamed: 0.1', 'Unnamed: 0', 'Retweets', 'Likes', 'Year', 'Month', 'Day', 'Hour']


In [None]:
# Percentage of missing values per column, rounded to two decimals.
# The mean of the boolean null mask is the fraction of null rows.
df.isnull().mean().mul(100).round(2)

Unnamed: 0,0
Unnamed: 0.1,0.0
Unnamed: 0,0.0
Sentiment,0.0
Timestamp,0.0
User,0.0
Platform,0.0
Hashtags,0.0
Retweets,0.0
Likes,0.0
Country,0.0


In [None]:
# Remove the non-informative 'Unnamed' columns.
# errors='ignore' means a column that is already absent does not raise.
df2 = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

In [None]:
# Preview the first rows after dropping the 'Unnamed' columns.
df2.head()

Unnamed: 0,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour,Cleaned_Text
0,Positive,15/01/2023 12:30,User123,Twitter,#Nature #Park,15,30,USA,2023,1,15,12,enjoying beautiful day park
1,Negative,15/01/2023 8:45,CommuterX,Twitter,#Traffic #Morning,5,10,Canada,2023,1,15,8,traffic terrible morning
2,Positive,15/01/2023 15:45,FitnessFan,Instagram,#Fitness #Workout,20,40,USA,2023,1,15,15,finished amazing workout
3,Positive,15/01/2023 18:20,AdventureX,Facebook,#Travel #Adventure,8,15,UK,2023,1,15,18,excited upcoming weekend getaway
4,Neutral,15/01/2023 19:55,ChefCook,Instagram,#Cooking #Food,12,25,Australia,2023,1,15,19,trying new recipe dinner tonight


In [None]:
# List the remaining column names.
df2.columns

Index(['Sentiment', 'Timestamp', 'User', 'Platform', 'Hashtags', 'Retweets',
       'Likes', 'Country', 'Year', 'Month', 'Day', 'Hour', 'Cleaned_Text'],
      dtype='object')

In [None]:
# Concise summary of the data set: column dtypes and non-null counts.
# NOTE(review): the saved output of this cell lists Cleaned_Sentiment in a reordered
# layout, which only matches df2 after the later sentiment-mapping and column-ordering
# cells. It looks like the cells were run out of order; re-run top to bottom to refresh it.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Cleaned_Text       732 non-null    object
 1   Cleaned_Sentiment  732 non-null    object
 2   User               732 non-null    object
 3   Country            732 non-null    object
 4   Timestamp          732 non-null    object
 5   Platform           732 non-null    object
 6   Hashtags           732 non-null    object
 7   Retweets           732 non-null    int64 
 8   Likes              732 non-null    int64 
 9   Month              732 non-null    int64 
 10  Day                732 non-null    int64 
 11  Year               732 non-null    int64 
 12  Hour               732 non-null    int64 
dtypes: int64(6), object(7)
memory usage: 74.5+ KB


In [None]:
# Sanity-check the numeric columns. Each flag is True only when every row
# passes the corresponding range check.

# Month should be between 1 and 12
valid_months = df2['Month'].between(1, 12).all()

# Day should be between 1 and 31 (not checked against the length of the month)
valid_days = df2['Day'].between(1, 31).all()

# Hour should be between 0 and 23
valid_hours = df2['Hour'].between(0, 23).all()

# Likes and Retweets should be non-negative
valid_likes = (df2['Likes'] >= 0).all()
valid_retweets = (df2['Retweets'] >= 0).all()

# Display the flags so that a failed check is visible in the cell output.
pd.Series(
    {
        'Month in 1-12': valid_months,
        'Day in 1-31': valid_days,
        'Hour in 0-23': valid_hours,
        'Likes >= 0': valid_likes,
        'Retweets >= 0': valid_retweets,
    },
    name='all rows valid',
)

In [None]:
# Show the data type of each column.
df2.dtypes

Unnamed: 0,0
Sentiment,object
Timestamp,object
User,object
Platform,object
Hashtags,object
Retweets,int64
Likes,int64
Country,object
Year,int64
Month,int64


In [None]:
# Keyword rules, checked in this order. The first rule with a keyword found
# anywhere in the label wins, so positive keywords take precedence over
# negative ones, and negative over neutral.
_SENTIMENT_RULES = (
    ("positive", (
        "joy", "happiness", "love", "admiration", "excitement", "hope", "gratitude", "euphoria",
        "affection", "elation", "contentment", "curiosity", "inspiration", "confidence",
        "pride", "enthusiasm", "serenity", "freedom", "kind", "fun", "amusement", "playful", "empowerment",
        "accomplishment", "success", "awe", "wonder", "blessed", "zest", "cheerful", "positive",
    )),
    ("negative", (
        "anger", "fear", "sadness", "disgust", "disappointed", "grief", "despair", "loneliness", "regret",
        "envy", "jealousy", "frustration", "isolation", "boredom", "resentment", "bitterness", "sorrow",
        "hate", "heartbreak", "loss", "betrayal", "devastated", "numbness", "dismay", "desolation", "darkness",
        "negative",
    )),
    # "serenity" is matched by the positive rule first, so it is not repeated here.
    ("neutral", (
        "neutral", "acceptance", "reflection", "ambivalence", "calmness", "contemplation", "mindfulness",
    )),
)


def Cleaned_Sentiment(label):
    """Collapse a fine-grained sentiment label into positive / negative / neutral.

    The label is converted to ``str``, stripped and lowercased, then tested
    against the keyword rules in order. A rule matches when ANY of its keywords
    occurs as a substring of the label.

    Because matching is by substring, "enjoyment" matches "joy", and a label
    such as "hopeless" would match "hope" and be mapped to "positive".
    NOTE(review): confirm that no label in the dataset is misclassified this way.

    Args:
        label: The raw sentiment label. Non-string values are passed through
            ``str()`` first (so ``None`` becomes "none", NaN becomes "nan").

    Returns:
        "positive", "negative" or "neutral". Labels that match no keyword at
        all fall back to "neutral".
    """
    label = str(label).strip().lower()

    for sentiment, keywords in _SENTIMENT_RULES:
        if any(word in label for word in keywords):
            return sentiment

    # No keyword matched: treat the label as neutral.
    return "neutral"

In [None]:
# Map every raw Sentiment label to positive / negative / neutral.
# The mapper defined in this notebook is Cleaned_Sentiment; calling the
# undefined name map_sentiment raises NameError on a fresh kernel.
df2['Cleaned_Sentiment'] = df2['Sentiment'].apply(Cleaned_Sentiment)

In [None]:
# Drop the raw Sentiment column now that Cleaned_Sentiment replaces it.
# errors="ignore" keeps this cell safe to re-run after the column is already gone.
df2 = df2.drop(columns=["Sentiment"], errors="ignore")
# Preview a few rows instead of printing the entire frame.
df2.head()

       Timestamp                                    User    Platform                                          Hashtags  Retweets  Likes              Country  Year  Month  Day  Hour                                                                                                                      Cleaned_Text Cleaned_Sentiment
15/01/2023 12:30                           User123         Twitter           #Nature #Park                                    15     30            USA        2023      1   15    12                                                                                                       enjoying beautiful day park          positive
 15/01/2023 8:45                           CommuterX       Twitter           #Traffic #Morning                                 5     10            Canada     2023      1   15     8                                                                                                          traffic terrible morning          negative
15/01/2023 15

In [None]:
# Confirm that Sentiment is gone and Cleaned_Sentiment is present.
df2.columns

Index(['Timestamp', 'User', 'Platform', 'Hashtags', 'Retweets', 'Likes',
       'Country', 'Year', 'Month', 'Day', 'Hour', 'Cleaned_Text',
       'Cleaned_Sentiment'],
      dtype='object')

In [None]:
# Reorder the columns so the cleaned text and its label come first.
# in_orderby holds the column names in the desired order; selecting with it
# raises KeyError if any listed column is missing from df2.
in_orderby = ['Cleaned_Text','Cleaned_Sentiment', 'User', 'Country','Timestamp','Platform', 'Hashtags', 'Retweets', 'Likes','Month','Day', 'Year',  'Hour' ]
df2 = df2[in_orderby]
# Preview a few rows instead of printing the entire frame.
df2.head()

                                                                                                                     Cleaned_Text Cleaned_Sentiment                                    User              Country        Timestamp    Platform                                          Hashtags  Retweets  Likes  Month  Day  Year  Hour
                                                                                                      enjoying beautiful day park          positive                           User123                  USA       15/01/2023 12:30   Twitter           #Nature #Park                                    15     30      1   15  2023    12
                                                                                                         traffic terrible morning          negative                           CommuterX                Canada     15/01/2023 8:45   Twitter           #Traffic #Morning                                 5     10      1   15  2023     8
             

In [None]:
# Verify that the columns now appear in the requested order.
df2.columns

Index(['Cleaned_Text', 'Cleaned_Sentiment', 'User', 'Country', 'Timestamp',
       'Platform', 'Hashtags', 'Retweets', 'Likes', 'Month', 'Day', 'Year',
       'Hour'],
      dtype='object')