In [39]:
# Ask the user for a local file via google.colab's `files.upload()`;
# `uploaded` keeps whatever that call returns.
# NOTE(review): the cell output shows this upload was saved as "sentiment (2).csv",
# presumably because a "sentiment.csv" already existed in the working directory.
from google.colab import files
uploaded = files.upload()

Saving sentiment.csv to sentiment (2).csv


In [40]:
# Data Handling
import pandas as pd
import numpy as np

# Natural Language Processing (NLP)
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from wordcloud import WordCloud

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Power BI Integration
import openpyxl  # For saving data in Excel format (for Power BI)

In [41]:
# Load the dataset
import io

# Parse the bytes returned by files.upload() instead of re-opening
# "sentiment.csv" from disk: when a file with that name already exists, Colab
# saves the new upload under a different name (e.g. "sentiment (2).csv"), so
# reading "sentiment.csv" would silently load the older copy.
if uploaded:
    # Prefer the entry keyed by the expected name; otherwise use the first upload.
    csv_bytes = uploaded.get("sentiment.csv", next(iter(uploaded.values())))
    df = pd.read_csv(io.BytesIO(csv_bytes))
else:
    # Nothing was uploaded in this session: fall back to the file on disk.
    df = pd.read_csv("sentiment.csv")
df.head()


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,2,2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19


#### The Social Media Sentiments Analysis Dataset captures a vibrant tapestry of emotions, trends, and interactions across various social media platforms. This dataset provides a snapshot of user-generated content, encompassing text, timestamps, hashtags, countries, likes, and retweets. Each entry unveils unique stories—moments of surprise, excitement, admiration, thrill, contentment, and more—shared by individuals worldwide.

### **Key Features**  

| **Feature**   | **Description**  |  
|--------------|----------------|  
| **Text**        | User-generated content showcasing sentiments |  
| **Sentiment**   | Categorized emotions (e.g., Positive, Negative, Neutral) |  
| **Timestamp**   | Date and time information of the post |  
| **User**        | Unique identifier of users contributing to the trend |  
| **Platform**    | Social media platform where the content originated (e.g., Twitter, Instagram, Facebook) |  
| **Hashtags**    | Identifies trending topics and themes |  
| **Likes**       | Quantifies user engagement (number of likes) |  
| **Retweets**    | Reflects content popularity (number of retweets/shares) |  
| **Country**     | Geographical origin of each post |  
| **Year**        | Year when the post was published |  
| **Month**       | Month when the post was published |  
| **Day**         | Day when the post was published |  
| **Hour**        | Hour of the day when the post was published |  

In [42]:
# Discard every column whose label starts with "Unnamed"
# (presumably index columns saved by earlier CSV exports).
unnamed_cols = df.columns.str.contains("^Unnamed")
df = df.loc[:, ~unnamed_cols]
df.head()

Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19


In [43]:
#Kenya data
# Boolean mask of posts from Kenya; .sum() counts the matching rows.
# Strip surrounding whitespace before comparing: the Text values in this file
# are padded with spaces, and Country appears to be padded the same way, so an
# exact match on the raw value can miss rows.
kenya_data = df["Country"].str.strip() == "Kenya"
kenya_data.sum()

np.int64(0)

In [44]:
#US data
# Boolean mask of posts from the USA; .sum() counts the matching rows.
# Strip surrounding whitespace before comparing: the Text values in this file
# are padded with spaces, and Country appears to be padded the same way, so an
# exact match on the raw value can miss rows.
US_data = df["Country"].str.strip() == "USA"
US_data.sum()

np.int64(9)

In [45]:
# Count rows flagged by df.duplicated() (all columns are compared).
df.duplicated().sum()

np.int64(20)

In [46]:
#remove duplicates
# Rebinds `df` to the de-duplicated frame; all columns are compared.
df = df.drop_duplicates()

In [47]:
# Re-count duplicates to confirm the previous cell removed them all (expect 0).
df.duplicated().sum()

np.int64(0)

#### Handling Missing Values


In [48]:
 #Remove rows with missing text
# Only the Text column is checked; rows with gaps in other columns are kept.
df = df.dropna(subset=["Text"])


#### Handling emojis

In [49]:
!pip install emoji



In [50]:
# Show the 3rd row's text as it looks before emoji conversion
print("Before Emoji Conversion:", df["Text"].iloc[2], sep="\n")

Before Emoji Conversion:
 Just finished an amazing workout! 💪               


In [51]:
import emoji

def convert_emojis(text):
    """Replace each emoji in `text` with its plain-text name (e.g. "flexed_biceps").

    The names are emitted without the surrounding ":" delimiters by passing
    empty delimiters to `emoji.demojize`. Doing it this way, instead of
    stripping ":" from the result, leaves colons that belong to the original
    text (times such as "12:30", URLs, emoticons such as ":)") untouched.

    Parameters
    ----------
    text : str
        The text to convert.

    Returns
    -------
    str
        `text` with every emoji replaced by its name.
    """
    return emoji.demojize(text, delimiters=("", ""))
# Run the emoji conversion over every entry of the Text column.
df["Text"] = df["Text"].map(convert_emojis)

In [52]:
# Show the 3rd row's text again, now that emojis have been converted to names
print("After Emoji Conversion:", df["Text"].iloc[2], sep="\n")

After Emoji Conversion:
 Just finished an amazing workout! flexed_biceps               


### Handling Stopwords

In [53]:
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords

# Initialize tokenizer and stopwords
# Download the stopword corpus explicitly (as the notebook already does for
# the VADER lexicon) so stopwords.words() does not depend on an earlier
# session having fetched it.
nltk.download("stopwords")
tokenizer = TreebankWordTokenizer()
# A set gives constant-time membership checks inside remove_stopwords().
stop_words = set(stopwords.words("english"))

def remove_stopwords(text):
    """Tokenize `text` and return it with stopwords removed.

    Tokens come from the module-level `tokenizer`; a token is dropped when its
    lower-cased form is in the module-level `stop_words` set. The surviving
    tokens keep their original casing and are joined with single spaces.
    """
    kept = []
    for token in tokenizer.tokenize(text):
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

# Example
# Quick sanity check of remove_stopwords() on a hand-written sentence;
# punctuation comes back as separate tokens because of the tokenizer.
text = "Hello world! This is a test."
clean_text = remove_stopwords(text)
print(clean_text)  # Output: "Hello world ! test ."

Hello world ! test .


In [54]:
# Strip stopwords from every entry of the Text column (overwrites Text).
# NOTE(review): NLTK's English stopword list is believed to include negations
# such as "not"/"no"; dropping them before VADER scoring may flip the polarity
# of negated phrases — confirm this is intended.
df["Text"] = df["Text"].map(remove_stopwords)

In [55]:
# Show the 3rd row's text after stopword removal
print("After stopwords removal:", df["Text"].iloc[2], sep="\n")

After stopwords removal:
finished amazing workout ! flexed_biceps


#### Sentiment analysis using VADER (Valence Aware Dictionary and sEntiment Reasoner)

In [56]:
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

#download VADER lexicon
# (the cell output reports the package was already up to date in this session)
nltk.download('vader_lexicon')

#Initialize Analyzer
# `sia` is the shared analyzer instance used by the scoring cells that follow.
sia = SentimentIntensityAnalyzer()

#function to get sentiment
def get_vader_sentiment(text, analyzer=None, threshold=0.05):
    """Label `text` as "Positive", "Negative" or "Neutral" from its VADER compound score.

    Follows the usual VADER convention: compound >= threshold is positive,
    compound <= -threshold is negative, anything in between is neutral.
    (The negative cut-off must be the *negated* threshold; comparing against
    +threshold labels every neutral text, including a score of 0.0, as
    negative.)

    Parameters
    ----------
    text : str
        The text to score.
    analyzer : object, optional
        Anything exposing ``polarity_scores(text)`` that returns a mapping
        with a "compound" entry. Defaults to the notebook-level `sia`.
    threshold : float, default 0.05
        Absolute compound score at or beyond which a text stops being neutral.

    Returns
    -------
    str
        "Positive", "Negative" or "Neutral".
    """
    if analyzer is None:
        analyzer = sia
    score = analyzer.polarity_scores(text)["compound"]
    if score >= threshold:
        return "Positive"
    if score <= -threshold:
        return "Negative"
    return "Neutral"

# Label every text with get_vader_sentiment().
# Note: this replaces the Sentiment column that was loaded from the CSV.
df["Sentiment"] = df["Text"].map(get_vader_sentiment)

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


### Testing the SIA VADER SCORES

In [57]:
# Three hand-written probes used in the next cell to inspect VADER's scores.
text1, text2, text3 = (
    "I love this product! It's amazing.",
    "This is the worst experience ever.",
    "It's an okay product.",
)

In [58]:
# Print the full VADER score dict for each of the three sample texts.
print("\n".join(
    f"The scores for text {number} are: {sia.polarity_scores(sample)}"
    for number, sample in enumerate((text1, text2, text3), start=1)
))

The scores for text 1 are: {'neg': 0.0, 'neu': 0.266, 'pos': 0.734, 'compound': 0.8516}
The scores for text 2 are: {'neg': 0.451, 'neu': 0.549, 'pos': 0.0, 'compound': -0.6249}
The scores for text 3 are: {'neg': 0.0, 'neu': 0.612, 'pos': 0.388, 'compound': 0.2263}


### Testing for Text column in the data

In [59]:
# Store the full VADER score dict for each text in a new column.
df["Sentiment_Scores"] = df["Text"].apply(sia.polarity_scores)

In [60]:
# Spot-check the scores of the first 5 rows of the Text column
for i, text in enumerate(df["Text"].iloc[:5]):
    scores = sia.polarity_scores(text)
    print(f"Text {i+1} Sentiment Scores: {scores}")

Text 1 Sentiment Scores: {'neg': 0.0, 'neu': 0.209, 'pos': 0.791, 'compound': 0.8221}
Text 2 Sentiment Scores: {'neg': 0.608, 'neu': 0.392, 'pos': 0.0, 'compound': -0.4767}
Text 3 Sentiment Scores: {'neg': 0.0, 'neu': 0.423, 'pos': 0.577, 'compound': 0.6239}
Text 4 Sentiment Scores: {'neg': 0.0, 'neu': 0.527, 'pos': 0.473, 'compound': 0.4003}
Text 5 Sentiment Scores: {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


In [61]:
from google.colab import files

# Save DataFrame as CSV
# index=False keeps the row index out of the file.
# NOTE(review): Sentiment_Scores holds dicts, so it is presumably written as
# their string representation — confirm downstream tools can parse it.
df.to_csv("cleaned_sentiment.csv", index=False)

In [62]:
# Download the preprocessed data
# Uses `files` from google.colab, so this cell only works inside Colab.
files.download("cleaned_sentiment.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>