In [4]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [5]:
# Read the source CSV into the working DataFrame
DATA_PATH = 'ST.csv'  # relative path; resolved against the current working directory
df = pd.read_csv(DATA_PATH)

In [6]:
# Tally the null entries in each column
missing_values = df.isna().sum()

# Report the per-column totals under a heading
print("Missing values per column:", missing_values, sep="\n")

Missing values per column:
SR.No        0
text         0
Timestamp    0
Username     0
Platform     0
Hashtags     0
Retweets     0
Likes        0
Country      0
dtype: int64


In [7]:
# Text preprocessing function
_STOP_WORDS_CACHE = {}  # language name -> frozenset of stopwords, filled on first use


def _stop_words_for(language):
    """Return the NLTK stopword set for `language`, loading it at most once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess_text(text):
    """Normalise one raw post into a space-joined string of non-stopword tokens.

    Steps, in order: drop URLs, drop @mentions and '#' characters, drop
    non-ASCII characters, lowercase, tokenize with NLTK's ``word_tokenize``,
    and remove English stopwords.

    Parameters
    ----------
    text : object
        Raw text. Non-string values are converted with ``str()``; missing
        values (None / NaN / pd.NA) yield an empty string.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # A missing value would otherwise be stringified into a literal token
    # such as "nan" or "none" and leak into the vocabulary.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ''
    text = re.sub(r'http\S+', '', str(text))  # Remove URLs
    # Remove @mentions entirely; for hashtags only the '#' is stripped, the tag word stays.
    text = re.sub(r'@\w+|\#', '', text)
    # Drop every non-ASCII character (this is what removes emojis).
    text = text.encode('ascii', 'ignore').decode('ascii')
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize
    # Cached lookup: the stopword corpus is not re-read for every row.
    stop_words = _stop_words_for('english')
    return ' '.join(token for token in tokens if token not in stop_words)


In [8]:
# Run every raw post through the cleaner and store the result in a new column
df['cleaned_text'] = df['text'].map(preprocess_text)

In [9]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_text'])

# Latent Dirichlet Allocation (LDA) for topic modeling
# NOTE(review): LDA is fitted on TF-IDF weights here; it is usually applied to
# raw term counts -- confirm this is intended.
N_TOPICS = 2  # single place to change the number of topics
lda_model = LatentDirichletAllocation(n_components=N_TOPICS, random_state=42)
lda_matrix = lda_model.fit_transform(tfidf_matrix)

# Add one LDA topic column per fitted topic: topic_1_prob, topic_2_prob, ...
# (no sentiment score is computed in this cell)
for topic_idx in range(lda_matrix.shape[1]):
    df[f'topic_{topic_idx + 1}_prob'] = lda_matrix[:, topic_idx]

# Display the dataframe with engineered features
print(df)


     SR.No                                               text  \
0        1   Enjoying a beautiful day at the park!        ...   
1        2   Traffic was terrible this morning.           ...   
2        3   Just finished an amazing workout! 💪          ...   
3        4   Excited about the upcoming weekend getaway!  ...   
4        5   Trying out a new recipe for dinner tonight.  ...   
..     ...                                                ...   
727    729  Collaborating on a science project that receiv...   
728    730  Attending a surprise birthday party organized ...   
729    731  Successfully fundraising for a school charity ...   
730    732  Participating in a multicultural festival, cel...   
731    733  Organizing a virtual talent show during challe...   

            Timestamp                               Username     Platform  \
0    15-01-2023 12:30                          User123          Twitter     
1    15-01-2023 08:45                          CommuterX        T