#### Week 8 Milestone 3 Author: Rex Gayas Course & Section: DSC360-T301 Data Mining: Text Analytics an (2243-1) Date: 04 FEB 2024

##### Prepare Data Extraction and Preliminary Feature Engineering

In [18]:
import pandas as pd

# Load the dataset.
# The default location is the author's local copy of the Kaggle file; set the
# TWEETS_CSV_PATH environment variable to point at the CSV on another machine
# instead of editing this cell.
import os

file_path = os.environ.get(
    'TWEETS_CSV_PATH',
    'D:/ALPHA/Dynamic Folder/Bellevue/Winter 2023/Data Mining/Project/Datasets/Kaggle/twitter_sentiment_data.csv',
)
tweets_df = pd.read_csv(file_path)

# Display the first few rows of the dataframe
print(tweets_df.head())


   sentiment                                            message  \
0         -1  @tiniebeany climate change is an interesting h...   
1          1  RT @NatGeoChannel: Watch #BeforeTheFlood right...   
2          1  Fabulous! Leonardo #DiCaprio's film on #climat...   
3          1  RT @Mick_Fanning: Just watched this amazing do...   
4          2  RT @cnalive: Pranita Biswasi, a Lutheran from ...   

              tweetid  
0  792927353886371840  
1  793124211518832641  
2  793124402388832256  
3  793124635873275904  
4  793125156185137153  


In [19]:
# Confirm the column labels before later cells reference them by name
print(tweets_df.columns)


Index(['sentiment', 'message', 'tweetid'], dtype='object')


##### Corpus Normalization

In [21]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stop words from NLTK, kept as a set because clean_tweet tests
# every token for membership in it
english_stop_list = stopwords.words('english')
stop_words = set(english_stop_list)

def clean_tweet(message):
    """Normalize a raw tweet into a space-joined string of lowercase tokens.

    Steps, in order: lowercase the text, strip URLs, strip @mentions and the
    '#' character (the hashtag word itself is kept), tokenize with
    ``word_tokenize`` and drop every token found in the module-level
    ``stop_words`` set. Punctuation tokens are kept.

    Args:
        message: Raw tweet text. Any non-string value (for example the NaN
            that pandas stores for an empty CSV cell) is treated as an
            empty tweet instead of raising AttributeError.

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        there is no text to clean.
    """
    # Missing values reach this function as float NaN / None when it is
    # applied to a DataFrame column, and they have no .lower()
    if not isinstance(message, str):
        return ''
    # Convert to lowercase
    message = message.lower()
    # Remove URLs ('https...' is already matched by 'http\S+')
    message = re.sub(r'http\S+|www\S+', '', message)
    # Remove @mentions entirely; for hashtags only the '#' sign is dropped
    message = re.sub(r'@\w+|#', '', message)
    # Tokenize the message and remove stop words
    kept_tokens = [word for word in word_tokenize(message) if word not in stop_words]
    return ' '.join(kept_tokens)

# Load the dataset.
# The default location is the author's local copy of the Kaggle file; set the
# TWEETS_CSV_PATH environment variable to point at the CSV on another machine
# instead of editing this cell.
import os

file_path = os.environ.get(
    "TWEETS_CSV_PATH",
    "D:\\ALPHA\\Dynamic Folder\\Bellevue\\Winter 2023\\Data Mining\\Project\\Datasets\\Kaggle\\twitter_sentiment_data.csv",
)
tweets_df = pd.read_csv(file_path)

# Apply the cleaning function to the message text
tweets_df['cleaned_text'] = tweets_df['message'].apply(clean_tweet)

# Verify the cleaned text by showing raw and cleaned versions side by side
print(tweets_df[['message', 'cleaned_text']].head())


                                             message  \
0  @tiniebeany climate change is an interesting h...   
1  RT @NatGeoChannel: Watch #BeforeTheFlood right...   
2  Fabulous! Leonardo #DiCaprio's film on #climat...   
3  RT @Mick_Fanning: Just watched this amazing do...   
4  RT @cnalive: Pranita Biswasi, a Lutheran from ...   

                                        cleaned_text  
0  climate change interesting hustle global warmi...  
1  rt : watch beforetheflood right , travels worl...  
2  fabulous ! leonardo dicaprio 's film climate c...  
3  rt : watched amazing documentary leonardodicap...  
4  rt : pranita biswasi , lutheran odisha , gives...  


##### Bag of Words

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a bag-of-words model on the cleaned tweets and build the
# document-term count matrix
bow_docs = tweets_df['cleaned_text']
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(bow_docs)

# Spot-check the learned vocabulary (first 10 terms only) and the matrix dimensions
bow_vocabulary = vectorizer.get_feature_names_out()
print("Feature names:", bow_vocabulary[:10])
print("Bag of Words Feature Shape:", X_bow.shape)


Feature names: ['00' '000' '00005' '000058' '000s' '000yr' '001' '00322' '005c' '00pm']
Bag of Words Feature Shape: (43943, 31523)


##### TF-IDF

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model on the cleaned tweets and build the weighted
# document-term matrix
tfidf_docs = tweets_df['cleaned_text']
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(tfidf_docs)

# Spot-check the learned vocabulary (first 10 terms only) and the matrix dimensions
tfidf_vocabulary = tfidf_vectorizer.get_feature_names_out()
print("Feature names:", tfidf_vocabulary[:10])
print("TF-IDF Feature Shape:", X_tfidf.shape)


Feature names: ['00' '000' '00005' '000058' '000s' '000yr' '001' '00322' '005c' '00pm']
TF-IDF Feature Shape: (43943, 31523)


##### Bag of N-Grams

In [24]:
# Count unigrams, bigrams and trigrams: ngram_range=(1, 3)
ngram_docs = tweets_df['cleaned_text']
ngram_vectorizer = CountVectorizer(ngram_range=(1, 3))
X_ngram = ngram_vectorizer.fit_transform(ngram_docs)

# Spot-check the learned n-gram vocabulary (first 10 entries only) and the matrix dimensions
ngram_vocabulary = ngram_vectorizer.get_feature_names_out()
print("Feature names:", ngram_vocabulary[:10])
print("Bag of N-Grams Feature Shape:", X_ngram.shape)


Feature names: ['00' '00 19' '00 19 30' '00 appropriate' '00 appropriate culture'
 '00 gets' '00 gets hotter' '00 hrs' '00 hrs stockholmact' '00 pm']
Bag of N-Grams Feature Shape: (43943, 467247)


##### Topic Modeling

In [25]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of latent topics the model should extract
n_topics = 5

# LDA works with integer term counts, so it is trained on the Bag of Words
# matrix (X_bow) rather than on the TF-IDF weights.
# random_state is fixed so repeated runs use the same seed.
lda = LatentDirichletAllocation(
    n_components=n_topics,
    random_state=42,
)
lda.fit(X_bow)

# Function to print the top words for each topic
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    One line is printed per row of ``model.components_`` in the form
    ``Topic #<index>: <term> <term> ...``, with terms ordered from the
    largest weight downwards, followed by a single blank line.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()``.
        feature_names: Sequence mapping a weight's position to its term.
        n_top_words: How many terms to show for each topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries
        ranked_ids = weights.argsort()[::-1][:n_top_words]
        top_terms = " ".join(feature_names[term_id] for term_id in ranked_ids)
        print("Topic #%d: %s" % (topic_idx, top_terms))
    print()

# Show the 10 highest-weighted Bag of Words terms for each LDA topic
bow_feature_names = vectorizer.get_feature_names_out()
print_top_words(lda, bow_feature_names, n_top_words=10)


Topic #0: climate change rt believe amp going world die thinking fight
Topic #1: climate change rt trump action threat planet national via future
Topic #2: climate change rt trump us epa new pruitt scott ðÿ
Topic #3: global warming rt real believe people say like weather man
Topic #4: climate change rt trump via scientists real amp hoax science



##### Clustering

In [16]:
from sklearn.cluster import KMeans

# Number of clusters chosen
n_clusters = 5

# Perform K-Means clustering on the TF-IDF data.
# n_init is passed explicitly: scikit-learn releases whose default is still
# 10 restarts emit a FutureWarning when it is omitted, because the default
# changes to 'auto' in 1.4. Pinning it to 10 keeps the behaviour of those
# releases and makes the result independent of the installed default.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
kmeans.fit(X_tfidf)

# Assign tweets to clusters
tweets_df['cluster'] = kmeans.labels_

# Check the size of each cluster
print(tweets_df['cluster'].value_counts())

# Examine cluster centroids: for every cluster, list the ten terms with the
# largest centroid weight
print("Top terms per cluster:")
# argsort is ascending, so reverse each row to get term indices by descending weight
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = tfidf_vectorizer.get_feature_names_out()
for i in range(n_clusters):
    top_ten_words = [terms[ind] for ind in order_centroids[i, :10]]
    print("Cluster {}: {}".format(i, ' '.join(top_ten_words)))


  super()._check_params_vs_input(X, default_n_init=10)


cluster
4    28547
2     9524
3     4267
0      880
1      725
Name: count, dtype: int64
Top terms per cluster:
Cluster 0: epa pruitt scott chief head dioxide primary carbon rt contributor
Cluster 1: going husband thinking believe die rt tã mr millions elect
Cluster 2: warming global rt real believe think like us people weather
Cluster 3: trump climate change rt donald china hoax thinks president via
Cluster 4: climate change rt amp world via us real new fight
