In [1]:
# preprocessing
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# kmeans
import numpy as np
import pandas as pd

# write log file
import logging

# time
import time
from datetime import datetime
import pytz

In [2]:
# Print the version string of the Python interpreter running this kernel.
import sys
print(sys.version)

3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0]


In [3]:
# Download the tweet file. The context manager closes the HTTP response as soon
# as the payload has been read instead of leaving that to garbage collection.
with urlopen("https://raw.githubusercontent.com/SparrowChang/CS6375_assignment3/main/foxnewshealth.txt") as response:
    raw_bytes = response.read()

# NOTE(review): no parser is named here, so the extracted text may depend on
# which HTML parser is installed -- confirm and pin one explicitly.
raw = BeautifulSoup(raw_bytes)
raw_txt = raw.get_text()

# Display only the beginning of the text; echoing the whole file floods the cell output.
raw_txt[:500]



In [4]:
def process_tweet_data(raw_txt):
    """Extract and normalise the tweet text from pipe-delimited tweet records.

    Each line of ``raw_txt`` is expected to look like ``id|timestamp|text``.
    The id and timestamp are discarded; the text is cleaned by removing URLs,
    '#' symbols and '@'-prefixed words, then lower-cased and stripped.

    Args:
        raw_txt (str): Newline-separated records.

    Returns:
        list[str]: One cleaned tweet text per well-formed record, in input
        order. Lines with fewer than three fields (for example the empty
        string left after a trailing newline) are skipped rather than
        raising ``IndexError``.
    """
    lines_clean = []

    for line in raw_txt.split('\n'):
        # maxsplit=2: only the first two '|' separate fields, so a '|' that is
        # part of the tweet text itself is preserved instead of truncating it.
        fields = line.split('|', 2)
        if len(fields) < 3:
            continue  # blank or malformed record -- nothing to clean

        tweet_content = fields[2]
        # Remove URLs (both http:// and https://)
        tweet_content = re.sub(r'https?://\S+', '', tweet_content)
        # Remove hashtag symbols but keep the tagged word
        tweet_content = tweet_content.replace('#', '')
        # Remove words starting with the "@" symbol
        tweet_content = re.sub(r'@\S+', '', tweet_content)
        # Convert to lowercase and trim surrounding whitespace
        lines_clean.append(tweet_content.lower().strip())

    return lines_clean

In [5]:
# Run the cleaning step over the downloaded text.
cleaned_tweet_content = process_tweet_data(raw_txt)

# Echo every cleaned tweet, one per line, to eyeball the preprocessing result.
for cleaned_text in cleaned_tweet_content:
    print(f"Tweet Content: {cleaned_text}")

Tweet Content: injury prevention programs unpopular with high school coaches
Tweet Content: 6 dietary changes to make midlife
Tweet Content: massachusetts governor gets head shaved to support charity
Tweet Content: dad wins 3 marathons in 8 days; winnings to help ailing son
Tweet Content: possible cure for melanoma?
Tweet Content: wear orange glasses to get better sleep, study says
Tweet Content: utah girl, 17, has rare allergy to water
Tweet Content: higher purpose in life tied to better brain health
Tweet Content: kids may be more likely to use customized playgrounds
Tweet Content: marijuana harvested for medical use in chile
Tweet Content: museums offer array of programs for the disabled
Tweet Content: girl born with heart on wrong side of body prepares for 8th birthday
Tweet Content: boy gets food allergies from blood transfusion
Tweet Content: chinese citizens sue government over transparency on monsanto herbicide
Tweet Content: 'princess lacey' tribute begins again for michigan s

In [6]:
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

    The inputs are subtracted element-wise, the differences are squared and
    summed over all elements, and the square root of that sum is returned.
    """
    delta = x1 - x2
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

def _run_kmeans(X, initial_centroids, k, max_iterations):
    """Run k-means from ``initial_centroids``; return ``(sse, cluster_sizes)``."""
    n_samples = X.shape[0]
    # Work on a copy so several runs can start from the same initial centroids.
    centroids = initial_centroids.copy()
    # 0 means "not assigned yet"; real cluster labels are 1..k.
    cluster_assignments = np.zeros(n_samples, dtype=int)

    for _ in range(max_iterations):
        # Assign each data point to the nearest centroid
        new_assignments = np.empty(n_samples, dtype=int)
        for i in range(n_samples):
            distances = [euclidean_distance(X[i], centroid) for centroid in centroids]
            new_assignments[i] = np.argmin(distances) + 1

        # Converged: identical assignments would reproduce identical centroids,
        # so every further iteration would be a no-op.
        if np.array_equal(new_assignments, cluster_assignments):
            break
        cluster_assignments = new_assignments

        # Update centroids
        for i in range(k):
            cluster_points = X[cluster_assignments == (i + 1)]
            # An empty cluster keeps its previous centroid; the mean of zero
            # points is NaN and would corrupt all later distance computations.
            if len(cluster_points) > 0:
                centroids[i] = np.mean(cluster_points, axis=0)

    # Sum of Squared Error (SSE) accumulated over all clusters
    sse = 0
    cluster_sizes = {}
    for i in range(k):
        cluster_points = X[cluster_assignments == (i + 1)]
        sse += np.sum((cluster_points - centroids[i]) ** 2)
        cluster_sizes[i + 1] = len(cluster_points)

    return sse, cluster_sizes


def kmeans_cluster(data, k, max_iterations_list):
    """Cluster text documents with k-means on their TF-IDF vectors.

    The documents are vectorised with ``TfidfVectorizer(stop_words='english')``
    and converted to a dense array. ``k`` distinct samples, drawn with a fixed
    seed of 42, serve as the initial centroids. Distances are computed with
    ``euclidean_distance``.

    Args:
        data: Iterable of document strings.
        k (int): Number of clusters; must satisfy ``1 <= k <= len(data)``.
        max_iterations_list: Maximum number of iterations, given either as a
            single integer or as an iterable of integers. Each value triggers
            an independent run starting from the same initial centroids.

    Returns:
        list[tuple]: One ``(k, max_iterations, sse, cluster_sizes)`` tuple per
        iteration budget, where ``cluster_sizes`` maps the 1-based cluster
        label to its number of members.

    Raises:
        ValueError: If ``k`` is outside ``1..n_samples``.
    """
    # Use the argument rather than a notebook-level global, so the function
    # also works on a fresh kernel and honours what the caller passed in.
    if np.isscalar(max_iterations_list):
        iteration_budgets = [int(max_iterations_list)]
    else:
        iteration_budgets = [int(budget) for budget in max_iterations_list]

    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(data)
    X = tfidf_matrix.toarray()

    n_samples = X.shape[0]
    if not 1 <= k <= n_samples:
        raise ValueError(f"k must be between 1 and the number of samples ({n_samples}), got {k}")

    # Initialize cluster centroids randomly. A private RandomState seeded with
    # 42 yields the same draw as np.random.seed(42) followed by
    # np.random.choice, without resetting the global NumPy random state.
    rng = np.random.RandomState(42)
    centroids_indices = rng.choice(n_samples, k, replace=False)
    initial_centroids = X[centroids_indices]

    results = []
    for max_iterations in iteration_budgets:
        sse, cluster_sizes = _run_kmeans(X, initial_centroids, k, max_iterations)
        results.append((k, max_iterations, sse, cluster_sizes))
    return results

In [7]:
# Experiment settings: try K = 1..max_k, each with the same iteration cap.
max_k = 5
max_iterations = 100

start_time = time.time()  # wall-clock start of the whole sweep

log_file_path = 'kmeans_cluster_log.txt'  # Replace with the actual path to the log file
with open(log_file_path, 'w') as log_file:

    for k in range(1, max_k + 1):
        # Cluster the cleaned tweets into k groups
        results = kmeans_cluster(cleaned_tweet_content, k, max_iterations)

        # Build the summary once; the console and the log receive the same text
        report_lines = [
            f"\nSummary for K = {k}",
            "K\t\tSSE\t\tCluster Sizes",
            "-" * 70,
        ]
        for _, _, sse, cluster_sizes in results:
            report_lines.append(f"{k}\t\t{sse:.2f}\t\t{cluster_sizes}")

        for report_line in report_lines:
            print(report_line)
        for report_line in report_lines:
            log_file.write(report_line + "\n")

    end_time = time.time()  # wall-clock end of the whole sweep
    execution_time = end_time - start_time

    # Timestamp of the run: take the current UTC time and convert it to the
    # 'America/Chicago' (Dallas) time zone
    current_utc_time = datetime.now(pytz.utc)
    desired_time_zone = pytz.timezone('America/Chicago')
    current_datetime = current_utc_time.astimezone(desired_time_zone)

    print("Execution time: {:.2f} seconds".format(execution_time))
    log_file.write(f"\nExecution time: {execution_time:.2f} seconds\n")

    print("Current date and time:", current_datetime)
    log_file.write(f"Current date and time: {current_datetime}\n")


Summary for K = 1
K		SSE		Cluster Sizes
----------------------------------------------------------------------
1		1988.39		{1: 2000}

Summary for K = 2
K		SSE		Cluster Sizes
----------------------------------------------------------------------
2		1982.58		{1: 1880, 2: 120}

Summary for K = 3
K		SSE		Cluster Sizes
----------------------------------------------------------------------
3		1971.55		{1: 1683, 2: 118, 3: 199}

Summary for K = 4
K		SSE		Cluster Sizes
----------------------------------------------------------------------
4		1962.20		{1: 155, 2: 74, 3: 1455, 4: 316}

Summary for K = 5
K		SSE		Cluster Sizes
----------------------------------------------------------------------
5		1956.55		{1: 154, 2: 74, 3: 1402, 4: 308, 5: 62}
Execution time: 85.34 seconds
Current date and time: 2023-08-03 15:09:21.774225-05:00
