<a href="https://colab.research.google.com/github/Robertvaswegen/COS802/blob/main/NLPPROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; later cells read and write parquet files under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
!pip install tweet-preprocessor bertopic nltk emoji transformers spacy geopy hdbscan umap-learn
!python -m spacy download en_core_web_sm
!pip install contractions

import pandas as pd
import re
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
import torch
from transformers import pipeline
import spacy
from geopy.geocoders import Nominatim
from tqdm import tqdm
import preprocessor as p
import contractions
import html

In [None]:

# Fetch the NLTK corpora needed below: the stop-word list and WordNet (used by the lemmatizer).
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# English stop words plus a few extra filler words that are also filtered out.
extra_stopwords = {'like', 'just', 'get', 'got'}
stopwords_en = set(stopwords.words('english')) | extra_stopwords

# Shared lemmatizer instance used by remove_stopwords_and_lemmatize.
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    """Normalise a raw tweet into lower-case text without punctuation.

    Steps, in order:
      1. decode HTML entities (``&amp;`` -> ``&``),
      2. run tweet-preprocessor's ``p.clean``,
      3. expand contractions,
      4. strip any remaining URLs and @mentions,
      5. drop ``#`` signs and all other punctuation,
      6. collapse whitespace, lower-case and trim.

    The order matters. Entity decoding, URL removal and mention removal all
    depend on punctuation (``&``, ``;``, ``:``, ``/``, ``.``, ``@``) still being
    present, so they must run before the punctuation strip. Whitespace is
    collapsed last so that gaps left by removed tokens do not survive.

    Args:
        text: the raw tweet text.

    Returns:
        The cleaned string. Non-string input (for example a missing value in a
        pandas column) yields an empty string instead of raising.
    """
    if not isinstance(text, str):
        return ''

    cleaned = html.unescape(text)
    cleaned = p.clean(cleaned)
    cleaned = contractions.fix(cleaned)
    # Catch links and mentions that are still present after p.clean.
    cleaned = re.sub(r"https?://\S+|www\.\S+", "", cleaned)
    cleaned = re.sub(r"@\w+", "", cleaned)
    cleaned = re.sub(r'#', '', cleaned)
    cleaned = re.sub(r'[^\w\s]', '', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.lower().strip()


def remove_stopwords_and_lemmatize(text):
    """Drop stop words from ``text`` and lemmatize the remaining tokens.

    The text is split on whitespace, tokens found in ``stopwords_en`` are
    skipped, every other token is passed through ``lemmatizer.lemmatize`` and
    the results are joined back together with single spaces.
    """
    kept_lemmas = []
    for token in text.split():
        if token in stopwords_en:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_lemmas)


# NOTE(review): `data` is not created anywhere in the visible notebook. It must be loaded
# (with at least the 'text' and 'lang' columns used below) before this cell can run.
data['text_clean'] = data['text'].apply(clean_text).apply(remove_stopwords_and_lemmatize)


# BERTopic components. UMAP is seeded so that repeated runs produce the same topics.
umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.1, metric='cosine', random_state=42)
# prediction_data=True lets topic_model.transform() assign topics to documents it was not fitted on.
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words=list(stopwords_en), ngram_range=(1, 2))   # Including bigrams
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
ctfidf_model = ClassTfidfTransformer()
representation_model = KeyBERTInspired()


topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model
)


# Only English tweets are modelled; copy so the new columns do not touch `data`.
data_en = data[data['lang'] == 'en'].copy()


batch_size = 50000
documents = data_en['text_clean'].tolist()

# Fit exactly once. Calling fit_transform() per batch refits the model every time, so
# topic ids from different batches would mean different things and `topic_model`
# would only describe the final batch. Fitting on a seeded random sample of at most
# `batch_size` tweets keeps the memory footprint of one batch.
if documents:
    fit_documents = data_en['text_clean'].sample(
        n=min(batch_size, len(documents)), random_state=42
    ).tolist()
    topic_model.fit(fit_documents)

# Assign every tweet to a topic of that single fitted model, one batch at a time.
all_topics, all_probabilities = [], []
for i in tqdm(range(0, len(documents), batch_size)):
    topics, probabilities = topic_model.transform(documents[i : i + batch_size])
    all_topics.extend(topics)
    all_probabilities.extend(probabilities)


data_en['topic'] = all_topics
data_en['topic_probability'] = all_probabilities


# Topic -1 is BERTopic's outlier topic. Copy so later cells can add columns to this
# frame without pandas' SettingWithCopyWarning.
unclassified_tweets = data_en[data_en['topic'] == -1].copy()


data_en.to_parquet('/content/drive/My Drive/sapoliceservice_topics_latest.parquet')


In [None]:

# Second pass: try to find finer-grained topics among the tweets the first model left as outliers.
if len(unclassified_tweets) > 0:
    # Own copy, so that adding columns below never writes into a slice of data_en.
    unclassified_tweets = unclassified_tweets.copy()
    unclassified_texts = unclassified_tweets['text_clean'].tolist()

    # Smaller neighbourhoods / clusters than the first pass; seeded for reproducibility.
    refined_umap_model = UMAP(n_neighbors=5, n_components=5, min_dist=0.1, metric='cosine', random_state=42)
    # prediction_data=True is required for refined_topic_model.transform() below.
    refined_hdbscan_model = HDBSCAN(
        min_cluster_size=5,
        metric='euclidean',
        cluster_selection_method='leaf',
        prediction_data=True
    )

    # Fresh vectorizer / c-TF-IDF / representation objects: the ones created for
    # `topic_model` are fitted in place, so sharing them would overwrite its state.
    refined_topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=refined_umap_model,
        hdbscan_model=refined_hdbscan_model,
        vectorizer_model=CountVectorizer(stop_words=list(stopwords_en), ngram_range=(1, 2)),
        ctfidf_model=ClassTfidfTransformer(),
        representation_model=KeyBERTInspired()
    )

    refined_batch_size = 10000

    # Fit once on a seeded sample, then label every outlier tweet with that one model.
    # Refitting per batch would make topic ids inconsistent between batches.
    refined_fit_texts = unclassified_tweets['text_clean'].sample(
        n=min(refined_batch_size, len(unclassified_texts)), random_state=42
    ).tolist()
    refined_topic_model.fit(refined_fit_texts)

    refined_topics_all, refined_probabilities_all = [], []
    for i in tqdm(range(0, len(unclassified_texts), refined_batch_size)):
        unclassified_batch = unclassified_texts[i : i + refined_batch_size]
        refined_topics, refined_probabilities = refined_topic_model.transform(unclassified_batch)
        refined_topics_all.extend(refined_topics)
        refined_probabilities_all.extend(refined_probabilities)

    unclassified_tweets['refined_topic'] = refined_topics_all
    unclassified_tweets['refined_topic_probability'] = refined_probabilities_all

    unclassified_tweets.to_parquet('/content/drive/My Drive/sapoliceservice_topics_unclassified_tweets_latest.parquet')


In [None]:
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import numpy as np


# Tokenised corpus and gensim dictionary used as the reference for the coherence scores.
processed_texts = data_en['text_clean'].apply(lambda x: x.split()).tolist()
dictionary = Dictionary(processed_texts)


# Top words of every topic except the outlier topic (-1).
# BERTopic pads topics that have fewer words than requested with empty strings;
# those placeholders are dropped so they are not counted as real topic words.
topic_words = [
    [word for word, _ in topic_model.get_topic(topic_id) if word]
    for topic_id in topic_model.get_topics()
    if topic_id != -1
]


filtered_topics = [topic for topic in topic_words if len(topic) > 0]

if filtered_topics:
    # Map topic words to dictionary ids. Words missing from the dictionary are skipped;
    # this includes multi-word terms, since the dictionary only holds single tokens.
    topic_words_ids = [
        [dictionary.token2id[word] for word in topic if word in dictionary.token2id]
        for topic in filtered_topics
    ]

    topic_words_ids = [topic for topic in topic_words_ids if len(topic) > 0]


    if topic_words_ids:
        coherence_model_npmi = CoherenceModel(
            topics=topic_words_ids,
            texts=processed_texts,
            dictionary=dictionary,
            coherence='c_npmi'
        )
        topic_coherence_npmi = coherence_model_npmi.get_coherence()

        coherence_model_cv = CoherenceModel(
            topics=topic_words_ids,
            texts=processed_texts,
            dictionary=dictionary,
            coherence='c_v'
        )
        topic_coherence_cv = coherence_model_cv.get_coherence()


        print(f"Topic Coherence (NPMI): {topic_coherence_npmi if not np.isnan(topic_coherence_npmi) else 'N/A'}")
        print(f"Topic Coherence (c_v): {topic_coherence_cv}")

    else:
        print("No valid topics found for coherence calculation.")
else:
    print("No non-empty topics available.")


# Topic diversity: share of unique words among all top words across topics.
if filtered_topics:
    unique_words = set(word for topic in filtered_topics for word in topic)
    total_words = sum(len(topic) for topic in filtered_topics)
    topic_diversity = len(unique_words) / total_words
    print(f"Topic Diversity: {topic_diversity}")
else:
    print("No valid topics found for diversity calculation.")


In [None]:
# Visualize topic intertopic distance
# Left as the cell's last expression so the notebook displays whatever the call returns.
topic_model.visualize_topics()


In [None]:
# Bar-chart visualisation of the fitted topic_model; the cell's last expression, so its return value is displayed.
topic_model.visualize_barchart()

In [None]:
# matplotlib is imported here because this cell runs before the later cell that
# imports it; without this, a fresh "Restart & Run All" fails with NameError on `plt`.
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Generate and plot word clouds for each topic
for topic_id, topic_terms in enumerate(topic_words):
    words = " ".join(topic_terms)
    if not words.strip():
        # WordCloud.generate() raises ValueError when given no words, so skip empty topics.
        continue
    wordcloud = WordCloud(width=800, height=400, background_color="white").generate(words)

    plt.figure(figsize=(8, 4))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Topic {topic_id}")
    plt.show()
    # Release the figure so a long list of topics does not accumulate open figures.
    plt.close()


In [None]:

from gensim import corpora
from gensim.models import LdaModel

# LDA baseline on the same cleaned English tweets.
# Tokenise once and reuse the token lists for the dictionary, the corpus and the coherence model.
lda_tokens = [text.split() for text in data_en['text_clean']]

dictionary = corpora.Dictionary(lda_tokens)
corpus = [dictionary.doc2bow(tokens) for tokens in lda_tokens]

lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    passes=15,
    random_state=42,
)

coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=lda_tokens,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_score = coherence_model_lda.get_coherence()

print(f"Coherence Score (c_v): {coherence_score}")


In [None]:

!pip install pyLDAvis

import pyLDAvis
import pyLDAvis.gensim_models

# Prepare the pyLDAvis data from the fitted LDA model, its bag-of-words corpus and its dictionary.
lda_vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

# Last expression of the cell, so the notebook renders the returned object.
pyLDAvis.display(lda_vis)



In [None]:
# Sentiment Analysis with RoBERTa
# Run on the first GPU when one is available, otherwise on the CPU.
inference_device = 0 if torch.cuda.is_available() else -1
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    device=inference_device,
)

# NOTE(review): the classifier is fed `text_clean` (stop words removed, lemmatized,
# lower-cased) rather than the original tweet text; confirm that this is intended.
all_sentiments = []
batch_size = 50000
for start in tqdm(range(0, len(data_en), batch_size)):
    batch = data_en['text_clean'].iloc[start:start + batch_size].tolist()
    sentiments = sentiment_pipeline(batch, padding=True, truncation=True, max_length=128)
    # Keep only the predicted label of each result.
    all_sentiments.extend(result['label'] for result in sentiments)

data_en['sentiment'] = all_sentiments
data_en.to_parquet('/content/drive/My Drive/sapoliceservice_tweets_with_sentiments.parquet')

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


# Reload the tweets with their predicted sentiment.
sentiment_output = pd.read_parquet('/content/drive/MyDrive/sapoliceservice_tweets_with_sentiments.parquet')


# Numeric encoding of the three sentiment classes.
label_mapping = {'positive': 1, 'neutral': 0, 'negative': -1}
sentiment_output['predicted_sentiment_label'] = sentiment_output['sentiment'].map(label_mapping)


# Reference labels to evaluate against.
# Previously both the "true" and the "predicted" labels were mapped from the same
# `sentiment` column, so every metric below was 1.0 by construction.
# TODO(review): point `ground_truth_column` at the column holding human-annotated
# sentiment, if one exists; no such column is visible in this notebook.
ground_truth_column = 'sentiment_true'
if ground_truth_column in sentiment_output.columns:
    sentiment_output['sentiment_label'] = sentiment_output[ground_truth_column].map(label_mapping)
else:
    print(
        f"WARNING: no '{ground_truth_column}' column found. The metrics below compare the "
        "model's predictions with themselves and are therefore trivially perfect; "
        "they do not measure model quality."
    )
    # Keep the column so downstream cells that read it still run.
    sentiment_output['sentiment_label'] = sentiment_output['predicted_sentiment_label']


accuracy = accuracy_score(sentiment_output['sentiment_label'], sentiment_output['predicted_sentiment_label'])
precision = precision_score(sentiment_output['sentiment_label'], sentiment_output['predicted_sentiment_label'], average='weighted')
recall = recall_score(sentiment_output['sentiment_label'], sentiment_output['predicted_sentiment_label'], average='weighted')
f1 = f1_score(sentiment_output['sentiment_label'], sentiment_output['predicted_sentiment_label'], average='weighted')
conf_matrix = confusion_matrix(sentiment_output['sentiment_label'], sentiment_output['predicted_sentiment_label'])


print(f"Sentiment Analysis Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
print("Confusion Matrix:\n", conf_matrix)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Fix the class order explicitly (-1 negative, 0 neutral, 1 positive) so the matrix is
# always 3x3 and its rows/columns match the tick labels, even when a class is absent.
sentiment_label_order = [-1, 0, 1]
conf_matrix = confusion_matrix(
    sentiment_output['sentiment_label'],
    sentiment_output['predicted_sentiment_label'],
    labels=sentiment_label_order
)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Negative', 'Neutral', 'Positive'],
            yticklabels=['Negative', 'Neutral', 'Positive'])
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.title('Confusion Matrix')
plt.show()


In [None]:
# Number of tweets per predicted sentiment class, drawn as a bar chart.
sentiment_counts = sentiment_output['sentiment'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(
    x=sentiment_counts.index,
    y=sentiment_counts.values,
    palette="coolwarm",
    ax=ax,
)
ax.set_title("Sentiment Distribution")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Number of Tweets")
plt.show()


In [None]:
# Index the tweets by creation time. The guard makes the cell safe to re-run:
# once 'created_at' has become the index it is no longer a column, and the
# unguarded version failed with KeyError on the second execution.
if 'created_at' in sentiment_output.columns:
    sentiment_output['created_at'] = pd.to_datetime(sentiment_output['created_at'])
    sentiment_output = sentiment_output.set_index('created_at')

# Weekly tweet counts per sentiment class; a class with no tweets in a week counts as 0
# instead of leaving a NaN gap in the line.
sentiment_over_time = sentiment_output.resample('W')['sentiment'].value_counts().unstack(fill_value=0)
sentiment_over_time.plot(kind='line', figsize=(12, 6))
plt.title("Sentiment Over Time")
plt.xlabel("Date")
plt.ylabel("Sentiment Count")
plt.show()


In [None]:
#NER
# Work on a seeded random sample of at most 100,000 English tweets.
# DataFrame.sample raises ValueError when n exceeds the number of rows, so cap n.
ner_sample_size = min(100000, len(data_en))
subset_tweets = data_en.sample(n=ner_sample_size, random_state=18014519)

from tqdm import tqdm
# Registers Series.progress_apply / DataFrame.progress_apply.
tqdm.pandas()

from geopy.extra.rate_limiter import RateLimiter

# spaCy pipeline and geocoder used by extract_location_and_geocode. Neither was
# created anywhere in the notebook, so the function failed with NameError.
nlp = spacy.load("en_core_web_sm")
# geopy requires an application-specific user agent for Nominatim.
geolocator = Nominatim(user_agent="sapoliceservice_tweet_geocoder")
# Nominatim's usage policy allows at most one request per second. Exceptions are not
# swallowed here so that extract_location_and_geocode can report them itself.
_rate_limited_geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1, swallow_exceptions=False)
# Place name -> (latitude, longitude), or None when the geocoder found nothing.
_geocode_cache = {}


def _geocode_cached(location):
    """Return (latitude, longitude) for ``location``, or None; each name is looked up once."""
    if location in _geocode_cache:
        return _geocode_cache[location]
    geo_info = _rate_limited_geocode(location, timeout=10)
    coordinates = (geo_info.latitude, geo_info.longitude) if geo_info else None
    # Only successful requests are cached; a raised error leaves the name uncached
    # so it can be retried on a later tweet.
    _geocode_cache[location] = coordinates
    return coordinates


def extract_location_and_geocode(text):
    """Find the first geocodable place name in ``text``.

    Runs the spaCy pipeline ``nlp`` over the text, takes the entities labelled
    ``GPE`` in document order and geocodes them one by one until a lookup succeeds.

    Args:
        text: the text to scan for place names.

    Returns:
        ``(latitude, longitude, location)`` for the first entity that could be
        geocoded, or ``(None, None, None)`` when there is no such entity.
        Geocoding errors are printed and the next entity is tried.
    """
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ != "GPE":
            continue
        location = ent.text
        try:
            coordinates = _geocode_cached(location)
        except Exception as e:
            print(f"Error geocoding {location}: {e}")
            continue
        if coordinates:
            return coordinates[0], coordinates[1], location
    return None, None, None


# Columns receiving the three values returned by extract_location_and_geocode.
location_columns = ['latitude', 'longitude', 'extracted_location']


def _geocode_to_series(text):
    """Wrap the (latitude, longitude, location) tuple in a Series so apply() expands it into columns."""
    return pd.Series(extract_location_and_geocode(text))


subset_tweets[location_columns] = subset_tweets['text_clean'].progress_apply(_geocode_to_series)

# Persist the sample together with its extracted locations.
subset_tweets.to_parquet('/content/drive/My Drive/sapoliceservice_subset_with_locations.parquet')


In [None]:
import folium

# `geolocation_output` was never defined in the notebook. Load it from the parquet
# file written by the NER cell, so this cell also works in a fresh session.
geolocation_output = pd.read_parquet('/content/drive/My Drive/sapoliceservice_subset_with_locations.parquet')

# Initialize map centered on South Africa (or appropriate center)
m = folium.Map(location=[-30.5595, 22.9375], zoom_start=5)

# Add markers for each tweet with geolocation
for _, row in geolocation_output.dropna(subset=['latitude', 'longitude']).iterrows():
    folium.Marker([row['latitude'], row['longitude']], popup=row['extracted_location']).add_to(m)

# Save the map as an HTML file or display in a notebook
m.save("geolocation_map.html")
m
