In [63]:
import pandas as pd
import json
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK 'stopwords' corpus is available locally; it is read later
# through stopwords.words('english').
nltk.download('stopwords')

import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import warnings
# NOTE: this silences *every* warning for the rest of the session, which also
# hides library deprecation warnings.
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Reading in and writing files
def load_data(file):
  """Parse the UTF-8 encoded JSON document stored at ``file`` and return it."""
  with open(file, mode='r', encoding='utf-8') as handle:
    return json.load(handle)

def write_data(file, data):
  """Serialise ``data`` as JSON with a 4-space indent into ``file`` (UTF-8), overwriting it."""
  with open(file, mode="w", encoding="utf-8") as handle:
    json.dump(data, handle, indent=4)

We want to remove from descriptions:

1. stopwords
2. patterns like (AC/2000/142)
3. dates (month names and all digits)

In [31]:
def remove_stops(text, stop_words):
  """Strip reference codes, stopwords, punctuation and digits from ``text``.

  Args:
    text: the raw description string.
    stop_words: container of words to drop (anything supporting ``in``).
      Matching is case-sensitive, so "An" is kept when only "an" is listed.

  Returns:
    The cleaned text, with words separated by single spaces and no leading
    or trailing whitespace.
  """
  # Removing patterns from description such as : (AC/2000/147)
  text = re.sub(r"AC\/\d{1,4}\/\d{1,4}", "", text)

  # Removing stopwords. A token is also dropped when it matches once its
  # surrounding punctuation is ignored, so "April," or "(the" no longer slip
  # through just because punctuation is stripped in a later step.
  kept = [
      word for word in text.split()
      if word not in stop_words
      and word.strip(string.punctuation) not in stop_words
  ]
  final = " ".join(kept)

  # Removing punctuation from the text
  final = final.translate(str.maketrans("", "", string.punctuation))
  # Removing digits
  final = "".join(ch for ch in final if not ch.isdigit())

  # Deleted tokens leave runs of spaces (also at either end); split/join
  # collapses them and trims the result in one step.
  return " ".join(final.split())

def clean_docs(docs, months_file='months.json'):
  """Clean every document in ``docs`` with ``remove_stops``.

  The stop list is NLTK's English stopwords plus the entries loaded from
  ``months_file`` (used to remove month names; presumably a JSON list of
  strings, verify against the data file).

  Args:
    docs: iterable of raw description strings.
    months_file: path of the JSON file holding the extra words to remove.

  Returns:
    A list with one cleaned string per input document, in input order.
  """
  # Built once and shared by all documents. A frozenset makes each per-token
  # membership test O(1) instead of a linear scan over the whole list.
  stops = frozenset(stopwords.words('english') + load_data(months_file))
  return [remove_stops(doc, stops) for doc in docs]

In [32]:
# Reading in data : data has description about violence and victim names.
# The file is parsed once and both fields are taken from the same object.
trc_data = load_data('trc_dn.json')
descriptions = trc_data['descriptions']
names = trc_data['names']

In [49]:
# Clean every description, then report how many documents were processed.
cleaned_docs = clean_docs(descriptions)
print(len(cleaned_docs))

21747


In [64]:
# Creating TF-IDF Vectorizer: token case is left untouched (lowercase=False),
# the vocabulary is limited to 100 features built from 1- to 3-word n-grams
# that appear in at least 3 documents, and scikit-learn's English stop list
# is applied.
vectorizer = TfidfVectorizer(
    lowercase=False,
    max_features=100,
    min_df=3,
    ngram_range=(1, 3),
    stop_words='english',
)

# Learn the vocabulary and build the document-term matrix in a single pass.
vectors = vectorizer.fit_transform(cleaned_docs)

In [65]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement when available and fall back only on old releases.
# .tolist() turns the returned array into a plain list of Python strings.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# Doc-word dense matrix (one row per document, one column per feature)
dense = vectors.todense()

# Same matrix as nested Python lists, for plain-Python iteration below
denselist = dense.tolist()

In [66]:
# For each document, keep the vocabulary terms whose TF-IDF weight is greater
# than 0, i.e. the features that actually occur in that document.
all_keywords = [
    [term for term, weight in zip(feature_names, row) if weight > 0.0]
    for row in denselist
]

# Show one complete description next to the keywords derived for it from the
# TF-IDF matrix.
print(descriptions[0])
print(all_keywords[0])

An ANCYL member who was shot and severely injured by SAP members at Lephoi, Bethulie, Orange Free State (OFS) on 17 April 1991. Police opened fire on a gathering at an ANC supporter's house following a dispute between two neighbours, one of whom was linked to the ANC and the other to the SAP and a councillor.
['ANC', 'ANC supporters', 'An', 'Police', 'SAP', 'house', 'injured', 'member', 'members', 'severely', 'shot', 'supporters']


In [72]:
# Now we want to do the clustering:
# the TF-IDF document vectors are grouped into 20 clusters using KMeans.
# random_state is fixed because, with a single initialisation (n_init=1), the
# resulting clusters would otherwise change on every run.
model = KMeans(n_clusters=20, init="k-means++", max_iter=100, n_init=1, random_state=42)
model.fit(vectors)

KMeans(max_iter=100, n_clusters=20, n_init=1)

In [73]:
# For every cluster, vocabulary indices sorted by descending centroid weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement when available and fall back only on old releases.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# Saving the clusters in a file: the ten highest-weighted terms per cluster.
# Iterating over the centroid rows keeps the loop in step with the fitted
# model instead of repeating the cluster count as a literal.
with open("trc_cluster_results.txt", "w", encoding="utf-8") as f:
    for i, ranked in enumerate(order_centroids):
        f.write(f"Cluster {i}\n")
        for ind in ranked[:10]:
            f.write(' %s\n' % terms[ind])
        f.write("\n\n")

References: https://www.youtube.com/watch?v=i74DVqMsRWY&list=PL2VXyKi-KpYttggRATQVmgFcQst3z6OlX&index=7

https://github.com/wjbmattingly/topic_modeling_textbook/blob/main/lessons/02_tf_idf_official.py