In [2]:
!pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting regex>=2021.8.3 (from nltk)
  Using cached regex-2024.11.6-cp311-cp311-win_amd64.whl.metadata (41 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Using cached regex-2024.11.6-cp311-cp311-win_amd64.whl (274 kB)
Installing collected packages: regex, nltk
Successfully installed nltk-3.9.1 regex-2024.11.6


In [13]:
import os
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Fetch the NLTK resources used by the preprocessing step.
# Recent NLTK releases (3.9.x) load the Punkt tokenizer data from 'punkt_tab'
# rather than the older pickled 'punkt' package, so both are requested to keep
# word_tokenize() working regardless of which NLTK version is installed.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

def load_text_files(folder_path):
    """Read every ``.txt`` file directly inside ``folder_path``.

    Files are visited in sorted filename order so that repeated runs (and runs
    on different machines) return the documents in the same order.

    Args:
        folder_path: Directory to scan (not searched recursively).

    Returns:
        A ``(filenames, texts)`` tuple of two parallel lists: the bare file
        names (no directory component) and the UTF-8 decoded file contents.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts = []
    filenames = []
    # os.listdir() returns entries in arbitrary order; sort for reproducibility.
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith(".txt"):
            continue
        full_path = os.path.join(folder_path, file)
        # Skip directories (or other non-files) that merely have a .txt suffix.
        if not os.path.isfile(full_path):
            continue
        with open(full_path, 'r', encoding='utf-8') as f:
            texts.append(f.read())
        filenames.append(file)  # Only store the filename
    return filenames, texts

def preprocess_text(texts):
    """Turn raw documents into space-joined strings of lemmatized tokens.

    Each document is lower-cased and tokenized with ``word_tokenize``. Tokens
    that are not purely alphanumeric, or that appear in NLTK's English
    stop-word list, are dropped; the survivors are lemmatized with
    ``WordNetLemmatizer``.

    Args:
        texts: Iterable of document strings.

    Returns:
        A list with one cleaned string per input document, in input order.
    """
    english_stopwords = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()

    def _clean(document):
        # Filter first, then lemmatize only the tokens that are kept.
        kept = []
        for token in word_tokenize(document.lower()):
            if token.isalnum() and token not in english_stopwords:
                kept.append(wordnet.lemmatize(token))
        return " ".join(kept)

    return [_clean(document) for document in texts]

def cluster_texts(texts, num_clusters=2):
    """Cluster documents with K-Means over TF-IDF features.

    The documents are vectorized with a ``TfidfVectorizer`` limited to 5000
    features, then grouped by ``KMeans`` using a fixed ``random_state`` of 42
    and 10 initializations.

    Args:
        texts: Sequence of (preprocessed) document strings.
        num_clusters: Number of clusters to form.

    Returns:
        A list of 1-based cluster ids, one per document (K-Means id 0 becomes
        1, id 1 becomes 2, ...). What each id means must be decided by
        inspecting the resulting clusters.
    """
    tfidf_matrix = TfidfVectorizer(max_features=5000).fit_transform(texts)

    model = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
    cluster_ids = model.fit_predict(tfidf_matrix)

    # Shift the zero-based K-Means ids so that numbering starts at 1.
    one_based_ids = []
    for cluster_id in cluster_ids:
        one_based_ids.append(cluster_id + 1)
    return one_based_ids

def save_label_studio_format(filenames, texts, labels, output_file, label_map=None):
    """Write documents and their cluster labels as a Label Studio task list.

    Each document becomes one task whose ``data.text`` holds the content (with
    newlines replaced by spaces) and whose single annotation carries one
    ``choices`` result naming the class mapped from the document's label.

    Args:
        filenames: File names parallel to ``texts``; used only to pair up the
            inputs and to identify the document in error messages (they are
            not written to the output).
        texts: Original document contents.
        labels: Cluster labels parallel to ``texts``.
        output_file: Path of the JSON file to create or overwrite.
        label_map: Optional mapping from cluster label to class name. Defaults
            to ``{1: "finance", 2: "film"}``; adjust it to match the actual
            cluster interpretation.

    Raises:
        KeyError: If a label has no entry in ``label_map``. Raised before the
            output file is opened, so no partial file is written.
    """
    if label_map is None:
        label_map = {1: "finance", 2: "film"}  # Adjust based on actual cluster interpretation
    labeled_data = []

    for filename, content, label in zip(filenames, texts, labels):
        if label not in label_map:
            raise KeyError(
                f"No class name mapped for cluster label {label!r} "
                f"(file {filename!r}); known labels: {list(label_map)}"
            )
        content = content.replace("\n", " ")  # Replace newlines with space or another separator
        labeled_data.append({
            "data": {
                "text": content  # Make sure the 'text' field is present here
            },
            "annotations": [{
                "result": [{
                    "value": {
                        "choices": [label_map[label]]  # Make sure the 'choices' is a list
                    },
                    "from_name": "sentiment",  # Specify the label name field
                    "to_name": "text",  # Specify the to_name field that corresponds to 'data'
                    "type": "choices",  # Label Studio expects this type for simple classification
                }]
            }]
        })

    # Save the data to a JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(labeled_data, f, indent=4)

def main(folder_path="./merged_folder", output_file="label_studio_input.json"):
    """Run the full pipeline: load, preprocess, cluster, and export.

    Args:
        folder_path: Directory containing the ``.txt`` documents to label.
        output_file: Path of the Label Studio JSON file to write.

    Raises:
        ValueError: If ``folder_path`` contains no ``.txt`` files.
    """
    filenames, texts = load_text_files(folder_path)
    # Fail early with a clear message instead of letting the vectorizer or
    # clusterer reject an empty corpus further down the pipeline.
    if not texts:
        raise ValueError(f"No .txt files found in {folder_path!r}; nothing to cluster")

    preprocessed_texts = preprocess_text(texts)
    labels = cluster_texts(preprocessed_texts)
    # The original (unprocessed) texts are exported, not the preprocessed ones.
    save_label_studio_format(filenames, texts, labels, output_file)

    print(f"Labeled data saved to {output_file}")

# Run the pipeline when this code is executed directly (as a script or in a
# notebook cell, where __name__ is "__main__"), but not when it is imported.
if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to C:\Users\Sanskruti
[nltk_data]     Jajoo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Sanskruti
[nltk_data]     Jajoo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Sanskruti
[nltk_data]     Jajoo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Labeled data saved to label_studio_input.json
