# To install all required dependencies, run
          !pip install -r requirements.txt

In [None]:
! pip install -r requirements.txt

# Clustering and Categorization Script

## Prerequisites

Ensure that Python and the required libraries are installed. You can install them using:

```bash
pip install pandas nltk scikit-learn
```


In [5]:
import os
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk

# Clustering and Categorization Script

## Features

- Reads CSV files containing subtitle text and timestamps.
- Preprocesses the text by converting to lowercase, removing punctuation, and eliminating stopwords.
- Uses TF-IDF vectorization to convert text data into numerical representations.
- Applies K-Means clustering to group each file's subtitle rows into 7 clusters, one per predefined workflow category.
- Maps detected clusters to meaningful workflow step names.
- Saves the categorized data into new CSV files in the output directory.

In [None]:

# Fetch the NLTK data used by the preprocessing step: the English stopword
# list and the Punkt tokenizer models behind word_tokenize. Recent NLTK
# releases look the tokenizer models up under 'punkt_tab' rather than
# 'punkt', so both are requested to work across NLTK versions.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)


# Directory that is scanned for input CSV files, and directory that receives
# the clustered/categorized copies.
# NOTE(review): the two paths spell the project folder with different casing
# ('Instructional_video_analysis' vs 'Instructional_Video_analysis'). On a
# case-sensitive filesystem these are different directories - confirm which
# spelling is intended.
input_dir = '/Users/nvaishnavi/Documents/Instructional_video_analysis/4_all_CSV'
output_dir = '/Users/nvaishnavi/Documents/Instructional_Video_analysis/5_clustered_categorized_file'
# Create the output directory (and any missing parents); no error if it exists.
os.makedirs(output_dir, exist_ok=True)

# Preprocess text
def preprocess_text(text):
    """Normalize one piece of subtitle text for TF-IDF vectorization.

    The text is lowercased, every character that is neither a word
    character nor whitespace is removed, the result is tokenized with
    NLTK's ``word_tokenize``, and English stopwords are dropped.

    Args:
        text: The raw text. ``None`` and NaN (what pandas yields for an
            empty CSV cell) are treated as empty text; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The remaining tokens joined by single spaces, or ``''`` when the
        input is missing.
    """
    # Missing cells would otherwise crash on ``.lower()``; NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Load the stopword list once and reuse it: this function is applied to
    # every row, and the list never changes between calls.
    stop_words = getattr(preprocess_text, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        preprocess_text._stop_words = stop_words

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # strip punctuation
    tokens = word_tokenize(text)
    return ' '.join(word for word in tokens if word not in stop_words)

# Workflow-step label for each cluster id (0-6), in id order.
# NOTE(review): K-Means assigns cluster ids arbitrarily, so which label ends up
# on which group of rows is not guaranteed to be meaningful - confirm that this
# fixed id -> label assignment is intended.
custom_mapping = dict(enumerate((
    "Introduction",
    "Step-by-Step Instruction",
    "Context Setting",
    "Transitions",
    "Recap",
    "Practical Application",
    "Conclusion",
)))

# Cluster the rows of every CSV file in input_dir and write a labelled copy of
# each file to output_dir under the same file name.
# sorted() makes the processing order deterministic (os.listdir order is
# arbitrary).
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith('.csv'):  # Process only CSV files
        continue
    print(f"Processing file: {filename}")

    file_path = os.path.join(input_dir, filename)
    df = pd.read_csv(file_path)

    # One malformed file should not abort the whole batch.
    if 'Text' not in df.columns:
        print(f"Skipping {filename}: no 'Text' column found.")
        continue

    # Empty cells are read as NaN; treat them as empty text so the string
    # preprocessing does not fail on a float.
    df['Cleaned Text'] = df['Text'].fillna('').astype(str).apply(preprocess_text)

    # Turn the cleaned text into TF-IDF features (at most 1000 terms).
    vectorizer = TfidfVectorizer(max_features=1000)
    try:
        X = vectorizer.fit_transform(df['Cleaned Text'])
    except ValueError as err:
        # Raised when no usable term remains, e.g. the file has no rows or
        # every row consists only of stopwords/punctuation.
        print(f"Skipping {filename}: {err}")
        continue

    # One cluster per workflow label, but never more clusters than rows:
    # KMeans raises ValueError when n_samples < n_clusters.
    n_clusters = min(len(custom_mapping), X.shape[0])
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    df['Cluster'] = kmeans.fit_predict(X)

    # Translate the numeric cluster id into its workflow-step label.
    df['Workflow Step'] = df['Cluster'].map(custom_mapping)

    output_file = os.path.join(output_dir, filename)
    df.to_csv(output_file, index=False)

    print(f"File with refined mapping saved to: {output_file}")

print("All files processed.")
