In [None]:
import os
import json
import pandas as pd

transcription_folder = "/content/drive/MyDrive/Kareem_Esmail/raw data"
metadata_folder = "/content/drive/MyDrive/Kareem_Esmail/metadata"


def _list_files(folder):
    """Return the sorted names of the regular files in `folder` (sub-directories are skipped)."""
    return sorted(
        name for name in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, name))
    )


def _as_text(value):
    """Render a metadata list field as one comma-separated string.

    A missing, null or empty field becomes "". A field that is already a string
    is returned unchanged instead of being split into single characters.
    """
    if not value:
        return ""
    if isinstance(value, str):
        return value
    return ", ".join(str(item) for item in value)


transcription_files = _list_files(transcription_folder)
metadata_files = _list_files(metadata_folder)

# Transcripts and metadata are paired purely by sorted position, so a stray or
# missing file would silently attach transcripts to the wrong metadata (and
# zip() would quietly drop the surplus). Fail loudly instead.
if len(transcription_files) != len(metadata_files):
    raise ValueError(
        f"Found {len(transcription_files)} transcription files but "
        f"{len(metadata_files)} metadata files; they must match one-to-one."
    )

data = []

for txt_file, json_file in zip(transcription_files, metadata_files):
    txt_path = os.path.join(transcription_folder, txt_file)
    json_path = os.path.join(metadata_folder, json_file)

    with open(txt_path, "r", encoding="utf-8") as f:
        transcription = f.read().strip()

    with open(json_path, "r", encoding="utf-8") as f:
        metadata = json.load(f)

    # One flat record per episode; absent scalar fields fall back to "Unknown"
    podcast_data = {
        "title": metadata.get("title", "Unknown"),
        "author": metadata.get("author", "Unknown"),
        "categories": _as_text(metadata.get("categories")),  # Convert list to string
        "keywords": _as_text(metadata.get("keywords")),  # Convert list to string
        "source": metadata.get("source_url", "Unknown"),
        "publish_date": metadata.get("publish_date", "Unknown"),
        "length": metadata.get("length", "Unknown"),
        "type": metadata.get("type", "Unknown"),
        "transcription": transcription
    }

    data.append(podcast_data)

# One row per episode, indexed 0..n-1
df = pd.DataFrame(data)

df.head()

In [None]:
import pandas as pd

def chunk_by_lines(text, chunk_size=3):
    """Group the lines of `text` into chunks of `chunk_size` lines each.

    The text is split on "\n" only; the lines of each group are joined with a
    single space. The last chunk holds whatever lines remain.

    Returns a list of chunk strings.
    """
    rows = text.split("\n")
    grouped = []
    for start in range(0, len(rows), chunk_size):
        window = rows[start:start + chunk_size]
        grouped.append(" ".join(window))
    return grouped

def chunk_by_words(text, chunk_size=50):
    """Group the whitespace-separated words of `text` into chunks of `chunk_size` words.

    Words inside a chunk are joined with a single space; the last chunk holds
    whatever words remain. Text without any word gives an empty list.

    Returns a list of chunk strings.
    """
    tokens = text.split()
    grouped = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        grouped.append(" ".join(window))
    return grouped

# `explode` below leaves several rows per episode that all keep the episode's
# original index label. Collapse back to one row per label first, so running
# this cell again does not re-chunk every already-exploded row (which would
# multiply the row count). On a freshly loaded frame this is a no-op.
df = df.loc[~df.index.duplicated(keep="first")].copy()

# Apply chunking (choose one)
df["chunks"] = df["transcription"].apply(lambda x: chunk_by_lines(x, chunk_size=3))  # Chunk by lines
#df["chunks"] = df["clean_transcription"].apply(lambda x: chunk_by_words(x, chunk_size=50))  # Chunk by words

# Explode to separate rows (one row per chunk; the other columns are repeated)
df = df.explode("chunks")

print(df.head())


In [None]:
import pandas as pd

def chunk_by_lines(text, chunk_size=3):
    """Split `text` on "\n" and merge every `chunk_size` consecutive lines.

    Lines within a chunk are separated by one space; the final chunk may be
    shorter. Returns the chunks as a list of strings.
    """
    all_lines = text.split("\n")
    chunk_starts = range(0, len(all_lines), chunk_size)
    return list(
        " ".join(all_lines[begin:begin + chunk_size])
        for begin in chunk_starts
    )

def chunk_by_words(text, chunk_size=50):
    """Split `text` on whitespace and merge every `chunk_size` consecutive words.

    Words within a chunk are separated by one space; the final chunk may be
    shorter. Returns the chunks as a list of strings (empty when there are no words).
    """
    all_words_in_text = text.split()
    chunk_starts = range(0, len(all_words_in_text), chunk_size)
    return list(
        " ".join(all_words_in_text[begin:begin + chunk_size])
        for begin in chunk_starts
    )

# If `df` has already been exploded into one row per chunk, every row of an
# episode carries the same index label and the full transcription. Chunking it
# again as-is would produce chunks x chunks rows per episode, so first collapse
# back to one row per index label. On an un-exploded frame this is a no-op.
df = df.loc[~df.index.duplicated(keep="first")].copy()

# Apply chunking (choose one)
df["chunks"] = df["transcription"].apply(lambda x: chunk_by_lines(x, chunk_size=3))  # Chunk by lines
#df["chunks"] = df["clean_transcription"].apply(lambda x: chunk_by_words(x, chunk_size=50))  # Chunk by words

# Explode to separate rows (one row per chunk; the other columns are repeated)
df = df.explode("chunks")

print(df.head())


In [None]:
import re

def normalize_arabic(text):
    """Apply a fixed sequence of regex substitutions to Arabic text.

    In order: delete characters U+064B-U+065F (diacritics), map the hamza/madda
    Alef forms to bare Alef, Waw-with-hamza to Waw, Yeh-with-hamza to Yeh,
    Teh Marbuta to Heh, and finally delete every character that is neither a
    word character nor whitespace.

    Returns the rewritten string.
    """
    substitutions = (
        (r"[\u064B-\u065F]", ""),  # diacritics
        (r"[إأآ]", "ا"),           # Alef variants
        (r"ؤ", "و"),               # Waw with hamza
        (r"ئ", "ي"),               # Yeh with hamza
        (r"ة", "ه"),               # Teh Marbuta
        (r"[^\w\s]", ""),          # punctuation
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [None]:
import nltk
from nltk.corpus import stopwords

# ✅ Download Arabic stopwords from NLTK
nltk.download('stopwords')

# ✅ Load Arabic stopwords
arabic_stopwords = set(stopwords.words('arabic'))

# The cleaning step calls normalize_arabic() on a chunk *before* remove_stopwords().
# A stopword whose spelling that normalization rewrites (hamza forms of Alef,
# ؤ, ئ, ة, diacritics) would therefore never match the raw list, so also keep
# every stopword in its normalized spelling.
arabic_stopwords |= {normalize_arabic(word) for word in arabic_stopwords}
arabic_stopwords.discard("")  # a stopword that normalizes to nothing must not be kept

# ✅ Function to remove stopwords
def remove_stopwords(text):
    """Drop every whitespace-separated word of `text` found in `arabic_stopwords`.

    Matching is exact (whole word, no further normalization). The surviving
    words are returned joined by single spaces.
    """
    kept = (token for token in text.split() if token not in arabic_stopwords)
    return " ".join(kept)

In [None]:
print(df.head())  # Show first few rows
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line.
df.info()  # Check data types and missing values
print(df.describe())

In [None]:
# Normalize each chunk first, then strip stopwords from the normalized text
df["clean_transcription"] = (
    df["chunks"]
    .apply(normalize_arabic)
    .apply(remove_stopwords)
)

# ✅ Display first cleaned texts
df.loc[:, ["chunks", "clean_transcription"]].head()

In [None]:
# Expand each chunk into a separate row
# NOTE(review): earlier cells already ran `df.explode("chunks")`; presumably every
# "chunks" value is a single string by now, so this explode changes nothing — confirm.
df_expanded = df.explode("chunks")

# Display first few rows
# NOTE(review): this bare expression is not the last statement of the cell, so it
# probably renders nothing — confirm, or wrap it in display().
df_expanded.head()
# Install openpyxl for Excel export
!pip install openpyxl

# Save DataFrame to an Excel file
# Written to the current working directory, without the index column.
df_expanded.to_excel("podcast_chunks.xlsx", index=False)

# Download the file in Google Colab
# (google.colab is imported here, so this part only runs inside Colab)
from google.colab import files
files.download("podcast_chunks.xlsx")


In [None]:
from collections import Counter

# Flatten every cleaned chunk into one list of whitespace-separated words
all_words = [
    word
    for cleaned_text in df["clean_transcription"]
    for word in cleaned_text.split()
]
word_freq = Counter(all_words)

# 20 most frequent words as (word, count) pairs
print(word_freq.most_common(20))

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Fetch the Noto Naskh Arabic font archive and unpack it into the system
# TrueType font directory (both are shell commands).
!wget https://noto-website-2.storage.googleapis.com/pkgs/NotoNaskhArabic-unhinted.zip
!unzip NotoNaskhArabic-unhinted.zip -d /usr/share/fonts/truetype/
# Build one cloud from all cleaned chunks joined into a single string, drawn
# with the font file unpacked above.
# NOTE(review): the text goes to WordCloud without the arabic_reshaper/bidi
# treatment that the n-gram bar charts apply to their labels — confirm the
# rendered words look right.
wordcloud = WordCloud(font_path="/usr/share/fonts/truetype/NotoNaskhArabic-Regular.ttf", width=800, height=400).generate(" ".join(df["clean_transcription"]))
# Show the cloud as an image with the axes hidden.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
import nltk
nltk.download('punkt_tab')
from nltk.util import ngrams
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter

def extract_ngrams(text, n):
    """Tokenize `text` with NLTK's `word_tokenize` and return its n-grams.

    Returns a list of `n`-length tuples of consecutive tokens.
    """
    return list(ngrams(word_tokenize(text), n))
# Combine all cleaned chunks into one string before extracting n-grams
all_text = " ".join(df["clean_transcription"])

bigrams, trigrams = (extract_ngrams(all_text, size) for size in (2, 3))
bigram_counts, trigram_counts = Counter(bigrams), Counter(trigrams)

# Print the ten most frequent n-grams of each size as (tuple, count) pairs
for heading, counts in (
    ("🔹 Top 10 Bigrams:", bigram_counts),
    ("\n🔹 Top 10 Trigrams:", trigram_counts),
):
    print(heading)
    print(counts.most_common(10))

In [None]:
# Shell installs for the two packages imported in the next cell
# (arabic_reshaper and bidi).
!pip install arabic-reshaper
!pip install python-bidi

In [None]:
import arabic_reshaper
from bidi.algorithm import get_display
import matplotlib.pyplot as plt

def reshape_arabic(text_list):
    """Turn each word sequence in `text_list` into one display-ready Arabic label.

    Every item (an iterable of words, e.g. an n-gram tuple) is joined with
    spaces, passed through `arabic_reshaper.reshape` and then `get_display`.

    Returns the list of resulting strings, in input order.
    """
    labels = []
    for words in text_list:
        phrase = " ".join(words)
        labels.append(get_display(arabic_reshaper.reshape(phrase)))
    return labels

def _shape_label(text):
    """Reshape and bidi-reorder one Arabic string, the same treatment the bar labels get."""
    return get_display(arabic_reshaper.reshape(text))


def _plot_top_ngrams(words, freqs, color, ylabel, title):
    """Draw one horizontal bar chart of n-gram frequencies.

    `words` are already display-ready labels and `freqs` their counts, in the
    same order. `ylabel` and `title` are raw Arabic strings; they are shaped
    here so they render like the bar labels instead of as unshaped text.
    """
    plt.figure(figsize=(10, 5))
    plt.barh(words, freqs, color=color)
    plt.xlabel(_shape_label("التكرار"))
    plt.ylabel(_shape_label(ylabel))
    plt.title(_shape_label(title))
    plt.gca().invert_yaxis()  # most frequent n-gram at the top
    plt.gca().invert_xaxis()  # Fix the RTL direction
    plt.show()


# Query each counter once so labels and frequencies come from the same ranking
top_bigrams = bigram_counts.most_common(10)
top_trigrams = trigram_counts.most_common(10)

bigram_words = reshape_arabic([w for w, _ in top_bigrams])
trigram_words = reshape_arabic([w for w, _ in top_trigrams])

bigram_freqs = [freq for _, freq in top_bigrams]
trigram_freqs = [freq for _, freq in top_trigrams]

_plot_top_ngrams(
    bigram_words, bigram_freqs, 'skyblue',
    ylabel="ثنائيات الكلمات",
    title="أكثر 10 ثنائيات شيوعًا في مجموعة البيانات",
)

_plot_top_ngrams(
    trigram_words, trigram_freqs, 'salmon',
    ylabel="ثلاثيات الكلمات",
    title="أكثر 10 ثلاثيات شيوعًا في مجموعة البيانات",
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary is capped at 500 features
vectorizer = TfidfVectorizer(max_features=500)
tfidf_matrix = vectorizer.fit_transform(df["clean_transcription"])

# Dense copy of the scores: one row per chunk, one column per vocabulary term
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# Display the TF-IDF matrix
print(tfidf_df.head())

In [None]:
# Install CAMeL Tools with GitHub link as a last resort
# (shell command; pip installs straight from the project's Git repository)
!pip install git+https://github.com/CAMeL-Lab/camel_tools.git


In [None]:
import os
import requests
import zipfile

# Define the URL for the morphology database
url = "https://github.com/CAMeL-Lab/camel-tools-data/releases/download/2022.03.21/morphology_db_calima-msa-r13-0.4.0.zip"

# Define the target directory
target_dir = os.path.expanduser("~/.camel_tools/data/morphology_db/calima-msa-r13/")

# Create the target directory if it doesn't exist
os.makedirs(target_dir, exist_ok=True)

zip_path = "/tmp/morphology_db_calima-msa-r13.zip"
try:
    # Download the zip file. The timeout keeps a dead connection from hanging
    # the cell, and raise_for_status() stops an HTTP error page from being
    # saved and then failing later as a confusing "bad zip file".
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open(zip_path, "wb") as f:
        f.write(response.content)

    # Extract the zip file into the target directory
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(target_dir)
finally:
    # Clean up the zip file, also when the download or extraction failed
    if os.path.exists(zip_path):
        os.remove(zip_path)

print("Morphology database downloaded and extracted successfully.")


In [None]:
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer
from camel_tools.utils.dediac import dediac_ar
from camel_tools.tokenizers.word import simple_word_tokenize
import pandas as pd
import re

# Load the built-in MorphologyDB for Modern Standard Arabic (MSA)
# `db` and `analyzer` are module-level names; preprocess_arabic_camel() below
# reads `analyzer` directly.
db = MorphologyDB.builtin_db()
analyzer = Analyzer(db)

def preprocess_arabic_camel(text):
    """Replace every token of `text` with a lemma from the CAMeL analyzer.

    The text is split with `simple_word_tokenize` and each token is looked up
    with the module-level `analyzer`. The lemma is the `lex` value of the first
    analysis whose `pos` is 'noun' or 'verb'; when there is none (or that value
    is empty) the `lex` of the first analysis is used instead. A token without
    any analysis is kept unchanged. No normalization or diacritic removal is
    done here.

    Returns the lemmas joined by single spaces.
    """
    preferred_pos = ['noun', 'verb']

    def pick_lemma(word):
        candidates = analyzer.analyze(word)
        if not candidates:
            return word
        # First noun/verb reading wins; `lex` defaults to the word itself
        preferred = next(
            (
                entry.get('lex', word)
                for entry in candidates
                if entry.get('pos', '') in preferred_pos
            ),
            None,
        )
        return preferred or candidates[0].get('lex', word)

    return " ".join(pick_lemma(word) for word in simple_word_tokenize(text))

# Apply preprocessing
# Lemmatizes the already-cleaned chunk text row by row into a new
# "processed_text" column.
df["processed_text"] = df["clean_transcription"].apply(preprocess_arabic_camel)

# Preview result
# Prints the cleaned text next to its lemmatized form (plain-text rendering).
print(df[["clean_transcription", "processed_text"]])


In [None]:
# Shell install of the transformers and CAMeL Tools packages imported just below.
!pip install transformers camel-tools

from transformers import AutoTokenizer, AutoModel
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer
from camel_tools.utils.dediac import dediac_ar
import pandas as pd
import torch

# Load MARBERT Tokenizer and Model
marbert_model_name = "UBC-NLP/MARBERT"
tokenizer = AutoTokenizer.from_pretrained(marbert_model_name)
# NOTE(review): the functions defined in this cell only use `tokenizer`; `model`
# is not referenced by them — confirm it is needed.
model = AutoModel.from_pretrained(marbert_model_name)

# Load CAMeL Tools Analyzer
# Rebinds the module-level `db` / `analyzer` names; camel_lemmatize() below
# reads `analyzer`.
db = MorphologyDB.builtin_db()
analyzer = Analyzer(db)

def marbert_tokenize(text):
    """Encode `text` with the module-level MARBERT `tokenizer` and return its token strings.

    The encoding is truncated and padded to exactly 128 positions, so the
    returned list also contains the tokenizer's special and padding tokens.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding='max_length',
    )
    # Only one text was encoded, so take the first (only) row of ids
    first_row_ids = encoded["input_ids"][0]
    return tokenizer.convert_ids_to_tokens(first_row_ids)

def camel_lemmatize(tokens):
    """Lemmatize word-piece tokens with the module-level CAMeL `analyzer`.

    `tokens` is a sequence of tokenizer output strings. '[CLS]', '[SEP]' and
    '[PAD]' are dropped. A continuation piece (prefix "##") is glued back onto
    the word it continues, so the analyzer sees whole words; previously the
    continuation pieces were discarded and only a word's first fragment was
    analyzed, which is not a real word.

    Returns a list with one lemma per reconstructed word: the `lex` value of
    the first analysis, or the word itself when the analyzer returns nothing.
    """
    special_tokens = {'[CLS]', '[SEP]', '[PAD]'}

    # Pass 1: rebuild whole words from the word pieces
    words = []
    for token in tokens:
        token = dediac_ar(token)  # Remove diacritics
        if token in special_tokens:
            continue
        if token.startswith("##"):
            piece = token[2:]
            if words:
                words[-1] += piece
            else:
                # A continuation with nothing before it: keep it as its own word
                words.append(piece)
            continue
        words.append(token)

    # Pass 2: look every word up and take the first analysis' lemma
    lemmas = []
    for word in words:
        analyses = analyzer.analyze(word)
        if analyses:
            lemmas.append(analyses[0].get('lex', word))
        else:
            lemmas.append(word)
    return lemmas

def preprocess_arabic(text):
    """Tokenize `text` with `marbert_tokenize`, then lemmatize the tokens with `camel_lemmatize`.

    Returns the list of lemmas produced by `camel_lemmatize`.
    """
    return camel_lemmatize(marbert_tokenize(text))

# Disabled two-sentence sample frame, kept for trying the pipeline without the real data:
# # Sample DataFrame
# data = {'text': ["اللغة العربية جميلة ومعقدة.", "الطلاب يدرسون في الجامعة."]}
# df = pd.DataFrame(data)

# Apply preprocessing
# Each row of the new "lemmatized_text" column holds a list of lemmas
# (preprocess_arabic returns a list, not a string).
df["lemmatized_text"] = df["clean_transcription"].apply(preprocess_arabic)

# Preview result
print(df[["clean_transcription", "lemmatized_text"]])


In [None]:
# Shell install of the packages used by the embedding and clustering cells that follow.
!pip install transformers sentence-transformers scikit-learn

In [None]:
from sentence_transformers import SentenceTransformer

# Load pre-trained Arabic BERT model
model = SentenceTransformer("aubmindlab/bert-base-arabertv2")

# Convert all chunks to embeddings in one batched call. Calling model.encode()
# once per row runs the model on a batch of one every time and is far slower.
chunk_texts = df["chunks"].tolist()
chunk_embeddings = model.encode(chunk_texts, show_progress_bar=True)

# One vector per row. A plain list is assigned by position, so the repeated
# index labels left behind by the earlier explode cannot misalign anything.
df["embeddings"] = list(chunk_embeddings)

# Preview the rows as they are. Exploding "embeddings" would turn every vector
# into one row per dimension across the whole frame just to show five values.
df.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Guarantee one non-missing chunk string per row *before* vectorizing. Doing
# this after fitting (as before) could not help: the vectorizer would already
# have failed on a missing or list-valued chunk.
df = df.explode("chunks")  # Ensure each chunk is in a separate row
df = df.dropna(subset=["chunks"])  # Remove empty values if any

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["chunks"])  # Ensure it's on chunked text

# Apply KMeans clustering
num_clusters = 7  # Adjust based on your use case
kmeans = KMeans(n_clusters=num_clusters, random_state=42)  # fixed seed for repeatable labels
# fit_predict returns one label per row of X, i.e. per row of df
df["chunk_cluster"] = kmeans.fit_predict(X)

In [None]:
# Show up to five example chunks for every cluster id
for cluster_id in range(num_clusters):
    members = df.loc[df["chunk_cluster"] == cluster_id, "chunks"]
    print(f"\n🔹 Cluster {cluster_id} Sample Chunks:")
    print(members.head(5).tolist())

In [None]:
# Hand-assigned (theme, emotion) pair for each cluster id; list position = cluster id
_cluster_labels = [
    ("Inspirational", "Peace"),
    ("Encouraging", "Motivation"),
    ("Reassuring", "Anxiety"),
    ("Reflective", "Calm"),
    ("Challenging", "Self-Doubt"),
    ("Uplifting", "Joy"),
    ("Grounded", "Realism"),
]
cluster_mappings = {
    cluster_id: {"theme": theme, "emotion": emotion}
    for cluster_id, (theme, emotion) in enumerate(_cluster_labels)
}

# Look up both labels for every row from its cluster id (unknown ids raise KeyError)
for label in ("theme", "emotion"):
    df[label] = df["chunk_cluster"].map(
        lambda cluster_id, field=label: cluster_mappings[cluster_id][field]
    )

df.explode(["theme", "emotion"]).head()


In [None]:
# All three summaries group the chunk rows by episode title
by_title = df.groupby("title")

# Share of each theme within every title (themes become columns)
theme_summary = by_title["theme"].value_counts(normalize=True).unstack()
print("Theme Summary:\n", theme_summary.head())

# Share of each emotion within every title (emotions become columns)
emotion_summary = by_title["emotion"].value_counts(normalize=True).unstack()
print("Emotion Summary:\n", emotion_summary.head())

# Share of each (theme, emotion) combination within every title
summary = (
    by_title[["theme", "emotion"]]
    .apply(lambda group: group.value_counts(normalize=True))
    .unstack()
)
print("Combined Summary:\n", summary.head())
