In [None]:
! pip install lughaatNLP

In [None]:
#useable code best approach

import pandas as pd
import re
from nltk import ConditionalFreqDist, bigrams, trigrams
from LughaatNLP import LughaatNLP
import random

# Shared LughaatNLP instance; the tokenization helper below uses it for
# normalization and tokenization.
urdu_text_processing = LughaatNLP()

# Read the header-less CSV and label its two columns in one expression.
file_path = 'urdu_stories.csv'  # Update with your file path
urdu_stories_df = pd.read_csv(file_path, header=None).set_axis(
    ['story_name', 'story_content'], axis=1
)

# Tokenize the stories, normalize the text, and extract starting words
def tokenize_and_extract_starting_words(content):
    """Normalize one story, split it into sentences and tokenize each sentence.

    Args:
        content: Raw story text. Anything that is not a string (for example
            the NaN pandas produces for an empty CSV cell) is treated as an
            empty story.

    Returns:
        A ``(tokens, starting_words)`` pair: ``tokens`` is the flat list of
        tokens of every non-empty sentence in order, and ``starting_words``
        holds the first token of each of those sentences.
    """
    # Guard before normalize(): it is only ever meant to see text.
    if not isinstance(content, str):
        return [], []

    normalized_text = urdu_text_processing.normalize(content)  # Normalize the content

    # Sentence boundaries: Urdu full stop (U+06D4), Arabic question mark
    # (U+061F), '!', plus the '.' and '-' the original pattern already used.
    # Without U+06D4 the usual Urdu sentence terminator is never split on and
    # ends up as an ordinary token inside the n-gram tables.
    sentences = re.split(r'[\u06D4\u061F.!-]', normalized_text)

    starting_words = []
    tokens = []

    for sentence in sentences:
        if not sentence.strip():  # Skip fragments that are empty or whitespace only
            continue
        sentence_tokens = urdu_text_processing.urdu_tokenize(sentence)
        if sentence_tokens:  # The tokenizer may still return nothing
            starting_words.append(sentence_tokens[0])  # First word of the sentence
            tokens.extend(sentence_tokens)  # All tokens feed the bigram/trigram models

    return tokens, starting_words

# Tokenize every story. Each call returns a (tokens, starting_words) pair and
# zip(*...) transposes those pairs into one sequence per new column.
urdu_stories_df['tokens'], urdu_stories_df['starting_words'] = zip(
    *urdu_stories_df['story_content'].apply(tokenize_and_extract_starting_words)
)

# Per-story n-gram lists derived from the token lists
urdu_stories_df['bigrams'] = urdu_stories_df['tokens'].apply(
    lambda story_tokens: [*bigrams(story_tokens)]
)
urdu_stories_df['trigrams'] = urdu_stories_df['tokens'].apply(
    lambda story_tokens: [*trigrams(story_tokens)]
)

# Flatten the per-story lists into corpus-wide n-gram lists
bigrams_list = [
    pair
    for per_story in urdu_stories_df['bigrams']
    for pair in per_story
]
trigrams_list = [
    triple
    for per_story in urdu_stories_df['trigrams']
    for triple in per_story
]

# Raw count tables (smoothing is applied later, at lookup time).
# Bigram table: first word -> counts of the following word.
cfd_bigrams = ConditionalFreqDist(bigrams_list)
# Trigram table: "w1 w2" (space-joined string) -> counts of the third word.
cfd_trigrams = ConditionalFreqDist(
    (' '.join((first, second)), third) for first, second, third in trigrams_list
)

# Laplace smoothing function for bigrams
def laplace_smoothing(cfd, word1, word2, vocab_size):
    """Return the add-one smoothed probability of `word2` following `word1`.

    `cfd` maps a word to a frequency distribution of its successors; the
    result is (count(word1, word2) + 1) / (total count after word1 + vocab_size).
    """
    pair_count = cfd[word1][word2]
    context_total = cfd[word1].N()
    numerator = pair_count + 1
    denominator = context_total + vocab_size
    return numerator / denominator

# Laplace smoothing function for trigrams
def laplace_smoothing_trigram(cfd, w1, w2, w3, vocab_size):
    """Return the add-one smoothed probability of `w3` following `w1 w2`.

    Args:
        cfd: Conditional frequency table mapping a two-word context to a
            frequency distribution of the third word.
        w1, w2: The two context words.
        w3: The candidate next word.
        vocab_size: Number of distinct tokens, added to the denominator.

    Returns:
        (count(w1 w2 w3) + 1) / (count(w1 w2 *) + vocab_size)
    """
    # The trigram table in this file is keyed by the space-joined string
    # "w1 w2". Looking it up with the tuple (w1, w2) always hit an empty
    # distribution, which made every candidate equally likely.
    context = w1 + ' ' + w2
    # Stay compatible with tables that really are keyed by (w1, w2) tuples.
    if context not in cfd and (w1, w2) in cfd:
        context = (w1, w2)

    distribution = cfd[context]
    return (distribution[w3] + 1) / (distribution.N() + vocab_size)

# Collect all unique starting words from all stories into a single pool
all_starting_words = list({
    word
    for story_starting_words in urdu_stories_df['starting_words']
    for word in story_starting_words
})

# Total unique tokens, used as the vocabulary size for smoothing.
# Built with a set comprehension rather than Series.sum(): summing lists
# re-copies the growing list for every story and returns 0 (not a list) on an
# empty frame, which would make set(...) raise.
vocab_size = len({
    token
    for story_tokens in urdu_stories_df['tokens']
    for token in story_tokens
})

# Define a set of valid end words for sentences
valid_end_words = {'آخر', 'پہلے', 'ختم', 'تک', 'بات', 'یہ', 'کیا', 'ہے', 'تھی', 'کرنا', 'گا', 'جاتا', 'پر', 'ہوں'}  # Add more words as needed

# Function to generate sentences using trigrams with improved coherence
def generate_sentence(cfd_model_bigram, cfd_model_trigram, start_word, length=10, used_words=None):
    """Generate one sentence, preferring the trigram model over the bigram model.

    Starting from `start_word`, words are appended one at a time until the
    sentence has `length` words, a word from `valid_end_words` is appended,
    or no unused continuation exists.

    Args:
        cfd_model_bigram: Table mapping a word to counts of the next word.
        cfd_model_trigram: Table mapping the space-joined string "w1 w2" to
            counts of the next word.
        start_word: First word of the sentence.
        length: Maximum number of words in the sentence.
        used_words: Set of words that must not be repeated. It is updated in
            place with every word appended, so a caller can share one set
            across several sentences. A fresh set is used when omitted.

    Returns:
        The generated words joined by single spaces.
    """
    # A `set()` default would be created once and silently shared by every
    # call that omits the argument.
    if used_words is None:
        used_words = set()

    sentence = [start_word]

    for _ in range(length - 1):
        next_word = None

        # First choice: the most probable continuation of the last two words.
        if len(sentence) >= 2:
            bigram_key = ' '.join(sentence[-2:])
            # Membership test first: indexing a defaultdict-based table would
            # insert an empty entry, and max() over no candidates would raise.
            if bigram_key in cfd_model_trigram and cfd_model_trigram[bigram_key]:
                next_word_probabilities = {
                    word: laplace_smoothing_trigram(
                        cfd_model_trigram, sentence[-2], sentence[-1], word, vocab_size
                    )
                    for word in cfd_model_trigram[bigram_key]
                }
                best = max(next_word_probabilities, key=next_word_probabilities.get)
                if best not in used_words:
                    next_word = best

        # Fallback: continue from the last word alone.
        if next_word is None:
            last_word = sentence[-1]
            if last_word in cfd_model_bigram and cfd_model_bigram[last_word]:
                candidates = cfd_model_bigram[last_word]
                next_word_probabilities = {
                    word: laplace_smoothing(cfd_model_bigram, last_word, word, vocab_size)
                    for word in candidates
                }
                best = max(next_word_probabilities, key=next_word_probabilities.get)
                if best not in used_words:
                    next_word = best
                else:
                    # Most probable word is taken: pick any unused successor.
                    unused = [word for word in candidates if word not in used_words]
                    if unused:
                        next_word = random.choice(unused)

        if next_word is None:
            break  # Stop if no next word is found

        sentence.append(next_word)
        used_words.add(next_word)

        # End the sentence as soon as a valid end word is produced.
        if next_word in valid_end_words:
            break

    return ' '.join(sentence)

# Function to generate paragraphs using the combined pool of starting words, maintaining coherence
def generate_paragraphs(num_paragraphs=3):
    """Print `num_paragraphs` generated paragraphs, one sentence per line.

    Each paragraph has a random number of sentences (5 to 19). The first
    sentence starts from a random corpus starting word; each later sentence
    tries to start from a bigram successor of the previous sentence's last
    word and falls back to a random starting word. Words are not repeated
    within a paragraph because one `used_words` set is shared by its sentences.
    """
    for _paragraph_index in range(num_paragraphs):
        sentences = []
        paragraph_vocabulary = set()  # Words already used in this paragraph
        sentence_count = random.randint(5, 19)

        for _sentence_index in range(sentence_count):
            if sentences:
                # Chain from the final word of the previous sentence
                previous_tail = sentences[-1].split()[-1]
                start_word = get_starting_word_based_on_last(previous_tail, cfd_bigrams)
            else:
                start_word = random.choice(all_starting_words)

            # No usable starting word: draw one from the global pool
            start_word = start_word or random.choice(all_starting_words)

            sentence_length = random.randint(5, 19)
            sentences.append(
                generate_sentence(
                    cfd_bigrams,
                    cfd_trigrams,
                    start_word,
                    length=sentence_length,
                    used_words=paragraph_vocabulary,
                )
            )

        # Emit the paragraph, then a separator after it
        print('\n'.join(sentences))
        print("\n")

# Function to get potential starting words based on the last word using bigram model
def get_starting_word_based_on_last(last_word, cfd_model_bigram):
    """Pick a random word that followed `last_word` in the bigram table.

    Args:
        last_word: The word whose successors are wanted.
        cfd_model_bigram: Mapping from a word to a mapping of its successor
            words (successor -> count).

    Returns:
        One successor chosen uniformly at random, or None when `last_word`
        is absent from the table or has no recorded successors.
    """
    # Direct lookup instead of scanning every key of the table. .get() also
    # avoids inserting an empty entry into a defaultdict-based table.
    successors = cfd_model_bigram.get(last_word)
    if not successors:
        return None
    return random.choice(list(successors))

# Run the paragraph generation with the default of three paragraphs; the
# generated text is printed to standard output.
generate_paragraphs()


جنگ میں کام کرنے کے بعد وہ پھر سے
لفنگوں، والی نہ تھی
ھوئی ۔ کیس چلا بالاخر یہ
اپیل اور رہائی قدرے آسان ہوتی ہے
گرلز کالج پہنچ گئے ہی نہیں سوچتے کہ چچا جان نے اس کی طرف دیکھا


پھر اس نے کہا ۔ میں حقہ گڑگڑاتا
رہا تھا دانیال بولتے ہوئے دراز قد شخص کو دیکھ کر وہ ایک بار پھر وکیلوں کی فیسیں کب
کون ہے
جین جلدی سے تھمے ہی نہیں،
پاکستان چلی گئیں تمہیں پہلی محبت چل رہی تھی
مہتاب علی صاحب کچھ الجھے رہو گے تین چار دن
شایان تم اب بھی نہیں بھولے بھالے اور اکلوتے صاحبزادے کے ساتھ لے پکڑ سو روپے
تو بتاؤ، اگر تجھ پرنہیں ڈالا کرتے تھے رمجو اٹھ سکتی ہوں
عادل نمائندے دونوں وہیں کھڑے وکیل سرکار ظہیر الدین بیمار پڑ گئے اپ ضرور لیں
پلیز مجھے جانے دو خاندان تھا، مختار احمد ٹھیک ہے، مرچیں لگ رہا ذہن معمولی بات
شدو مد نظر تھیں سونو کہاں
چوکتیں کیونکہ فصیح جب گائوں جانا ہم موت مرنے دیں شاید اسی لئے ٹریفک
سے جمیل کا تصور جاناں کئے بغیر خون اگلتا
ڈھیر ساری باتیں سن لیجئے، بھگوان آپ تشریف لارہے ہیں امریکہ پڑھنے
لگتی، جیسے اسے اپنے گھر اجڑ گیا رحیم میک کلس کھانے بیٹھا کسی عیب موجود نہ چلے،
ذرا سی ٹوپی رکھ دیا ا