In [13]:
import markovify
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


1. Selecciona un cuerpo de texto de interés (extensión .txt). 

In [15]:
def read_text(file_path):
    """
    Load a UTF-8 encoded text file and return everything it contains.

    Parameters:
    file_path (str): Location of the text file to load.

    Returns:
    str: The whole file content as one string.
    """
    with open(file_path, mode="r", encoding="utf-8") as source:
        contents = source.read()
    return contents

def save_text(file_path, text):
    """
    Write a string to a file as UTF-8, replacing any previous content.

    Parameters:
    file_path (str): Destination path of the file to write.
    text (str): The content to store in the file.
    """
    with open(file_path, mode="w", encoding="utf-8") as target:
        target.write(text)

2. Genera un cuerpo de texto sintético utilizando herramientas como Markovify.

In [14]:
def generate_synthetic_text(model, num_sentences=100):
    """
    Build a synthetic text by repeatedly sampling sentences from a Markov model.

    Parameters:
    model (markovify.Text): The Markov model built from the original text.
    num_sentences (int): How many times the model is asked for a sentence.
        Attempts for which the model returns None are skipped, so the result
        may contain fewer sentences than requested.

    Returns:
    str: Every produced sentence followed by a single space, concatenated
    into one string (empty when no sentence was produced).
    """
    attempts = (model.make_sentence() for _ in range(num_sentences))
    # Each kept sentence carries its own trailing space, so the pieces are
    # joined with an empty separator.
    return "".join(
        candidate + " " for candidate in attempts if candidate is not None
    )

3. Transforma el cuerpo de texto original y el sintético a una representación vectorial, por ejemplo tf–idf.

In [None]:
def transform_to_tfidf_vector(texts):
    """
    Convert a collection of texts into their TF-IDF representation.

    A new TfidfVectorizer with default settings is fitted on the given texts
    each time the function is called; the fitted vectorizer itself is not
    returned.

    Parameters:
    texts (list of str): The texts to vectorize.

    Returns:
    scipy.sparse matrix: The TF-IDF matrix produced by fitting on `texts`.
    """
    return TfidfVectorizer().fit_transform(texts)

def calculate_cosine_similarity(tfidf_matrix):
    """
    Calculates the cosine similarity between the first two rows of a TF-IDF matrix.

    Parameters:
    tfidf_matrix (scipy.sparse matrix or array-like): Matrix whose first two
        rows are the vectors of the two texts to compare. Any rows after the
        second are ignored.

    Returns:
    float: The cosine similarity score between the two texts.

    Raises:
    ValueError: If the matrix has fewer than two rows, since there is then
        no second vector to compare against.
    """
    # Sparse matrices do not support len(), so read the row count from
    # .shape and only fall back to len() for plain sequences.
    shape = getattr(tfidf_matrix, "shape", None)
    row_count = shape[0] if shape is not None else len(tfidf_matrix)
    if row_count < 2:
        raise ValueError(
            "tfidf_matrix must contain at least two rows to compare, "
            f"got {row_count}"
        )

    # Slicing (rather than integer indexing) keeps each operand 2-D, which is
    # the input form cosine_similarity expects.
    cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    # The result is a 1x1 matrix; unwrap its single entry.
    return cosine_sim[0][0]

In [16]:
# Load the source corpus. ORIGINAL_TEXT holds the file's *content*; the path
# is relative, so it is resolved against the current working directory.
ORIGINAL_TEXT = read_text("texto.txt")
# Output *path* for the generated text (not text content, unlike ORIGINAL_TEXT).
# NOTE(review): the name is misspelled ("SYNTETIC") and would read better as a
# *_PATH constant; left unchanged here because other cells may reference it.
SYNTETIC_TEXT = "texto_sintetico.txt"

markov_model = markovify.Text(ORIGINAL_TEXT) # Build the Markov model from the corpus
# Ask the model for 100 sentences; generate_synthetic_text skips attempts that
# return None, so the result may contain fewer than 100.
# NOTE(review): presumably the sampling is random and no seed is set here, so
# the generated text would differ between runs -- confirm and consider seeding.
synthetic_text = generate_synthetic_text(markov_model, num_sentences=100)

# Persist the generated text, overwriting any previous output file.
save_text(SYNTETIC_TEXT, synthetic_text)