In [9]:
import markovify
import math
from collections import Counter
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import tokenize
from io import BytesIO


1. Selecciona un cuerpo de texto de interés (extensión .txt). 

In [10]:
def read_text(file_path):
    """
    Reads the text from the specified file.

    Parameters:
    file_path (str): The path to the text file.

    Returns:
    str: The content of the file as a single string.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()

def save_text(file_path, text):
    """
    Saves the given text to a file.

    Parameters:
    file_path (str): The path to the file where the text will be saved.
    text (str): The text to save.
    """
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(text)

2. Genera un cuerpo de texto sintético utilizando herramientas como MarkovifyLinks to an external site. 

In [11]:
def generate_synthetic_text(model, num_sentences=100):
    """
    Generates synthetic text using a Markov model.

    Parameters:
    model (markovify.Text): The Markov model generated from the original text.
    num_sentences (int): The number of sentences to generate.

    Returns:
    str: The generated synthetic text as a single string.
    """
    synthetic_text = ""
    for _ in range(num_sentences):
        sentence = model.make_sentence()
        if sentence is not None:
            synthetic_text += sentence + " "
    return synthetic_text

3. Transforma el cuerpo de texto original y el sintético a una representación vectorial, por ejemplo tf–idf.

In [12]:
def preprocess_text(text):
    """
    Preprocesses the text by converting to lowercase and removing stop words.

    Parameters:
    text (str): The text to preprocess.

    Returns:
    list: The preprocessed words in the text.
    """
    words = text.lower().split()
    words = [word for word in words if word not in ENGLISH_STOP_WORDS]
    return words

def compute_tf(word_dict, word_count):
    """
    Computes the term frequency for each word in the word dictionary.

    Parameters:
    word_dict (dict): A dictionary of words and their counts in the text.
    word_count (int): The total number of words in the text.

    Returns:
    dict: The term frequency for each word.
    """
    tf_dict = {}
    for word, count in word_dict.items():
        tf_dict[word] = count / float(word_count)
    return tf_dict

def compute_idf(documents):
    """
    Computes the inverse document frequency for each word in the documents.

    Parameters:
    documents (list): A list of lists, where each sublist contains the words in a document.

    Returns:
    dict: The inverse document frequency for each word.
    """
    N = len(documents)
    idf_dict = dict.fromkeys(documents[0], 0)
    for document in documents:
        for word in document:
            if word in idf_dict:
                idf_dict[word] += 1

    for word, val in idf_dict.items():
        idf_dict[word] = math.log(N / float(val))
    return idf_dict

def compute_tfidf(tf, idf):
    """
    Computes the TF-IDF score for each term in a document.

    Parameters:
    tf (dict): A dictionary mapping terms to their TF values.
    idf (dict): A dictionary mapping terms to their IDF values.

    Returns:
    dict: A dictionary mapping terms to their TF-IDF values.
    """
    tfidf = {}
    for word, val in tf.items():
        tfidf[word] = val * idf.get(word, 0.0)
    return tfidf

def transform_to_tfidf_vector(text, idf_dict):
    """
    Transforms the given text into a TF-IDF vector representation.

    Parameters:
    text (str): The text to transform.
    idf_dict (dict): The inverse document frequency for each word.

    Returns:
    dict: The TF-IDF vector representation of the text.
    """
    words = preprocess_text(text)
    word_count = len(words)
    word_dict = Counter(words)
    tf = compute_tf(word_dict, word_count)
    tfidf = compute_tfidf(tf, idf_dict)
    return tfidf

4. Utiliza una métrica de similitud como la distancia del coseno para obtener un valor de similitud. 

In [13]:
def calculate_cosine_similarity(tfidf1, tfidf2):
    """
    Calculates the cosine similarity between two TF-IDF vectors.

    Parameters:
    tfidf1 (dict): The TF-IDF vector representation of the first text.
    tfidf2 (dict): The TF-IDF vector representation of the second text.

    Returns:
    float: The cosine similarity score between the two texts.
    """
    common_words = set(tfidf1.keys()).intersection(set(tfidf2.keys()))

    dot_product = 0
    for word in common_words:
        dot_product += tfidf1[word] * tfidf2[word]
    
    magnitude1 = 0
    for val in tfidf1.values():
        magnitude1 += val ** 2
    magnitude1 = math.sqrt(magnitude1)
    
    magnitude2 = 0
    for val in tfidf2.values():
        magnitude2 += val ** 2
    magnitude2 = math.sqrt(magnitude2)
    
    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0
    
    return dot_product / (magnitude1 * magnitude2)


5. Compara contra otros cuerpos de texto.(CODIGO)

In [14]:
def preprocess_code(code):
    """
    Preprocesses the code by tokenizing it.

    Parameters:
    code (str): The code to preprocess.

    Returns:
    list: The tokens in the code.
    """
    tokens = []
    try:
        bytes_io = BytesIO(code.encode('utf-8'))
        for token in tokenize.tokenize(bytes_io.readline):
            # Exclude specific token types that are not useful for analysis
            if token.type not in [
                tokenize.ENCODING, 
                tokenize.ENDMARKER, 
                tokenize.NEWLINE, 
                tokenize.NL, 
                tokenize.INDENT, 
                tokenize.DEDENT,
                tokenize.COMMENT
            ]:
                tokens.append(token.string.lower())
    except tokenize.TokenError:
        pass
    
    return tokens

In [15]:
ORIGINAL_TEXT = read_text("texto.txt")
SYNTETIC_TEXT = "texto_sintetico.txt"

#Problem 2
markov_model = markovify.Text(ORIGINAL_TEXT) # Build the Markov model
synthetic_text = generate_synthetic_text(markov_model, num_sentences=100)

save_text(SYNTETIC_TEXT, synthetic_text)

#Problem 3
original_words = preprocess_text(ORIGINAL_TEXT)
synthetic_words = preprocess_text(synthetic_text)
idf_dict = compute_idf([original_words, synthetic_words])

original_tfidf_vector = transform_to_tfidf_vector(ORIGINAL_TEXT, idf_dict)
synthetic_tfidf_vector = transform_to_tfidf_vector(synthetic_text, idf_dict)

print("Original TF-IDF vector:\n", original_tfidf_vector)
print("Synthetic TF-IDF vector:\n", synthetic_tfidf_vector)

#Problem 4
similarity_score = calculate_cosine_similarity(original_tfidf_vector, synthetic_tfidf_vector)
print("Cosine similarity:", similarity_score)


Original TF-IDF vector:
 {'kaladin': -0.18586206655536278, '(apodado': 0.0025297342356202382, 'kal)': 0.0025297342356202382, 'es': 0.0, 'niño': 0.0025297342356202382, 'que': -0.16921765255611218, 'vive': 0.0025297342356202382, 'en': -0.032696340679344064, 'pueblo': -0.010978667129753825, 'llamado': 0.0025297342356202382, 'piedralar': 0.0025297342356202382, 'su': -0.09398177590856059, 'madre': 0.0025297342356202382, 'hesina,': 0.0025297342356202382, 'hermano': -0.027653963258849515, 'tien': -0.018138004742978105, 'y': -0.13586968210512232, 'padre': -0.055117242447889826, 'lirin.': 0.0025297342356202382, 'cirujano': 0.0025297342356202382, 'experto': 0.0025297342356202382, 'cura': 0.0025297342356202382, 'las': -0.03794601353430357, 'personas': 0.0025297342356202382, 'heridas': 0.0025297342356202382, 'junto': 0.0025297342356202382, 'kaladin,': -0.018138004742978105, 'ya': 0.0, 'soporta': -0.008913675311566439, 'ver': -0.008913675311566439, 'la': -0.11060460755430025, 'sangre.': -0.00891367

In [16]:
#Problem 5
CODE1 = read_text("code1.cpp")
CODE2 = read_text("code2.cpp")

preprocess_code1 = preprocess_code(CODE1)
preprocess_code2 = preprocess_code(CODE2)
idf_dict = compute_idf([preprocess_code1, preprocess_code2])

code1_tfidf_vector = transform_to_tfidf_vector(CODE1, idf_dict)
code2_tfidf_vector = transform_to_tfidf_vector(CODE2, idf_dict)

print("Original TF-IDF vector:\n", code1_tfidf_vector)
print("Synthetic TF-IDF vector:\n", code2_tfidf_vector)

similarity_score_code = calculate_cosine_similarity(code1_tfidf_vector, code2_tfidf_vector)
print("Cosine similarity:", similarity_score_code)

Original TF-IDF vector:
 {'//': -0.2647391311512829, 'c++': 0.0, 'program': 0.0, "dijkstra's": 0.0, 'single': 0.0, 'source': 0.0, 'shortest': 0.0, 'path': 0.0, 'algorithm.': 0.0, 'adjacency': 0.0, 'matrix': 0.0, 'representation': 0.0, 'graph': 0.0, '#include': 0.0, '<iostream>': 0.0, 'using': 0.0, 'namespace': 0.0, 'std;': 0.0, 'number': 0.0, 'vertices': 0.0, '#define': 0.0, 'v': 0.0, '9': 0.0, 'utility': 0.0, 'function': 0.0, 'vertex': 0.0, 'minimum': 0.0, 'distance': 0.0, 'value,': 0.0, 'set': 0.0, 'included': 0.0, 'tree': 0.0, 'int': 0.0, 'mindistance(int': 0.0, 'dist[],': 0.0, 'bool': 0.0, 'sptset[])': 0.0, '{': 0.0, 'initialize': 0.0, 'min': 0.0, 'value': 0.0, '=': 0.0, 'int_max,': 0.0, 'min_index;': 0.0, '(int': 0.0, '0;': 0.0, '<': 0.0, 'v;': 0.0, 'v++)': 0.0, '(sptset[v]': 0.0, '==': 0.0, 'false': 0.0, '&&': 0.0, 'dist[v]': 0.0, '<=': 0.0, 'min)': 0.0, 'dist[v],': 0.0, 'min_index': 0.0, 'return': 0.0, '}': 0.0, 'print': 0.0, 'constructed': 0.0, 'array': 0.0, 'void': 0.0, 'print