In [27]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def calculate_mean_vector(tfidf_matrix):
    """
    Average the rows of a TF-IDF matrix into a single vector.

    Parameters:
    - tfidf_matrix (numpy.ndarray): Matrix whose rows are the vectors to average.

    Returns:
    - numpy.ndarray: Column-wise mean, i.e. one value per matrix column.
    """
    # axis=0 collapses the row dimension, leaving one mean per column.
    return np.mean(tfidf_matrix, axis=0)

def calculate_std_vector(tfidf_matrix):
    """
    Compute the per-column standard deviation of a TF-IDF matrix.

    Parameters:
    - tfidf_matrix (numpy.ndarray): Matrix whose rows are the vectors to summarize.

    Returns:
    - numpy.ndarray: Column-wise population standard deviation (ddof=0),
      one value per matrix column.
    """
    # ddof=0 is NumPy's default (divide by N); spelled out to make the
    # population-vs-sample choice visible to the reader.
    return np.std(tfidf_matrix, axis=0, ddof=0)

def calculate_scalar_mean(mean_vector):
    """
    Reduce a mean vector to one number by averaging its components.

    Parameters:
    - mean_vector (numpy.ndarray): Vector of per-column means.

    Returns:
    - float: Arithmetic mean over every component of mean_vector.
    """
    # No axis is given, so the average runs over all elements.
    return np.mean(mean_vector)

def calculate_scalar_std(std_vector):
    """
    Reduce a standard deviation vector to one number by averaging its components.

    Note that the result is the mean of the per-column standard deviations,
    not a standard deviation computed over the vector itself.

    Parameters:
    - std_vector (numpy.ndarray): Vector of per-column standard deviations.

    Returns:
    - float: Arithmetic mean over every component of std_vector.
    """
    # No axis is given, so the average runs over all elements.
    return np.mean(std_vector)

def calculate_distance(vector1, vector2):
    """
    Calculate the Euclidean distance between two vectors.

    Inputs are coerced with numpy.asarray, so plain Python sequences are
    accepted as well as arrays. Boolean and unsigned-integer inputs are
    promoted to float64 before subtracting: NumPy refuses to subtract
    boolean arrays, and unsigned subtraction wraps around on negative
    differences (uint8 1 - 2 == 255), which would yield a wrong distance.

    Parameters:
    - vector1 (array-like): The first vector.
    - vector2 (array-like): The second vector.

    Returns:
    - distance (float): The Euclidean distance between the two vectors.
    """
    first = np.asarray(vector1)
    second = np.asarray(vector2)

    # 'b' = boolean, 'u' = unsigned integer: neither supports a signed difference.
    if first.dtype.kind in "bu":
        first = first.astype(np.float64)
    if second.dtype.kind in "bu":
        second = second.astype(np.float64)

    distance = np.linalg.norm(first - second)
    return distance

def main(excel_path='/content/Book1.xlsx'):
    """
    Compare TF-IDF statistics of the ENGLISH and HINDI columns of a workbook.

    Both columns are vectorized with a single TfidfVectorizer fitted on the
    combined texts, so the two resulting matrices share one vocabulary and
    their mean vectors can be compared component by component. The scalar
    mean, the scalar standard deviation of each language, and the Euclidean
    distance between the two mean vectors are printed to stdout.

    Parameters:
    - excel_path (str): Path of the Excel file to read. Defaults to the
      original hard-coded location.

    Raises:
    - KeyError: If the workbook has no ENGLISH or HINDI column.
    - ValueError: If either column contains no text after empty cells are dropped.
    """
    # Load the dataset from the Excel file
    df = pd.read_excel(excel_path)

    # Strip leading/trailing spaces from column names
    df.columns = df.columns.str.strip()

    # Empty cells are read as NaN, which the vectorizer cannot tokenize, so
    # they are dropped; remaining cells are coerced to str so that numeric
    # cells do not break tokenization either.
    english_texts = df['ENGLISH'].dropna().astype(str).tolist()
    hindi_texts = df['HINDI'].dropna().astype(str).tolist()

    if not english_texts or not hindi_texts:
        raise ValueError("ENGLISH and HINDI columns must each contain at least one text")

    # Fit on the combined texts so both languages share one feature space
    vectorizer = TfidfVectorizer()
    tfidf_matrix_combined = vectorizer.fit_transform(english_texts + hindi_texts).toarray()

    # Split the combined matrix back into ENGLISH and HINDI rows
    split_index = len(english_texts)
    tfidf_matrix_english = tfidf_matrix_combined[:split_index]
    tfidf_matrix_hindi = tfidf_matrix_combined[split_index:]

    # Calculate mean and standard deviation vectors for ENGLISH and HINDI
    mean_vector_english = calculate_mean_vector(tfidf_matrix_english)
    std_vector_english = calculate_std_vector(tfidf_matrix_english)

    mean_vector_hindi = calculate_mean_vector(tfidf_matrix_hindi)
    std_vector_hindi = calculate_std_vector(tfidf_matrix_hindi)

    # Collapse each vector to a single scalar for reporting
    mean_english = calculate_scalar_mean(mean_vector_english)
    mean_hindi = calculate_scalar_mean(mean_vector_hindi)

    std_english = calculate_scalar_std(std_vector_english)
    std_hindi = calculate_scalar_std(std_vector_hindi)

    # Calculate distance between mean vectors
    distance_mean_vectors = calculate_distance(mean_vector_english, mean_vector_hindi)

    # Print scalar means, standard deviations, and distance between mean vectors
    print("Mean for ENGLISH:")
    print(mean_english)
    print("Standard Deviation for ENGLISH:")
    print(std_english)

    print("Mean for HINDI:")
    print(mean_hindi)
    print("Standard Deviation for HINDI:")
    print(std_hindi)

    print("Distance between mean vectors (ENGLISH vs HINDI):")
    print(distance_mean_vectors)

# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()


Mean for ENGLISH:
0.0007304245394682649
Standard Deviation for ENGLISH:
0.012986468614349269
Mean for HINDI:
0.0005302260084220965
Standard Deviation for HINDI:
0.006542582317355682
Distance between mean vectors (ENGLISH vs HINDI):
0.20573735074386956
