In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np

# Function to load the data from Excel
def load_data(file_path):
    df = pd.read_excel(file_path)

    # Extract the 'ENGLISH' and 'HINDI' columns
    english_texts = df['ENGLISH'].astype(str).values  # Convert all values to strings
    hindi_texts = df['HINDI'].astype(str).values  # Convert all values to strings

    return english_texts, hindi_texts

# Function to convert text data to numerical features using TF-IDF
def vectorize_text(english_texts, hindi_texts):
    vectorizer = TfidfVectorizer()

    # Vectorize the English and Hindi texts
    x = vectorizer.fit_transform(english_texts).toarray()  # Convert English text to TF-IDF feature vectors
    y = vectorizer.transform(hindi_texts).toarray()  # Convert Hindi text to TF-IDF feature vectors

    return x, y

# Function to train the Linear Regression model
def train_linear_regression(x_train, y_train):
    reg = LinearRegression()
    reg.fit(x_train, y_train)
    return reg

# Function to evaluate the model
def evaluate_model(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mse, rmse, mape, r2

# Main function to execute the code
def main():
    # Load the data from 'Book1.xlsx'
    file_path = 'Book1.xlsx'
    english_texts, hindi_texts = load_data(file_path)

    # Convert text to numerical features using TF-IDF
    x, y = vectorize_text(english_texts, hindi_texts)

    # Split the data into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    # Train the model
    reg = train_linear_regression(x_train, y_train)

    # Predict the target values for both train and test sets
    y_train_pred = reg.predict(x_train)
    y_test_pred = reg.predict(x_test)

    # Evaluate the model on both train and test sets
    mse_train, rmse_train, mape_train, r2_train = evaluate_model(y_train, y_train_pred)
    mse_test, rmse_test, mape_test, r2_test = evaluate_model(y_test, y_test_pred)

    # Print results
    print("Train Set Metrics:")
    print(f"Mean Squared Error (MSE): {mse_train}")
    print(f"Root Mean Squared Error (RMSE): {rmse_train}")
    print(f"Mean Absolute Percentage Error (MAPE): {mape_train}")
    print(f"R^2 Score: {r2_train}")

    print("\nTest Set Metrics:")
    print(f"Mean Squared Error (MSE): {mse_test}")
    print(f"Root Mean Squared Error (RMSE): {rmse_test}")
    print(f"Mean Absolute Percentage Error (MAPE): {mape_test}")
    print(f"R^2 Score: {r2_test}")

# Call the main function
if __name__ == "__main__":
    main()


Train Set Metrics:
Mean Squared Error (MSE): 1.2218911268510803e-35
Root Mean Squared Error (RMSE): 3.495555931251966e-18
Mean Absolute Percentage Error (MAPE): 0.0002433346388138719
R^2 Score: 1.0

Test Set Metrics:
Mean Squared Error (MSE): 2.020652679733334e-06
Root Mean Squared Error (RMSE): 0.0014214966337397124
Mean Absolute Percentage Error (MAPE): 12695989952.006954
R^2 Score: 0.9991666802718867
