In [27]:
import numpy as np
import pandas as pd
import statistics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity


In [28]:
# Workbook holding the lab data. The bare filename is resolved against the
# notebook's current working directory.
file_path = r"Lab Session Data.xlsx"
# Open the workbook as a pandas ExcelFile handle.
xls = pd.ExcelFile(file_path)


In [None]:
#Q1
def evaluate_purchase_data(excel_file=None):
    """Summarise the "Purchase data" sheet and estimate the per-item costs.

    The columns at positions 1-3 are used as the purchase matrix A and the
    column at position 4 as the payment vector C. The item costs X are
    computed as ``pinv(A) @ C`` (Moore-Penrose pseudo-inverse).

    Args:
        excel_file: Workbook path (or anything ``pd.read_excel`` accepts).
            When omitted, the notebook-level ``file_path`` is used.

    Returns:
        ``(dimensionality, vector_count, rank, item_costs)`` where
        ``item_costs`` is a flat NumPy array, or ``(None, None, None, None)``
        when the workbook or the sheet cannot be read.
    """
    # Resolve the workbook up front so the error message below can name it.
    workbook = file_path if excel_file is None else excel_file
    try:
        purchase_df = pd.read_excel(workbook, sheet_name="Purchase data")
        purchase_mat = purchase_df.iloc[:, 1:4].values
        purchase_vals = purchase_df.iloc[:, 4].values.reshape(-1, 1)

        vector_count, mat_dimensionality = purchase_mat.shape
        mat_rank = np.linalg.matrix_rank(purchase_mat)
        item_costs = (np.linalg.pinv(purchase_mat) @ purchase_vals).flatten()

        print("Evaluation Results:")
        print(f"Dimensionality: {mat_dimensionality}")
        print(f"Number of Vectors: {vector_count}")
        print(f"Rank of Matrix: {mat_rank}")
        print(f"Item Costs: {item_costs}")
        return mat_dimensionality, vector_count, mat_rank, item_costs
    except FileNotFoundError:
        print(f"Error: File not found at {workbook}")
        return None, None, None, None
    except ValueError:
        print("Error: Could not read specified sheet from Excel file.")
        return None, None, None, None


In [None]:
#Q2
def calculate_model_vector():
    """Return the model vector X (item costs) from the purchase data.

    Delegates to ``evaluate_purchase_data`` and keeps only the last element
    of its four-value result.

    Returns:
        The item-cost vector, or ``None`` when the evaluation yielded no costs.
    """
    _dims, _count, _rank, cost_vector = evaluate_purchase_data()
    if cost_vector is None:
        return None

    print("Evaluation Result:")
    print(f"Model Vector X (Item Costs): {cost_vector}")
    return cost_vector


In [None]:
#Q3
def categorize_customers(excel_file=None):
    """Label each customer in the "Purchase data" sheet as RICH or POOR.

    A row is "RICH" when the value in the column at position 4 is greater
    than 200, otherwise "POOR".

    Args:
        excel_file: Workbook path (or anything ``pd.read_excel`` accepts).
            When omitted, the notebook-level ``file_path`` is used.

    Returns:
        A one-column DataFrame named "Customer Category", or ``None`` when the
        workbook or the sheet cannot be read.
    """
    workbook = file_path if excel_file is None else excel_file
    try:
        purchase_df = pd.read_excel(workbook, sheet_name="Purchase data")
        purchase_df["Customer Category"] = np.where(purchase_df.iloc[:, 4] > 200, "RICH", "POOR")
        categories = purchase_df[["Customer Category"]]
        print("Evaluation Result:")
        print(categories)
        return categories
    except FileNotFoundError:
        # Same handling as the other workbook readers in this notebook.
        print(f"Error: File not found at {workbook}")
        return None
    except ValueError:
        print("Error: Could not read 'Purchase data' from Excel.")
        return None

In [None]:
#Q4
def evaluate_irctc_stock(excel_file=None):
    """Compute summary statistics for the "IRCTC Stock Price" sheet and plot Chg% by weekday.

    Reads the "Date", "Price" and "Chg%" columns, derives the weekday name of
    each date, prints the statistics and shows a scatter plot of Chg% against
    the weekday.

    Args:
        excel_file: Workbook path (or anything ``pd.read_excel`` accepts).
            When omitted, the notebook-level ``file_path`` is used.

    Returns:
        ``(mean_price, price_variance, wednesday_mean_price, april_mean_price,
        loss_probability, wednesday_profit_probability)``, or six ``None``
        values when the workbook, the sheet or a required column is missing.
        The variance is the sample variance (``statistics.variance``).
    """
    workbook = file_path if excel_file is None else excel_file
    try:
        stock_df = pd.read_excel(workbook, sheet_name="IRCTC Stock Price")
        stock_df["Date"] = pd.to_datetime(stock_df["Date"])
        stock_df["Day"] = stock_df["Date"].dt.day_name()

        # Plain Python floats keep the statistics module independent of
        # NumPy scalar types.
        prices = stock_df["Price"].astype(float).tolist()
        avg_price = statistics.mean(prices)
        var_price = statistics.variance(prices)

        is_wednesday = stock_df["Day"] == "Wednesday"
        wed_avg_price = stock_df.loc[is_wednesday, "Price"].mean()
        april_avg_price = stock_df.loc[stock_df["Date"].dt.month == 4, "Price"].mean()
        loss_prob = (stock_df["Chg%"] < 0).mean()

        # Share of positive changes among the non-null Wednesday rows; this is
        # NaN (instead of a division by zero) when there is no Wednesday data.
        wed_changes = stock_df.loc[is_wednesday, "Chg%"].dropna()
        profit_prob_wed = (wed_changes > 0).mean()

        print("Evaluation Results:")
        print(f"Mean Price: {avg_price}")
        print(f"Variance Price: {var_price}")
        print(f"Wednesday Mean Price: {wed_avg_price}")
        print(f"April Mean Price: {april_avg_price}")
        print(f"Probability of Loss: {loss_prob}")
        print(f"Probability of Profit on Wednesday: {profit_prob_wed}")

        plt.figure(figsize=(10, 5))
        sns.scatterplot(x=stock_df["Day"], y=stock_df["Chg%"])
        plt.xlabel("Day of the Week")
        plt.ylabel("Change %")
        plt.xticks(rotation=45)
        plt.title("Change % vs. Day of the Week")
        plt.tight_layout()
        plt.show()
        return avg_price, var_price, wed_avg_price, april_avg_price, loss_prob, profit_prob_wed
    except FileNotFoundError:
        print(f"Error: File not found at {workbook}")
        return None, None, None, None, None, None
    except KeyError:
        print("Error: One or more required columns ('Price', 'Chg%', 'Date') are missing from the Excel sheet.")
        return None, None, None, None, None, None
    except ValueError:
        print("Error: Could not read 'IRCTC Stock Price' from Excel.")
        return None, None, None, None, None, None

In [None]:
#Q5
def analyze_thyroid_data(excel_file=None):
    """Describe the "thyroid0387_UCI" sheet and count its missing values.

    '?' cells are treated as missing. Missing values are counted per column
    before the object-typed columns are label-encoded (as strings, so a
    missing cell becomes its own category) for the numeric summary.

    Args:
        excel_file: Workbook path (or anything ``pd.read_excel`` accepts).
            When omitted, the notebook-level ``file_path`` is used.

    Returns:
        ``(summary, missing_vals)`` where ``summary`` is ``DataFrame.describe()``
        of the encoded frame and ``missing_vals`` the per-column missing count,
        or ``(None, None)`` when the workbook or the sheet cannot be read.
    """
    workbook = file_path if excel_file is None else excel_file
    try:
        thyroid_df = pd.read_excel(workbook, sheet_name="thyroid0387_UCI")
        # Re-infer dtypes so columns that only held numbers and '?' become numeric.
        thyroid_df = thyroid_df.replace('?', np.nan).infer_objects()
        missing_vals = thyroid_df.isnull().sum()

        categorical_columns = thyroid_df.select_dtypes(include=['object']).columns
        for column in categorical_columns:
            thyroid_df[column] = LabelEncoder().fit_transform(thyroid_df[column].astype(str))

        summary = thyroid_df.describe()
        print("Evaluation Results:")
        print(summary)
        print("Missing Values:\n", missing_vals)
        return summary, missing_vals
    except FileNotFoundError:
        print(f"Error: File not found at {workbook}")
        return None, None
    except ValueError:
        print("Error: Could not read specified sheet from Excel file.")
        return None, None


In [None]:
#Q6
def fill_missing_data(excel_file=None):
    """Load the "thyroid0387_UCI" sheet and impute its missing values.

    '?' cells are treated as missing. float64/int64 columns are filled with
    the column median; every other column is filled with its most frequent
    value.

    Args:
        excel_file: Workbook path (or anything ``pd.read_excel`` accepts).
            When omitted, the notebook-level ``file_path`` is used.

    Returns:
        The imputed DataFrame, or ``None`` when the workbook or the sheet
        cannot be read.
    """
    workbook = file_path if excel_file is None else excel_file
    try:
        thyroid_df = pd.read_excel(workbook, sheet_name="thyroid0387_UCI")
        # Re-infer dtypes so columns that only held numbers and '?' become numeric.
        thyroid_df = thyroid_df.replace('?', np.nan).infer_objects()

        for column in thyroid_df.columns:
            series = thyroid_df[column]
            if series.dtype in ['float64', 'int64']:
                fill_value = series.median()
            else:
                modes = series.mode()
                if modes.empty:
                    # No observed value to impute with; leave the column as is.
                    continue
                fill_value = modes.iloc[0]
            thyroid_df[column] = series.fillna(fill_value)

        print("Evaluation Results:")
        print(thyroid_df)
        return thyroid_df
    except FileNotFoundError:
        print(f"Error: File not found at {workbook}")
        return None
    except ValueError:
        print("Error: Could not read specified sheet from Excel file.")
        return None

In [None]:
#Q7
def standardize_data():
    """Label-encode and min-max scale the imputed thyroid data.

    Starts from ``fill_missing_data()``. Object-typed columns are label-encoded
    (as strings, matching ``analyze_thyroid_data``), then every float64/int64
    column, including the freshly encoded ones, is rescaled with
    ``MinMaxScaler``.

    Returns:
        The transformed DataFrame, or ``None`` when the data could not be loaded.
    """
    thyroid_df = fill_missing_data()
    if thyroid_df is None:
        return None

    categorical_columns = thyroid_df.select_dtypes(include=['object']).columns
    for column in categorical_columns:
        # Casting to str keeps the encoder from failing on mixed-type columns.
        thyroid_df[column] = LabelEncoder().fit_transform(thyroid_df[column].astype(str))

    numerical_columns = thyroid_df.select_dtypes(include=['float64', 'int64']).columns
    if len(numerical_columns) > 0:
        # The scaler rejects an empty feature set, so only scale when needed.
        thyroid_df[numerical_columns] = MinMaxScaler().fit_transform(thyroid_df[numerical_columns])

    print("Evaluation Results:")
    print(thyroid_df)
    return thyroid_df


In [None]:
#Q8
def compute_jaccard_smc():
    """Compute the Jaccard coefficient and SMC for the first two data rows.

    Only positions where both rows hold exactly 0 or exactly 1 contribute to
    the f00/f01/f10/f11 counts; any other value pair is ignored.

    Returns:
        ``(jaccard_coeff, smc)``, each 0 when its denominator is 0, or
        ``(None, None)`` when the standardized data is unavailable.
    """
    frame = standardize_data()
    if frame is None:
        return None, None

    row_a = frame.iloc[0, :].values
    row_b = frame.iloc[1, :].values

    # NOTE(review): attributes are filtered by value, not by column, so a
    # scaled non-binary attribute that happens to equal 0 or 1 is counted as
    # well. Confirm this is intended.
    a_one, a_zero = row_a == 1, row_a == 0
    b_one, b_zero = row_b == 1, row_b == 0

    f11 = np.sum(a_one & b_one)
    f00 = np.sum(a_zero & b_zero)
    f10 = np.sum(a_one & b_zero)
    f01 = np.sum(a_zero & b_one)

    jaccard_total = f01 + f10 + f11
    if jaccard_total != 0:
        jaccard_coeff = f11 / jaccard_total
    else:
        jaccard_coeff = 0

    smc_total = f00 + f01 + f10 + f11
    if smc_total != 0:
        smc = (f11 + f00) / smc_total
    else:
        smc = 0

    print("Evaluation Results:")
    print(f"Jaccard Coefficient: {jaccard_coeff}, SMC: {smc}")
    return jaccard_coeff, smc


In [None]:
#Q9
def compute_cosine_similarity():
    """Compute the cosine similarity between the first two standardized rows.

    Each row is reshaped to a ``(1, n_features)`` matrix because scikit-learn's
    ``cosine_similarity`` operates on 2-D inputs.

    Returns:
        The similarity as a scalar, or ``None`` when the standardized data is
        unavailable.
    """
    standardized_df = standardize_data()
    if standardized_df is None:
        return None

    vec1 = standardized_df.iloc[0, :].values.reshape(1, -1)
    vec2 = standardized_df.iloc[1, :].values.reshape(1, -1)
    # The result is a 1x1 matrix; unwrap its single entry.
    cosine_sim = cosine_similarity(vec1, vec2)[0][0]
    print("Evaluation Result:", cosine_sim)
    return cosine_sim


In [None]:
#Q10
def plot_dissimilarity_heatmap(n_vectors=20):
    """Plot a heatmap of pairwise Euclidean distances between the first rows.

    Args:
        n_vectors: Maximum number of leading rows of the standardized data to
            compare (default 20). Fewer rows are used when the data set is
            smaller, instead of failing with an out-of-bounds index.

    Returns:
        The square distance matrix as a NumPy array (zero diagonal), or
        ``None`` when the standardized data is unavailable.
    """
    standardized_df = standardize_data()
    if standardized_df is None:
        return None

    # Slicing clips to the rows that actually exist.
    subset = standardized_df.iloc[:n_vectors, :].to_numpy(dtype=float)

    # Broadcast to all row pairs at once: deltas[i, j] = subset[i] - subset[j].
    deltas = subset[:, np.newaxis, :] - subset[np.newaxis, :, :]
    dissimilarity_matrix = np.sqrt((deltas ** 2).sum(axis=2))

    plt.figure(figsize=(10, 8))
    sns.heatmap(dissimilarity_matrix, annot=False, cmap='coolwarm')
    plt.title("Heatmap of Euclidean Distances (Dissimilarity)")
    plt.tight_layout()
    plt.show()
    print("Evaluation Result: (Euclidean Distance Matrix - Not Printed for brevity)")
    return dissimilarity_matrix