In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer

def load_data(file_path, sheet_name):
    """Load one worksheet of an Excel workbook into a DataFrame.

    Args:
        file_path: Location of the workbook; forwarded to ``pd.read_excel``.
        sheet_name: Worksheet to read; forwarded as ``sheet_name``.

    Returns:
        The DataFrame produced by ``pd.read_excel``.

    Raises:
        FileNotFoundError: If the read raises ``FileNotFoundError``; the
            message names ``file_path``.
        ValueError: If the read raises ``ValueError``; the message is
            prefixed with ``"Value Error: "``.
        Exception: For any other failure; the message is prefixed with
            ``"An error occurred: "``.

    Every re-raised exception carries the original error as ``__cause__``
    so the root failure stays visible in the traceback.
    """
    try:
        return pd.read_excel(file_path, sheet_name=sheet_name)
    except FileNotFoundError as err:
        raise FileNotFoundError(
            f"Error: The file at path '{file_path}' was not found."
        ) from err
    except ValueError as ve:
        raise ValueError(f"Value Error: {ve}") from ve
    except Exception as e:
        # Kept as a generic Exception so existing callers see the same type;
        # the concrete error is preserved through explicit chaining.
        raise Exception(f"An error occurred: {e}") from e

def compute_mean_variance(A):
    """Return the per-column mean and variance of ``A``.

    Both statistics are reduced along axis 0 with ``np.mean`` and ``np.var``
    (NumPy's default ``ddof``), and are returned as a ``(mean, variance)``
    pair.
    """
    return np.mean(A, axis=0), np.var(A, axis=0)

def impute_and_classify(df, rich_threshold=200):
    """Impute missing purchase quantities and classify customers as RICH/POOR.

    The three quantity columns ('Candies (#)', 'Mangoes (Kg)',
    'Milk Packets (#)') are the features. A row is labelled 'RICH' when its
    'Payment (Rs)' is strictly greater than ``rich_threshold`` and 'POOR'
    otherwise.

    Args:
        df: DataFrame containing the three quantity columns and
            'Payment (Rs)'. It is not modified.
        rich_threshold: Payment above which a customer is labelled 'RICH'.
            Defaults to 200.

    Returns:
        The text report produced by ``classification_report`` for the
        held-out 30% of the rows.
    """
    feature_columns = ['Candies (#)', 'Mangoes (Kg)', 'Milk Packets (#)']
    features = df[feature_columns]

    # Labels are kept in a local array instead of being written back as a
    # 'Class' column, so the caller's DataFrame is left untouched.
    # A missing payment compares False against the threshold and is
    # therefore labelled 'POOR'.
    labels = np.where(df['Payment (Rs)'] > rich_threshold, 'RICH', 'POOR')

    # Split BEFORE fitting any preprocessing so that the imputation means and
    # the scaling statistics are learned from the training rows only; fitting
    # them on the full data would leak test-set information into training.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )

    # Fill missing quantities with the training-set column means
    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    # Standardize using training-set statistics
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train a logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # zero_division=1 is passed through unchanged from the original call
    return classification_report(y_test, y_pred, zero_division=1)

def main():
    """Run the lab experiments on a random sample of the purchase data.

    Loads the 'Purchase data' sheet, draws up to 20 rows with a fixed seed,
    prints the column-wise mean and variance of the three quantity columns
    (experiment A4), then prints the classification report for the same
    sample (experiment A6).
    """
    purchases = load_data("/content/Lab Session Data.xlsx", 'Purchase data')

    # Cap the sample at 20 rows, warning when the sheet holds fewer than that
    num_rows = len(purchases.index)
    if num_rows < 20:
        sample_size = num_rows
        print(f"Warning: Dataset contains only {num_rows} rows. Sampling will use {sample_size} rows.")
    else:
        sample_size = 20

    # Fixed seed keeps the drawn sample reproducible between runs
    sampled_df = purchases.sample(n=sample_size, random_state=42)

    # Experiment A4: mean and variance of the quantity matrix
    quantity_columns = ['Candies (#)', 'Mangoes (Kg)', 'Milk Packets (#)']
    quantity_matrix = sampled_df[quantity_columns].values
    col_mean, col_variance = compute_mean_variance(quantity_matrix)
    for heading, stats in (
        ("\nMean of the sampled data:", col_mean),
        ("\nVariance of the sampled data:", col_variance),
    ):
        print(heading)
        print(stats)

    # Experiment A6: impute missing values and classify
    report = impute_and_classify(sampled_df)
    print("\nClassification Report (Sampled):")
    print(report)

# Run the experiments only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()



Mean of the sampled data:
[19.8  3.2  3.1]

Variance of the sampled data:
[13.16  3.36  2.09]

Classification Report (Sampled):
              precision    recall  f1-score   support

        POOR       0.50      1.00      0.67         1
        RICH       1.00      0.50      0.67         2

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3

