<a href="https://colab.research.google.com/github/SharvinKumarArumugam/Githubproject/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# i. Function to load the dataset
def load_dataset(births_file, population_file, common_key):
    """Read two CSV sources and join them on a shared column.

    Args:
        births_file: Path or file-like object handed to ``pd.read_csv``.
        population_file: Path or file-like object handed to ``pd.read_csv``.
        common_key: Column name (or list of names) passed as ``on=`` to the
            merge; it must be present in both tables.

    Returns:
        The merged DataFrame. The merge uses pandas' default join type.
    """
    # Read in the same order as the arguments: births first, then population.
    births_frame, population_frame = (
        pd.read_csv(source) for source in (births_file, population_file)
    )
    return births_frame.merge(population_frame, on=common_key)

# ii. Function for data pre-processing and visualization using PCA
def preprocess_and_visualize(data, features, target):
    """Standardize the selected features and show a 2-component PCA scatter.

    Args:
        data: Table indexed as ``data[features]`` and ``data[target]``
            (presumably a pandas DataFrame -- confirm against the caller).
        features: Column selector for the model inputs.
        target: Column selector for the value used to color the points.

    Returns:
        A ``(X_scaled, y)`` tuple: the standardized feature matrix and the
        target column. The PCA projection is used only for the plot and is
        not returned.
    """
    feature_table = data[features]
    y = data[target]

    # Standardize the features before projecting them.
    X_scaled = StandardScaler().fit_transform(feature_table)

    # Project onto two principal components purely for plotting.
    projection = PCA(n_components=2).fit_transform(X_scaled)
    first_component = projection[:, 0]
    second_component = projection[:, 1]

    # Scatter of the two components, colored by the target values.
    plt.scatter(first_component, second_component, c=y, cmap='viridis')
    plt.title('PCA Visualization')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.show()

    return X_scaled, y

# iii. Mathematical formulas
# 1. Calculate mean square error
def calculate_mse(predictions, actual):
    """Return the mean squared error between predictions and actual values.

    Both inputs are converted to float NumPy arrays first, so plain lists are
    accepted and pandas objects are compared positionally instead of being
    aligned by index label (label alignment on mismatched indexes yields NaN).

    Args:
        predictions: Array-like of predicted values.
        actual: Array-like of observed values.

    Returns:
        The mean of the squared element-wise differences.

    Raises:
        ValueError: If the two inputs cannot be broadcast against each other.
    """
    predicted = np.asarray(predictions, dtype=float)
    observed = np.asarray(actual, dtype=float)

    # A (n,) vector against an (n, 1) column would silently broadcast to an
    # (n, n) matrix and produce a wrong score. When the inputs only differ by
    # singleton dimensions, drop those dimensions so they pair up one-to-one.
    if predicted.shape != observed.shape:
        squeezed_predicted = predicted.squeeze()
        squeezed_observed = observed.squeeze()
        if squeezed_predicted.shape == squeezed_observed.shape:
            predicted, observed = squeezed_predicted, squeezed_observed

    return np.mean((predicted - observed) ** 2)

# 2. Compute coefficients (weights) using matrix operations
def compute_coefficients(X, y):
    """Fit ordinary least-squares weights, including an intercept term.

    A column of ones is prepended to ``X`` so that the first returned weight
    is the bias; the remaining weights follow the column order of ``X``.

    The system is solved with ``np.linalg.lstsq`` rather than by explicitly
    inverting ``X.T @ X``. Explicit inversion raises ``LinAlgError`` when the
    columns are linearly dependent (for example a constant feature, which is
    all zeros after standardization) and loses precision when they are nearly
    so. ``lstsq`` returns the minimum-norm least-squares solution in those
    cases and the same solution as the normal equations otherwise.

    Args:
        X: Feature matrix, array-like of shape (n_samples, n_features) or
            (n_samples,).
        y: Target values, array-like with n_samples entries along axis 0.

    Returns:
        NumPy array of weights with the bias first.
    """
    feature_matrix = np.asarray(X, dtype=float)
    targets = np.asarray(y, dtype=float)

    # Add a column of ones for the bias term
    design = np.c_[np.ones(feature_matrix.shape[0]), feature_matrix]

    # rcond=None selects NumPy's machine-precision-based rank cutoff.
    weights, *_ = np.linalg.lstsq(design, targets, rcond=None)
    return weights

# 3. Model Training and Prediction
# a. Data Split: Divide the dataset into training and testing sets
def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and targets into training and testing subsets.

    Args:
        X: Feature matrix.
        y: Target values with the same number of samples as ``X``.
        test_size: Forwarded to ``train_test_split`` (default 0.2).
        random_state: Forwarded to ``train_test_split`` (default 42).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    partitions = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    return tuple(partitions)

# b. Training: Train your model using the training set
def train_model(X_train, y_train):
    """Fit the model on the training data.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The weights produced by ``compute_coefficients`` for the given data.
    """
    return compute_coefficients(X_train, y_train)

# c. Prediction: Use the trained model to make predictions on the testing set
def predict(X_test, weights):
    """Apply fitted linear weights to a feature matrix.

    Args:
        X_test: Feature matrix without a bias column.
        weights: Weight vector whose first entry is the bias term.

    Returns:
        The product of the bias-augmented feature matrix and ``weights``.
    """
    sample_count = X_test.shape[0]
    # Prepend a column of ones so the first weight acts as the intercept.
    design = np.column_stack((np.ones(sample_count), X_test))
    return design @ weights

# 4. Evaluation
# a. Assessment: Evaluate your model's performance by computing the mean square error (MSE)
def evaluate_model(predictions, actual):
    """Score predictions against the actual values.

    Args:
        predictions: Predicted values.
        actual: Observed values.

    Returns:
        The value returned by ``calculate_mse(predictions, actual)``.
    """
    return calculate_mse(predictions, actual)

# b. Comparison: Compare the model’s predictions against the actual values
def compare_predictions(predictions, actual):
    """Build a side-by-side table of actual and predicted values.

    Args:
        predictions: Predicted values.
        actual: Observed values.

    Returns:
        DataFrame with the columns ``Actual`` and ``Predicted``, in that order.
    """
    columns = {}
    columns['Actual'] = actual
    columns['Predicted'] = predictions
    return pd.DataFrame(columns)

# Main function
def main():
    """Run the whole pipeline and print the results.

    Steps, in order: load and merge the two CSV files, standardize the
    features and show the PCA plot, split into train and test sets, fit the
    weights, predict on the test set, then print the mean square error and
    an actual-versus-predicted table.

    The file names, join key, feature names and target name below are
    placeholders that must be replaced with real values.
    """
    # Load and merge the two sources.
    join_column = 'common_key'  # Replace with your common key
    merged = load_dataset('births.csv', 'population.csv', join_column)

    # Standardize the features and show the PCA scatter plot.
    feature_columns = ['feature1', 'feature2', '...']  # Replace with your features
    target_column = 'target_variable'  # Replace with your target variable
    scaled_features, labels = preprocess_and_visualize(
        merged, feature_columns, target_column
    )

    # Hold out part of the data for testing.
    train_X, test_X, train_y, test_y = split_data(scaled_features, labels)

    # Fit on the training portion, then predict the held-out portion.
    fitted_weights = train_model(train_X, train_y)
    test_predictions = predict(test_X, fitted_weights)

    # Report the error on the held-out data.
    mse = evaluate_model(test_predictions, test_y)
    print(f"Mean Square Error on Test Data: {mse}")

    # Show actual and predicted values side by side.
    report = compare_predictions(test_predictions, test_y)
    print("Model Comparison:")
    print(report)

# Run the pipeline only when this file is executed directly, not when it is imported
if __name__ == "__main__":
    main()


FileNotFoundError: ignored