# In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, LeaveOneOut
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.datasets import load_iris, make_regression
from sklearn.metrics import accuracy_score, mean_squared_error

# Cross-validation in machine learning
# Cross-validation is a technique for estimating how well a model will perform on new, unseen data.
# Instead of splitting the dataset into training and test sets only once, it trains and tests the model several times on different subsets and averages the results.
# Common approaches include:
# - Holdout Method: Splitting the dataset into training and test sets (e.g., 80%-20%)
# - K-Fold Cross-Validation: Dividing data into K parts, training on K-1 parts, and testing on the remaining part
# - Stratified K-Fold: Similar to K-Fold but ensures class proportions remain balanced
# - LOOCV (Leave-One-Out Cross Validation): Trains on all samples except one and tests on the held-out sample, repeated once for every sample
# - Time Series Cross-Validation: Used for time-dependent data, ensuring past data is used to predict future data

def cross_validation_methods():
    """Demonstrate holdout, K-Fold, Stratified K-Fold and Leave-One-Out evaluation.

    Trains a decision tree on the Iris dataset and prints the accuracy
    obtained under each evaluation strategy. Nothing is returned; results
    are written to standard output.
    """
    # Imported here because the module-level import list does not include it.
    from sklearn.model_selection import train_test_split

    seed = 42  # fixed seed so repeated runs print the same numbers
    X, y = load_iris(return_X_y=True)
    model = DecisionTreeClassifier(random_state=seed)

    # Holdout: a single 80%/20% split. `stratify=y` keeps the class
    # proportions identical in both parts.
    print("Holdout Cross Validation:")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=seed
    )
    model.fit(X_train, y_train)
    print("Accuracy:", accuracy_score(y_test, model.predict(X_test)))

    # K-Fold: the Iris samples are ordered by class, so the folds must be
    # shuffled; contiguous folds would be heavily class-skewed and bias the
    # score downwards.
    print("\nK-Fold Cross Validation:")
    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    scores = cross_val_score(model, X, y, cv=kf)
    print("Mean Accuracy:", np.mean(scores))

    # Stratified K-Fold preserves class proportions in every fold, so the
    # sample ordering is not a problem here.
    print("\nStratified K-Fold Cross Validation:")
    skf = StratifiedKFold(n_splits=5)
    scores = cross_val_score(model, X, y, cv=skf)
    print("Mean Accuracy:", np.mean(scores))

    # Leave-One-Out: one fit per sample, each tested on the single held-out
    # sample; the mean of the 0/1 scores is the overall accuracy.
    print("\nLeave-One-Out Cross Validation:")
    loo = LeaveOneOut()
    scores = cross_val_score(model, X, y, cv=loo)
    print("Mean Accuracy:", np.mean(scores))

# Assumptions of Linear Regression
# - Linearity: The relationship between input and output should be a straight line
# - Independence: Observations (and therefore their errors) should be independent of one another
# - Homoscedasticity: The spread of errors should be consistent across values
# - Normality of Residuals: Errors should follow a normal distribution
# - No Multicollinearity: Independent variables should not be highly correlated

def linear_regression_assumptions():
    """Draw a residual-vs-fitted plot for a one-feature linear regression.

    Generates a synthetic regression dataset, fits ``LinearRegression`` on
    it, and shows a scatter plot of the residuals against the predicted
    values with a dashed zero line. A roughly even spread around that line
    is the visual cue for homoscedasticity. Nothing is returned.
    """
    features, target = make_regression(n_samples=100, n_features=1, noise=10)

    # Fit and predict on the same data: the residuals of the fitted model
    # are what the diagnostic plot inspects.
    fitted = LinearRegression().fit(features, target).predict(features)
    errors = target - fitted

    plt.figure(figsize=(10, 5))
    sns.scatterplot(x=fitted, y=errors)
    plt.axhline(y=0, color='r', linestyle='--')  # reference line: zero error
    plt.title("Residual Plot to Check Homoscedasticity")
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.show()

# Machine Learning Models
# - Decision Tree: A simple, rule-based model that splits data based on conditions
# - Random Forest: A collection of decision trees improving accuracy and reducing overfitting
# - SVM (Support Vector Machine): Finds the best boundary between categories, useful for high-dimensional data

def compare_models():
    """Compare Decision Tree, Random Forest, and SVM on the Iris dataset.

    Each model is scored with 5-fold stratified cross-validation and its
    mean accuracy is printed. Scoring on held-out folds, rather than on the
    data the model was fitted on, measures generalization; training-set
    accuracy is close to 1.00 for tree models regardless of their quality.
    Nothing is returned.
    """
    seed = 42  # fixed seed so repeated runs print the same numbers
    X, y = load_iris(return_X_y=True)

    models = {
        "Decision Tree": DecisionTreeClassifier(random_state=seed),
        "Random Forest": RandomForestClassifier(random_state=seed),
        "SVM": SVC(),
    }

    # A seeded splitter yields the same folds on every call, so all models
    # are evaluated on identical train/test partitions.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    for name, model in models.items():
        scores = cross_val_score(model, X, y, cv=cv, scoring="accuracy")
        acc = np.mean(scores)
        print(f"{name} Accuracy: {acc:.2f}")

# Run the demonstrations in order: the cross-validation comparison (printed),
# the residual plot (opens a figure via plt.show()), then the model
# comparison (printed).
# NOTE: these calls are at module top level, so they execute whenever this
# file is run or imported.
cross_validation_methods()
linear_regression_assumptions()
compare_models()
