In [None]:
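# Datasets
# Mall customers (clustering): https://www.kaggle.com/datasets/shwetabh123/mall-customers
# California housing prices (regression): https://www.kaggle.com/datasets/camnugent/california-housing-prices
# Liver disease patients (classification): https://www.kaggle.com/datasets/abhi8923shriv/liver-disease-patient-dataset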

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the mall-customer table; `df` is also used by the distribution-plot cell below.
df = pd.read_csv('Mall_customers.csv')

# 1. Correlation Matrix
# Keep only the three numeric customer attributes before correlating them.
numeric_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
df_numeric = df[numeric_columns]

# Pairwise correlation between the selected columns (pandas default settings).
correlation_matrix = df_numeric.corr()

# Draw the matrix as an annotated heatmap on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Distribution Plot
# One histogram with a KDE overlay per attribute, laid out side by side.
# Each entry is (column in `df`, bar colour, panel title).
panels = [
    ('Age', 'skyblue', 'Distribution of Age'),
    ('Annual Income (k$)', 'green', 'Distribution of Annual Income (k$)'),
    ('Spending Score (1-100)', 'orange', 'Distribution of Spending Score (1-100)'),
]

fig, axes = plt.subplots(1, 3, figsize=(12, 6))
for ax, (column, colour, heading) in zip(axes, panels):
    sns.histplot(df[column], kde=True, color=colour, ax=ax)
    ax.set_title(heading)

# Keep titles and tick labels of neighbouring panels from overlapping.
fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv('Mall_customers.csv')

# Features used for clustering (named once so the summary table below uses the same list)
feature_columns = ['Age', 'Spending Score (1-100)', 'Annual Income (k$)']
X = df[feature_columns]

# Standardize the data so every feature contributes on the same scale to the distances
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow Method for Optimal K
k_values = range(1, 11)  # Candidate numbers of clusters (1 to 10)
inertia = []  # Inertia of the fitted model for each candidate K
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Plot the Elbow (Scree) Plot
plt.figure(figsize=(8, 6))
plt.plot(k_values, inertia, marker='o', color='b', linestyle='--')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia (Sum of Squared Distances)')
plt.xticks(k_values)
plt.grid(True)

# The number of clusters is chosen by hand from the elbow plot; it is not computed.
optimal_k = 3  # Based on the elbow method or visual inspection (assumed K=3 for this case)

plt.axvline(x=optimal_k, color='red', linestyle=':', label=f'Optimal K={optimal_k}')
plt.legend()
plt.show()

# Apply K-Means with the chosen number of clusters and store the label of every row
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df['Cluster'] = kmeans.fit_predict(X_scaled)

# Steps 1-3: one box plot per feature, grouped by cluster.
# Each entry is (column in `df`, figure title); the column name doubles as the y-axis label.
cluster_ids = range(optimal_k)
cluster_labels = [f'Cluster {i}' for i in cluster_ids]
box_plots = [
    ('Age', 'Box Plot of Age by Cluster'),
    ('Spending Score (1-100)', 'Box Plot of Spending Score by Cluster'),
    ('Annual Income (k$)', 'Box Plot of Income by Cluster'),
]
for column, title in box_plots:
    plt.figure(figsize=(8, 6))
    plt.boxplot([df.loc[df['Cluster'] == i, column] for i in cluster_ids])
    # Boxes are drawn at x = 1..K. Naming the ticks here instead of passing
    # `labels=` to boxplot avoids the deprecation warning raised by
    # Matplotlib >= 3.9 and still works on older releases.
    plt.xticks(range(1, optimal_k + 1), cluster_labels)
    plt.title(title)
    plt.xlabel('Cluster')
    plt.ylabel(column)
    plt.show()

# Step 4: Calculate and Display Mean Values for Each Cluster
cluster_means = df.groupby('Cluster')[feature_columns].mean()
print("\nMean Values for Each Cluster:")
print(cluster_means)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv('Mall_customers.csv')

# Features used for clustering (Age, Spending Score, and Annual Income);
# named once so the summary table below uses the same list.
feature_columns = ['Age', 'Spending Score (1-100)', 'Annual Income (k$)']
X = df[feature_columns]

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply a Gaussian mixture model with 3 components and label each row with its component
optimal_k = 3  # Number of clusters (set manually or from previous analysis)
gmm = GaussianMixture(n_components=optimal_k, random_state=42)
df['Cluster'] = gmm.fit_predict(X_scaled)

# Steps 1-3: one box plot per feature, grouped by cluster.
# Each entry is (column in `df`, figure title); the column name doubles as the y-axis label.
cluster_ids = range(optimal_k)
cluster_labels = [f'Cluster {i}' for i in cluster_ids]
box_plots = [
    ('Age', 'Box Plot of Age by Cluster'),
    ('Spending Score (1-100)', 'Box Plot of Spending Score by Cluster'),
    ('Annual Income (k$)', 'Box Plot of Income by Cluster'),
]
for column, title in box_plots:
    plt.figure(figsize=(8, 6))
    plt.boxplot([df.loc[df['Cluster'] == i, column] for i in cluster_ids])
    # Boxes are drawn at x = 1..K. Naming the ticks here instead of passing
    # `labels=` to boxplot avoids the deprecation warning raised by
    # Matplotlib >= 3.9 and still works on older releases.
    plt.xticks(range(1, optimal_k + 1), cluster_labels)
    plt.title(title)
    plt.xlabel('Cluster')
    plt.ylabel(column)
    plt.show()

# Step 4: Calculate and Display Mean Values for Each Cluster
cluster_means = df.groupby('Cluster')[feature_columns].mean()
print("\nMean Values for Each Cluster:")
print(cluster_means)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
import scipy.cluster.hierarchy as sch

# Load the dataset
df = pd.read_csv('Mall_customers.csv')

# Features used for clustering (Age, Spending Score, and Annual Income);
# named once so the summary table below uses the same list.
feature_columns = ['Age', 'Spending Score (1-100)', 'Annual Income (k$)']
X = df[feature_columns]

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply HAC with 3 clusters.
# The cluster count is named once instead of repeating the literal 3 in every plot,
# matching the K-Means and GMM cells.
optimal_k = 3
hac = AgglomerativeClustering(n_clusters=optimal_k)
df['Cluster'] = hac.fit_predict(X_scaled)

# Steps 1-3: one box plot per feature, grouped by cluster.
# Each entry is (column in `df`, figure title); the column name doubles as the y-axis label.
cluster_ids = range(optimal_k)
cluster_labels = [f'Cluster {i}' for i in cluster_ids]
box_plots = [
    ('Age', 'Box Plot of Age by Cluster'),
    ('Spending Score (1-100)', 'Box Plot of Spending Score by Cluster'),
    ('Annual Income (k$)', 'Box Plot of Income by Cluster'),
]
for column, title in box_plots:
    plt.figure(figsize=(8, 6))
    plt.boxplot([df.loc[df['Cluster'] == i, column] for i in cluster_ids])
    # Boxes are drawn at x = 1..K. Naming the ticks here instead of passing
    # `labels=` to boxplot avoids the deprecation warning raised by
    # Matplotlib >= 3.9 and still works on older releases.
    plt.xticks(range(1, optimal_k + 1), cluster_labels)
    plt.title(title)
    plt.xlabel('Cluster')
    plt.ylabel(column)
    plt.show()

# Step 4: Dendrogram for HAC
plt.figure(figsize=(12, 8))
Z = sch.linkage(X_scaled, method='ward')  # Create linkage matrix using Ward's method
sch.dendrogram(Z, no_labels=True)  # Remove labels on the x-axis to avoid clutter
plt.title('Dendrogram for HAC')
plt.xlabel('Customer Index')  # Simplified x-axis label
plt.ylabel('Distance')
plt.show()

# Step 5: Calculate and Display Mean Values for Each Cluster
cluster_means = df.groupby('Cluster')[feature_columns].mean()
print("\nMean Values for Each Cluster:")
print(cluster_means)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the housing table
df = pd.read_csv('Housing.csv')

# Reference value: the median over the whole 'median_house_value' column
overall_median = df['median_house_value'].median()

# Housing Median Price Index: each value expressed relative to the overall median,
# so 1.0 means "equal to the median"
df['housing_median_price_index'] = df['median_house_value'].div(overall_median)

# Histogram of the index, drawn on an explicitly created Axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['housing_median_price_index'], bins=30, edgecolor='black')
ax.set_title('Distribution of Housing Median Price Index')
ax.set_xlabel('Housing Median Price Index')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
# Step 1: Read the housing table
df = pd.read_csv('Housing.csv')

# Step 2: Replace the categorical 'ocean_proximity' column by one-hot indicator
# columns (the first category is dropped)
df = pd.get_dummies(df, columns=['ocean_proximity'], drop_first=True)

# Step 3: Keep only complete rows
df_clean = df.dropna()

# Step 4: Target is 'median_house_value'; every other column is a feature
y = df_clean['median_house_value']
X = df_clean.drop(columns=['median_house_value'])

# Step 5: Hold out 20% of the rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def _r2_and_rmse(predictions):
    """Return (R², RMSE) of `predictions` measured against the held-out `y_test`."""
    return (
        r2_score(y_test, predictions),
        np.sqrt(mean_squared_error(y_test, predictions)),  # RMSE instead of MSE
    )


# Model 1: Linear Regression vs Polynomial Linear Regression
# Plain linear regression on the raw features
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred_lin = lin_reg.predict(X_test)

# Linear regression on a degree-2 polynomial expansion of the features;
# the expansion is fitted on the training rows and then applied to the test rows
poly = PolynomialFeatures(degree=2)  # You can adjust the degree
X_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
poly_reg = LinearRegression()
poly_reg.fit(X_poly, y_train)
y_pred_poly = poly_reg.predict(X_test_poly)

r2_lin, rmse_lin = _r2_and_rmse(y_pred_lin)
r2_poly, rmse_poly = _r2_and_rmse(y_pred_poly)

# Model 2: Lasso Regression
lasso = Lasso(alpha=0.1)  # You can adjust the alpha value
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)
r2_lasso, rmse_lasso = _r2_and_rmse(y_pred_lasso)

# Model 3: Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
r2_rf, rmse_rf = _r2_and_rmse(y_pred_rf)

# Performance Comparison
print("Linear Regression:")
print(f"R²: {r2_lin:.4f}, RMSE: {rmse_lin:.4f}")

print("\nPolynomial Linear Regression:")
print(f"R²: {r2_poly:.4f}, RMSE: {rmse_poly:.4f}")

print("\nLasso Regression:")
print(f"R²: {r2_lasso:.4f}, RMSE: {rmse_lasso:.4f}")

print("\nRandom Forest:")
print(f"R²: {r2_rf:.4f}, RMSE: {rmse_rf:.4f}")

# Scatter Plot for all Models: actual vs predicted, one panel per model in a 2x2 grid
scatter_panels = [
    ("Linear Regression: Actual vs Predicted", y_pred_lin),
    ("Polynomial Linear Regression: Actual vs Predicted", y_pred_poly),
    ("Lasso Regression: Actual vs Predicted", y_pred_lasso),
    ("Random Forest: Actual vs Predicted", y_pred_rf),
]

# End points of the "perfect prediction" diagonal, computed once for all panels
diagonal = [y_test.min(), y_test.max()]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for ax, (panel_title, predictions) in zip(axes.flat, scatter_panels):
    ax.scatter(y_test, predictions, color='lightblue', edgecolor='black', alpha=0.5)
    ax.plot(diagonal, diagonal, color='red', linestyle='--')  # Line of perfect prediction
    ax.set_title(panel_title)
    ax.set_xlabel("Actual Median House Value")
    ax.set_ylabel("Predicted Median House Value")

# layout and show plot
fig.tight_layout()
plt.show()

# Feature importance reported by the fitted random forest, in the column order of X
feature_importances = rf.feature_importances_

# Visualize feature importance (Random Forest)
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(X.columns, feature_importances, color='skyblue')
ax.set_xlabel('Feature Importance')
ax.set_title('Feature Importance (Random Forest)')
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the liver disease dataset
df = pd.read_csv('Liver.csv', encoding='ISO-8859-1')  # Change the file path accordingly

# Encode the gender column as 1 (Male) / 0 (Female).
# Any other value is mapped to NaN and is therefore removed by the dropna() below.
df['Gender of the patient'] = df['Gender of the patient'].map({'Male': 1, 'Female': 0})

# Drop rows with missing values; the cleaned `df` is also used by the count-plot cell below
df = df.dropna()

# 1. Correlation Matrix
# Calculate the correlation matrix for numerical columns only.
# Selecting by dtype makes that explicit: pandas >= 2.0 raises if a non-numeric
# column is passed to corr(), and select_dtypes also works on older releases.
corr_matrix = df.select_dtypes(include='number').corr()

# Plot the correlation matrix using seaborn heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# 2. Count Plot
# Plot the count plot for the target variable 'Result' to check the class balance.
# Uses the cleaned `df` prepared by the correlation cell above.
plt.figure(figsize=(8, 6))
sns.countplot(x='Result', data=df)  # Removed palette to prevent the warning
plt.title('Distribution of Target Variable (Liver Disease)')
# 'Result' is plotted unshifted here. The modelling cell below subtracts 1 to turn
# classes 1/2 into 0/1, so the raw values on this axis are 1 and 2, not 0 and 1.
# NOTE(review): which class means "disease" is not visible in this notebook - confirm
# against the dataset description before labelling the classes more specifically.
plt.xlabel('Result (1 = Class 1, 2 = Class 2)')
plt.ylabel('Count')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, confusion_matrix, roc_curve, ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTE

# Load the dataset
df = pd.read_csv('Liver.csv', encoding='ISO-8859-1')

# Remove rows with missing values
df = df.dropna()

# Separate features and target
X = df.drop('Result', axis=1)  # 'Result' is the target column
y = df['Result']

# Convert target variable (class 2 → 1 and class 1 → 0 for binary classification).
# After this shift the "positive" class used by f1_score, predict_proba[:, 1] and the
# ROC curve is the original class 2.
y = y - 1

# Encode categorical features as integer codes (the encoder is refitted for every column)
categorical_columns = X.select_dtypes(include=['object']).columns
encoder = LabelEncoder()
for col in categorical_columns:
    X[col] = encoder.fit_transform(X[col])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to balance the training data only; the test set keeps its original class mix
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Initialize models with class balancing where applicable.
# The random forest and the SVC (whose probability calibration is randomised) are
# seeded so that the reported metrics are reproducible from run to run, like the
# split and SMOTE above.
# NOTE(review): KNN, SVC and logistic regression are trained on unscaled features, and
# class_weight='balanced' is combined with SMOTE-balanced data - confirm both are intended.
models = {
    'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42),
    'KNN': KNeighborsClassifier(),
    'SVC': SVC(probability=True, class_weight='balanced', random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced')
}

# Evaluate models: fit on the resampled training data, score on the untouched test set
for model_name, model in models.items():
    # Train all models on SMOTE-balanced data
    model.fit(X_train_smote, y_train_smote)

    # Predictions
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]  # Probability for class 1

    # Calculate metrics (Using roc_auc_score directly)
    auc_value = roc_auc_score(y_test, y_prob)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Print Model Performance Metrics
    print(f"\n=== {model_name} ===")
    print(f"ROC AUC Score: {auc_value:.4f}")
    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Plot ROC Curve; the grey diagonal is the chance-level reference
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    plt.figure(figsize=(6, 6))
    plt.plot(fpr, tpr, color='b', label=f'ROC AUC = {auc_value:.2f}')
    plt.plot([0, 1], [0, 1], color='grey', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve for {model_name}')
    plt.legend(loc='lower right')
    plt.show()

    # Plot Confusion Matrix; the display labels refer to the original class numbering
    # (before the shift to 0/1)
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Class 1', 'Class 2'])
    disp.plot(cmap='Blues')
    plt.title(f'Confusion Matrix for {model_name}')
    plt.show()

    # Plot Feature Importance (Only for Random Forest), sorted by importance
    if model_name == 'Random Forest':
        feature_importances = model.feature_importances_
        features = X.columns
        importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
        importance_df = importance_df.sort_values(by='Importance', ascending=False)
        plt.figure(figsize=(8, 6))
        plt.barh(importance_df['Feature'], importance_df['Importance'])
        plt.title('Feature Importance (Random Forest)')
        plt.xlabel('Importance')
        plt.ylabel('Feature')
        plt.show()
