In [None]:
# ASSIGNMENT-1:
# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# 2. Load Dataset
# A small hand-written height/weight sample keeps the demo self-contained;
# replace this dict with pd.read_csv(...) to run on a real CSV file.
data = {
    "Height": [150, 152, 155, 160, 165, 170, 172, 175, 180, 185],
    "Weight": [50, 52, 53, 55, 65, 70, 72, 75, 78, 85]
}
df = pd.DataFrame(data)

# 3. Explore Dataset
print("First five rows:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() prints its own summary and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
df.info()

# 4. Visualize Data
# Look at the raw observations before fitting anything.
fig, ax = plt.subplots()
ax.scatter(df["Height"], df["Weight"])
ax.set_xlabel("Height (cm)")
ax.set_ylabel("Weight (kg)")
ax.set_title("Height vs Weight Scatter Plot")
plt.show()

# 5. Split Data
# X stays two-dimensional (a one-column frame); y is the Series to predict.
X = df[["Height"]]
y = df["Weight"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 6. Train Model
# fit() returns the estimator itself, so construction and fitting chain.
model = LinearRegression().fit(X_train, y_train)

# 7. Predict
y_pred = model.predict(X_test)

# 8. Evaluate Model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R² Score:", r2)

# Fitted line parameters: weight = b0 + b1 * height
print("Intercept (b0):", model.intercept_)
print("Slope (b1):", model.coef_[0])

# 9. Plot Regression Line
# Every observation (train and test) with the fitted line drawn on top.
fig, ax = plt.subplots()
ax.scatter(X, y)
ax.plot(X, model.predict(X), color='red')
ax.set_xlabel("Height (cm)")
ax.set_ylabel("Weight (kg)")
ax.set_title("Linear Regression Line")
plt.show()


In [None]:
# ASSIGNMENT-2:

# 1. Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# 2. Load Dataset
# Boston Housing data, downloaded from OpenML through scikit-learn.
from sklearn.datasets import fetch_openml
boston = fetch_openml(name="boston", version=1, as_frame=True)

# Relabel the frame positionally: feature names first, then "PRICE" for the
# remaining column (assumes the target is the last column of boston.frame).
df = boston.frame
df.columns = [*boston.feature_names, "PRICE"]

# 3. Explore Dataset
print("Shape:", df.shape)
print(df.head())
print(df.describe())

# 4. (Optional) Correlation heatmap across all columns
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation matrix")
plt.show()

# 5. Choose features (>= 5) and target
features = ["RM", "LSTAT", "PTRATIO", "INDUS", "NOX", "AGE", "DIS"]
X = df[features]
y = df["PRICE"]

# 6. Split into train and test (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 7. Train Multiple Linear Regression model
model = LinearRegression().fit(X_train, y_train)

# 8. Predict on test set
y_pred = model.predict(X_test)

# 9. Report results: intercept, per-feature coefficients, MSE, R²
print("Intercept:", model.intercept_)
coeff_df = pd.DataFrame({"Feature": features, "Coefficient": model.coef_})
print(coeff_df)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

# 10. Plot Actual vs Predicted
# The red diagonal marks perfect predictions (predicted == actual).
lowest, highest = y_test.min(), y_test.max()
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, color='blue')
ax.plot([lowest, highest], [lowest, highest], color='red', lw=2)
ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.set_title("Actual vs Predicted House Price")
plt.show()


In [None]:
# ASSIGNMENT-3:
# 1. Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 2. Load Dataset (Iris)
from sklearn.datasets import load_iris
data = load_iris()

# Feature matrix as a labelled frame, with the integer class label appended.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(Species=data.target)

print(df.head())

# 3. Features and Target
X = df.drop(columns="Species")
y = df["Species"]

# 4. Train-Test Split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 5. Train Model 1: Logistic Regression
log_reg = LogisticRegression(max_iter=200).fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

# 6. Train Model 2: Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# 7. Compare Accuracy
acc_lr = accuracy_score(y_test, y_pred_lr)
acc_rf = accuracy_score(y_test, y_pred_rf)
print("Logistic Regression Accuracy:", acc_lr)
print("Random Forest Accuracy:", acc_rf)

# 8. Detailed Evaluation (per-class precision / recall / F1)
report_lr = classification_report(y_test, y_pred_lr)
report_rf = classification_report(y_test, y_pred_rf)

print("\nClassification Report (Logistic Regression):")
print(report_lr)

print("\nClassification Report (Random Forest):")
print(report_rf)


In [None]:
# ASSIGNMENT-4:
# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc

# 2. Load UCI Banknote Dataset
# Dataset page: https://archive.ics.uci.edu/ml/datasets/banknote+authentication
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt"

# The file has no header row, so the column names are supplied here.
df = pd.read_csv(
    url,
    header=None,
    names=["variance", "skewness", "curtosis", "entropy", "class"],
)

print(df.head())

# 3. Features and Target
X = df.drop(columns="class")
y = df["class"]

# 4. Train-Test Split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 5. Logistic Regression Model
# Column 1 of predict_proba is the score for the second class label; it is
# what the ROC curve below is built from.
log_reg = LogisticRegression().fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)
y_prob_lr = log_reg.predict_proba(X_test)[:, 1]

# 6. Decision Tree Model
tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)
y_prob_tree = tree.predict_proba(X_test)[:, 1]

# 7. Confusion Matrix
cm_lr = confusion_matrix(y_test, y_pred_lr)
cm_tree = confusion_matrix(y_test, y_pred_tree)

print("Confusion Matrix (Logistic Regression):\n", cm_lr)
print("\nConfusion Matrix (Decision Tree):\n", cm_tree)

# 8. ROC and AUC
# roc_curve also returns the thresholds, which are not needed here.
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_prob_lr)
fpr_tree, tpr_tree, _ = roc_curve(y_test, y_prob_tree)
auc_lr = auc(fpr_lr, tpr_lr)
auc_tree = auc(fpr_tree, tpr_tree)

print("\nAUC (Logistic Regression):", auc_lr)
print("AUC (Decision Tree):", auc_tree)

# 9. Plot ROC Curves
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr_lr, tpr_lr, label=f"Logistic Regression (AUC = {auc_lr:.2f})")
ax.plot(fpr_tree, tpr_tree, label=f"Decision Tree (AUC = {auc_tree:.2f})")
ax.plot([0, 1], [0, 1], "k--")  # diagonal reference line
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve Comparison")
ax.legend()
plt.show()

# 10. Classification Reports
report_lr = classification_report(y_test, y_pred_lr)
report_tree = classification_report(y_test, y_pred_tree)

print("\nLogistic Regression Report:")
print(report_lr)

print("\nDecision Tree Report:")
print(report_tree)


In [None]:
# ASSIGNMENT-5:

# 1. Import Libraries
import pandas as pd
import numpy as np

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from imblearn.over_sampling import SMOTE

# 2. Load Wine Dataset
wine = load_wine()
X = pd.DataFrame(wine.data, columns=wine.feature_names)
y = pd.Series(wine.target, name="Class")

class_counts_before = y.value_counts()
print("Class Distribution Before SMOTE:")
print(class_counts_before)

# 3. Train-Test Split (80/20)
# stratify=y keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 4. Feature Scaling
# The scaler learns its statistics from the training rows only and is then
# applied unchanged to the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 5. Handle Imbalance using SMOTE
# Only the (scaled) training data is resampled; the test set is left as-is.
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_scaled, y_train)

class_counts_after = y_train_res.value_counts()
print("\nClass Distribution After SMOTE:")
print(class_counts_after)

# 6. Model 1: Logistic Regression (Multiclass)
# The deprecated `multi_class='multinomial'` argument is omitted: with the
# default lbfgs solver a problem with more than two classes is already fitted
# as a multinomial model, and recent scikit-learn releases warn on the keyword.
log_reg = LogisticRegression(max_iter=300)
log_reg.fit(X_train_res, y_train_res)
y_pred_lr = log_reg.predict(X_test_scaled)

# 7. Model 2: Random Forest Classifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_res, y_train_res)
# The forest was trained on standardized features, so it must be scored on
# the standardized test set too. Predicting on raw X_test fed it values on a
# different scale than every split threshold it learned.
y_pred_rf = rf.predict(X_test_scaled)

# 8. Evaluation Metrics

print("\n================ LOGISTIC REGRESSION ================")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Classification Report:")
print(classification_report(y_test, y_pred_lr))

print("\n================ RANDOM FOREST ======================")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))

# Confusion matrices (rows = true class, columns = predicted class)
print("\nConfusion Matrix (Logistic Regression):")
print(confusion_matrix(y_test, y_pred_lr))

print("\nConfusion Matrix (Random Forest):")
print(confusion_matrix(y_test, y_pred_rf))


In [None]:
# ASSIGNMENT-6:


# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# 2. Load Dataset (Mall Customers)
# NOTE(review): confirm this URL still resolves and that the CSV really has
# the `Annual_Income` / `Spending_Score` columns selected below.
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/mall_customers.csv"
df = pd.read_csv(url)

print(df.head())

# 3. Select Features for Clustering
X = df[["Annual_Income", "Spending_Score"]]

# Standardize both columns so neither dominates the distance computation
# purely because of its units.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ----------------------------
# 4 & 5. Elbow + Silhouette sweep
# ----------------------------
# Each candidate k is clustered once and both diagnostics are read from that
# single fit. Fitting a second, identically-seeded KMeans per k (as a separate
# silhouette loop would) only repeats the same computation.
inertia_vals = []
silhouette_vals = []
K_range = range(2, 11)

for k in K_range:
    model = KMeans(n_clusters=k, random_state=42)
    labels = model.fit_predict(X_scaled)
    inertia_vals.append(model.inertia_)
    silhouette_vals.append(silhouette_score(X_scaled, labels))

# Plot Elbow Curve
plt.figure(figsize=(6,4))
plt.plot(K_range, inertia_vals, marker='o')
plt.title("Elbow Method - Optimal K")
plt.xlabel("Number of clusters")
plt.ylabel("Inertia")
plt.show()

# Plot Silhouette Scores
plt.figure(figsize=(6,4))
plt.plot(K_range, silhouette_vals, marker='o')
plt.title("Silhouette Score Method")
plt.xlabel("Number of clusters")
plt.ylabel("Silhouette Score")
plt.show()

# Final model: k is fixed by hand at 5
k_optimal = 5
kmeans = KMeans(n_clusters=k_optimal, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Attach each customer's cluster id to the original frame.
df["Cluster"] = clusters

# ----------------------------
# 6. Visualize Clusters
# ----------------------------
# Centroids live in standardized space; map them back to the original units
# so they line up with the unscaled axes.
centers = scaler.inverse_transform(kmeans.cluster_centers_)

fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(
    df["Annual_Income"],
    df["Spending_Score"],
    c=df["Cluster"],
    cmap="viridis",
    s=60,
)
ax.scatter(
    centers[:, 0],
    centers[:, 1],
    c='red',
    s=200,
    marker='X',
    label="Centroids",
)
ax.set_title("Customer Segmentation using K-Means")
ax.set_xlabel("Annual Income")
ax.set_ylabel("Spending Score")
ax.legend()
plt.show()


In [None]:
# ASSIGNMENT-7:


# 1. Import Libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# 2. Load High-Dimensional Dataset
digits = load_digits()
X, y = digits.data, digits.target  # X: pixel features, y: digit labels

print("Original Shape:", X.shape)

# 3. Standardize Data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 4. PCA: keep the first two components so the data can be drawn in 2-D
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

print("Shape After PCA:", X_pca.shape)

# 5. Apply K-Means
# One cluster per digit class is requested.
k = 10
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X_pca)

# 6. Evaluate Clustering Quality
inertia = kmeans.inertia_
sil_score = silhouette_score(X_pca, clusters)

print("Inertia:", inertia)
print("Silhouette Score:", sil_score)

# 7. Visualization of PCA Clusters
centroids = kmeans.cluster_centers_
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap="tab10", s=20)
ax.scatter(
    centroids[:, 0],
    centroids[:, 1],
    color='red',
    marker='X',
    s=200,
    label='Centroids',
)
ax.set_title("PCA + K-Means Clustering")
ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")
ax.legend()
plt.show()


In [None]:
# ASSIGNMENT-8:


# 1. Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import IsolationForest
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# 2. Load Dataset
url = "https://raw.githubusercontent.com/plotly/datasets/master/creditcard.csv"
df = pd.read_csv(url)

print(df.head())

# 3. Feature Scaling for 'Amount'
# Only this column is standardized here; the other columns are left untouched.
scaler = StandardScaler()
df['Amount'] = scaler.fit_transform(df[['Amount']])

# 'Time' is excluded from the model inputs.
df = df.drop(columns='Time')

# 4. Features
# 'Class' is held out as y and used only for evaluation, never for fitting.
X = df.drop(columns='Class')
y = df['Class']

# 5. Isolation Forest Model
# NOTE(review): contamination=0.01 assumes about 1% of rows are anomalous;
# confirm against the actual fraud share in `y`.
iso = IsolationForest(
    n_estimators=100,
    contamination=0.01,
    random_state=42,
)

# 6. Predictions
# The model is fitted and scored on the same rows. IsolationForest labels
# inliers as 1 and outliers as -1; remap to 0 = normal, 1 = anomaly so the
# predictions are comparable with `y`.
raw_labels = iso.fit(X).predict(X)
y_pred = np.where(raw_labels == -1, 1, 0)

# 7. Evaluation
cm = confusion_matrix(y, y_pred)
print("Confusion Matrix:\n", cm)

print("\nClassification Report:")
print(classification_report(y, y_pred))

# 8. Count of rows per predicted class
fig, ax = plt.subplots(figsize=(6, 5))
sns.countplot(x=y_pred, ax=ax)
ax.set_title("Detected Anomalies by Isolation Forest")
ax.set_xlabel("Predicted Class (0=Normal, 1=Fraud)")
ax.set_ylabel("Count")
plt.show()

# 9. 2D Visualization (standardized Amount against column V2)
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(df['V2'], df['Amount'], c=y_pred, cmap='coolwarm', s=5)
ax.set_title("Fraud Detection – Isolation Forest")
ax.set_xlabel("V2")
ax.set_ylabel("Amount")
plt.show()
