<a href="https://colab.research.google.com/github/S-Devisri01/Python-colab/blob/main/Python_gen_ai_day_6%2C7_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
import statsmodels.api as sm

# Load dataset: three predictor columns and the salary target
df = pd.read_csv("/content/employee_salary.csv")
X = df[["Experience", "EducationLevel", "Age"]]
y = df["Salary"]

# Cross-Validation with Ridge: 5-fold R² for a fixed alpha=1.0 baseline
ridge = Ridge(alpha=1.0)
scores = cross_val_score(
    ridge,
    X,
    y,
    cv=5,
    scoring="r2"
)
print("Cross-Validation R² scores:", scores)
print("Average CV Score:", np.mean(scores))

# Grid Search for best alpha: same 5-fold R² protocol over a log-spaced grid
params = {
    "alpha": [0.01, 0.1, 1, 10, 100]
}
grid = GridSearchCV(
    Ridge(),
    params,
    cv=5,
    scoring="r2"
)
grid.fit(X, y)
print("Best Alpha:", grid.best_params_)
print("Best CV Score:", grid.best_score_)

# Adjusted R² (manual)
# GridSearchCV refits the winning configuration on all of X, y by default
# (refit=True), so reuse that fitted estimator rather than training an
# identical Ridge a second time.
best_ridge = grid.best_estimator_
y_pred = best_ridge.predict(X)
# In-sample R²: predictions are made on the same rows the model was fit on
r2 = r2_score(y, y_pred)
# n = number of rows, k = number of predictors
n, k = X.shape
# Penalise R² for the number of predictors
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)
print("R²:", r2)
print("Adjusted R²:", adj_r2)

# AIC & BIC using statsmodels
# These come from an unpenalised OLS fit on the same predictors, not from
# the Ridge model above.
X_const = sm.add_constant(X)   # add intercept
ols_model = sm.OLS(y, X_const).fit()
print("AIC:", ols_model.aic)
print("BIC:", ols_model.bic)


Cross-Validation R² scores: [0.89006932 0.89772997 0.90819285 0.87589803 0.88310694]
Average CV Score: 0.8909994215752061
Best Alpha: {'alpha': 1}
Best CV Score: 0.8909994215752061
R²: 0.8925838993553217
Adjusted R²: 0.8922603568835004
AIC: 19848.681846282172
BIC: 19868.3128673981


In [3]:
# Model Diagnosis - Accuracy

from sklearn.metrics import accuracy_score

# (actual, predicted) outcome for each of the five cases, paired by position
outcome_pairs = [
    ("Pass", "Pass"),
    ("Fail", "Fail"),
    ("Pass", "Fail"),
    ("Pass", "Pass"),
    ("Fail", "Pass"),
]

# Unzip the pairs back into the two parallel label lists
actual, predicted = (list(column) for column in zip(*outcome_pairs))

# Fraction of positions where the prediction matches the actual label
accuracy = accuracy_score(y_true=actual, y_pred=predicted)

print("Accuracy:", accuracy)

Accuracy: 0.6


In [4]:
# Logistic Regression Example
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Toy purchase data, one entry per customer (Buy: 0 = No, 1 = Yes)
data = {
    "Age": [22, 25, 47, 52, 46],
    "Salary": [20000, 30000, 50000, 60000, 80000],
    "Buy": [0, 0, 1, 1, 1]
}
df = pd.DataFrame(data)

# Everything except the label column is a feature (Age, Salary)
X = df.drop(columns="Buy")
y = df.loc[:, "Buy"]

# Hold out 30% of the rows; the fixed seed keeps the split repeatable
split_options = {"test_size": 0.3, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Fit a default logistic regression on the training rows
model = LogisticRegression().fit(X_train, y_train)

# Score the held-out rows: per-class probabilities and hard 0/1 labels
probs = model.predict_proba(X_test)
predictions = model.predict(X_test)

print("Probabilities:\n", probs)
print("Predictions:\n", predictions)


Probabilities:
 [[0.99669013 0.00330987]
 [0.         1.        ]]
Predictions:
 [0 1]


In [5]:
# Logistic Regression - Student Pass Prediction
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Study-habit records with a binary outcome (Pass: 0 = Fail, 1 = Pass)
data = {
    "StudyHours": [2, 4, 6, 8, 1, 5, 7, 3],
    "Attendance": [60, 65, 75, 90, 50, 70, 85, 55],
    "Pass": [0, 0, 1, 1, 0, 1, 1, 0]
}
df = pd.DataFrame(data)

# Separate the label from the two predictor columns
X = df.drop(columns="Pass")
y = df.loc[:, "Pass"]

# Reserve a quarter of the rows for testing, with a fixed seed
split_options = {"test_size": 0.25, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Fit the classifier on the training portion
model = LogisticRegression().fit(X_train, y_train)

# Hard labels and class probabilities for the held-out rows
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)
print("Predicted Classes:", y_pred)
print("Predicted Probabilities:\n", y_prob)

# Compare predictions with the true held-out labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", cm)


Predicted Classes: [0 1]
Predicted Probabilities:
 [[0.76522135 0.23477865]
 [0.2353578  0.7646422 ]]
Accuracy: 1.0
Confusion Matrix:
 [[1 0]
 [0 1]]


In [9]:
# Logistic Regression - Loan Approval Prediction (Fixed)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Loan applications with the approval decision (Approved: 0 = No, 1 = Yes)
data = {
    "Income": [30000, 50000, 40000, 60000, 25000, 70000, 45000, 80000],
    "CreditScore": [650, 720, 680, 750, 600, 780, 700, 820],
    "LoanAmount": [100000, 150000, 120000, 200000, 90000, 250000, 140000, 300000],
    "Approved": [0, 1, 0, 1, 0, 1, 1, 1]
}
df = pd.DataFrame(data)

# Predictor columns and the label column
feature_cols = ["Income", "CreditScore", "LoanAmount"]
X = df[feature_cols]
y = df["Approved"]

# Stratify on the label so both classes are represented in each split
split_options = {"test_size": 0.25, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Raise the iteration cap above the library default before fitting
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard labels and class probabilities for the held-out rows
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)
print("Predicted Classes:", y_pred)
print("Predicted Probabilities:\n", y_prob)

# Pin the label order so the matrix is always 2x2, even if a class is
# absent from the test rows
class_labels = [0, 1]
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=class_labels)
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", cm)


Predicted Classes: [1 0]
Predicted Probabilities:
 [[0.         1.        ]
 [0.98998051 0.01001949]]
Accuracy: 1.0
Confusion Matrix:
 [[1 0]
 [0 1]]


In [10]:
# Confusion Matrix & Metrics
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

# 1. Load dataset
# The CSV must be uploaded to the Colab session first; fail fast with an
# actionable message instead of a bare traceback from the CSV reader.
from pathlib import Path

dataset_path = Path("/content/student_marks_dataset.csv")
if not dataset_path.is_file():
    raise FileNotFoundError(
        f"Dataset not found at '{dataset_path}'. Upload "
        "student_marks_dataset.csv to the Colab session storage before "
        "running this cell."
    )
df = pd.read_csv(dataset_path)
print(df.head())

# Check the columns this cell relies on before indexing into the frame
required_columns = {"StudyHours", "Attendance", "Result"}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(
        f"Dataset is missing expected column(s): {sorted(missing_columns)}"
    )

# 2. Features & Target
X = df[["StudyHours", "Attendance"]]

# Convert Pass = 1, Fail = 0 (any value other than the exact string "Pass"
# is encoded as 0)
y = (df["Result"] == "Pass").astype(int)

# 3. Train-test split (70% / 30%), stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

# 4. Train Logistic Regression
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# 5. Predict probabilities
probs = model.predict_proba(X_test)
print("Probabilities:\n", probs)

# 6. Predict classes
y_pred = model.predict(X_test)

# 7. Confusion Matrix & Metrics
# labels=[0, 1] keeps the matrix 2x2 even if a class is absent from the test
# rows. zero_division=0 makes precision/recall/F1 return 0.0 silently when
# the metric is undefined (e.g. no positive predictions) instead of warning.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

print("Confusion Matrix:\n", cm)
print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1 Score:", f1)


FileNotFoundError: [Errno 2] No such file or directory: '/content/student_marks_dataset.csv'