In [11]:
# Fundamental Libraries 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
# Scikit-learn Modules 
from sklearn import datasets  # To load built-in datasets 
from sklearn.model_selection import train_test_split  # To split data 
from sklearn.linear_model import LinearRegression, LogisticRegression  # Our models 
from sklearn.preprocessing import StandardScaler  # To scale features 
from sklearn import metrics  # To evaluate models 
# Magic command for inline plotting 
%matplotlib inline 
sns.set_style("whitegrid") # Set a nice plot style

# Load the Diabetes dataset 
diabetes = datasets.load_diabetes() 
print(diabetes.DESCR) # Always read the description first! 
# Create a DataFrame for easier analysis 
df_diabetes = pd.DataFrame(diabetes.data, columns=diabetes.feature_names) 
df_diabetes['target'] = diabetes.target # This is our label: disease progression 
# Check the basic structure of the data 
print("Dataset Shape:", df_diabetes.shape) 
df_diabetes.head()

# Get a quick statistical summary 
df_diabetes.describe()

# Let's choose one feature to explore: 'bmi' (body mass index) 
plt.figure(figsize=(8, 5)) 
sns.scatterplot(data=df_diabetes, x='bmi', y='target', alpha=0.6) 
plt.title('Diabetes Progression vs. BMI') 
plt.xlabel('BMI (standardized)') 
plt.ylabel('Disease Progression') 
plt.show() 

# 1. Define Features (X) and Label (y) 
# Let's start with just one feature: 'bmi' 
X = df_diabetes[['bmi']]  # Note: Double brackets are needed to keep X as a 2D structure 
y = df_diabetes['target'] 
# 2. Split the Data (80% train, 20% test) 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 
print(f"Training set size: {X_train.shape[0]}") 
print(f"Test set size: {X_test.shape[0]}") 
# 3. Create and Train the Linear Regression Model 
lin_model = LinearRegression() 
lin_model.fit(X_train, y_train) # This is where the learning happens! 
# 4. Let's see what the model learned 
print("Model Intercept (b₀):", lin_model.intercept_) 
print("Model Coefficient (b₁ for 'bmi'):", lin_model.coef_[0]) 

# 5. Use the trained model to make predictions on the test set 
y_pred = lin_model.predict(X_test) 
# 6. Evaluate the Model's Performance 
# Create a DataFrame to compare actual vs. predicted values for the first few test samples 
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}) 
results.head(10)

print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, y_pred)) 
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, y_pred)) 
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, y_pred))) 
print('R² Score:', metrics.r2_score(y_test, y_pred)) 

# Plot the test data and the regression line 
plt.figure(figsize=(8, 5)) 
plt.scatter(X_test, y_test, color='blue', alpha=0.6, label='Test Data') 
plt.plot(X_test, y_pred, color='red', linewidth=2, label='Regression Line') 
plt.title('Linear Regression: Actual vs. Predicted (Test Set)') 
plt.xlabel('BMI') 
plt.ylabel('Disease Progression') 
plt.legend() 
plt.show()

# Load the Wine dataset 
wine = datasets.load_wine() 
print(wine.DESCR) 
# Create a DataFrame 
df_wine = pd.DataFrame(wine.data, columns=wine.feature_names) 
df_wine['target'] = wine.target 
df_wine['class'] = df_wine['target'].map({0: wine.target_names[0], 1: wine.target_names[1], 2: 
wine.target_names[2]}) 
# Check the data 
print("Dataset Shape:", df_wine.shape) 
df_wine.head()

# Check the distribution of the target variable 
df_wine['class'].value_counts().plot(kind='bar') 
plt.title('Distribution of Wine Classes') 
plt.ylabel('Count') 
plt.show()

# 1. Define Features (X) and Label (y). Let's use all features. 
X = df_wine[wine.feature_names] 
y = df_wine['target'] 
# 2. Split the Data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, 
stratify=y) 
# 3. Standardize the Features 
scaler = StandardScaler() 
X_train_scaled = scaler.fit_transform(X_train) 
X_test_scaled = scaler.transform(X_test) # Important: Use the scaler from the training data 
# 4. Create and Train the Logistic Regression Model 
# We set multi_class='ovr' (One-vs-Rest) for multi-class classification 
log_model = LogisticRegression(random_state=42, multi_class='ovr', max_iter=1000) 
log_model.fit(X_train_scaled, y_train)

# 5. Make predictions on the scaled test set 
y_pred = log_model.predict(X_test_scaled) 
y_pred_proba = log_model.predict_proba(X_test_scaled) # Get the probabilities 
# 6. Evaluate the Classifier 
print("Test Accuracy:", metrics.accuracy_score(y_test, y_pred)) 
print("\nClassification Report:\n", metrics.classification_report(y_test, y_pred, 
target_names=wine.target_names))

# Create a visually appealing confusion matrix 
conf_matrix = metrics.confusion_matrix(y_test, y_pred) 
plt.figure(figsize=(8, 6)) 
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', 
xticklabels=wine.target_names, yticklabels=wine.target_names) 
plt.title('Confusion Matrix for Wine Classification') 
plt.ylabel('True Label') 
plt.xlabel('Predicted Label') 
plt.show() 

# Let's look at the first test sample in detail 
sample_index = 0 
print("Features for sample:", X_test.iloc[sample_index].values) 
print("Actual class:", wine.target_names[y_test.iloc[sample_index]]) 
print("Predicted class:", wine.target_names[y_pred[sample_index]]) 
print("Predicted probabilities for each class:") 
for i, class_name in enumerate(wine.target_names): 
 print(f"  {class_name}: {y_pred_proba[sample_index][i]:.4f}")

IndentationError: expected an indented block after 'for' statement on line 131 (101490916.py, line 132)