# Wine Cultivar Origin Prediction System
## Model Development

This notebook builds a multiclass classification model to predict wine cultivar origin based on chemical properties.

### Step 1: Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
import joblib
import os

### Step 2: Load the Wine Dataset

In [None]:
# Pull the Wine dataset that ships with scikit-learn
wine_data = load_wine()

# Show a short preview of the description (first 500 characters only),
# followed by the feature and class labels
description_preview = wine_data.DESCR[:500]
print("Dataset Description:")
print(description_preview)
print("\nFeature names:", wine_data.feature_names)
print("Target names:", wine_data.target_names)

### Step 3: Convert to Pandas DataFrame

In [None]:
# Build the feature table and attach the class label as a 'cultivar' column
df = pd.DataFrame(
    wine_data.data,
    columns=wine_data.feature_names,
).assign(cultivar=wine_data.target)

# Report the table dimensions, then render the leading rows as the cell output
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
df.head(5)

### Step 4: Check and Handle Missing Values

In [None]:
# Count null entries in every column
missing_per_column = df.isna().sum()
print("Missing values per column:")
print(missing_per_column)

# Inspect the dtype pandas inferred for each column
column_dtypes = df.dtypes
print("\nData types:")
print(column_dtypes)

# Summary statistics are rendered as the cell's rich output
print("\nBasic statistics:")
df.describe()

### Step 5: Select ONLY the Six Specified Features

In [None]:
# The six chemical measurements used as model inputs (order matters downstream,
# because the scaler and model are fit on the columns in this order)
selected_features = [
    'alcohol', 'malic_acid', 'ash',
    'alcalinity_of_ash', 'magnesium', 'flavanoids',
]

# Feature matrix (X) and class labels (y)
y = df['cultivar']
X = df[selected_features]

print("Selected features:", selected_features)
print("\nFeature matrix shape:", X.shape)
print("Target variable shape:", y.shape)

# Per-class sample counts, ordered by class label
class_counts = y.value_counts().sort_index()
print("\nTarget distribution:")
print(class_counts)

### Step 6: Split Dataset into Training and Test Sets (80/20)

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Training set size:", len(X_train))
print("Test set size:", len(X_test))

# Class counts per partition, ordered by class label
train_class_counts = y_train.value_counts().sort_index()
test_class_counts = y_test.value_counts().sort_index()
print("\nTraining set target distribution:")
print(train_class_counts)
print("\nTest set target distribution:")
print(test_class_counts)

### Step 7: Apply Feature Scaling using StandardScaler

In [None]:
# Learn the scaling parameters from the training split only, so that no
# information from the test split leaks into preprocessing
scaler = StandardScaler().fit(X_train)

# Apply the training-derived scaling to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling completed.")
print("\nScaled training data shape:", X_train_scaled.shape)
print("Scaled test data shape:", X_test_scaled.shape)

# Sanity check: per-feature mean and standard deviation of the scaled training data
print("\nMean of scaled training features (should be ~0):")
print(X_train_scaled.mean(axis=0))
print("\nStd of scaled training features (should be ~1):")
print(X_train_scaled.std(axis=0))

### Step 8: Train Logistic Regression Model

In [None]:
# Logistic Regression classifier: a generous iteration cap so the solver can
# converge, and a fixed seed for repeatability
model = LogisticRegression(
    random_state=42,
    max_iter=1000,
)

# Fit on the scaled training features
print("Training Logistic Regression model...")
model.fit(X_train_scaled, y_train)
print("Model training completed!")

# Inspect the learned parameters
coef_shape = model.coef_.shape
print("\nModel coefficients shape:", coef_shape)
print("Model intercept:", model.intercept_)

### Step 9: Make Predictions

In [None]:
# Predict class labels for the training split and the held-out test split
y_train_pred, y_test_pred = (
    model.predict(features) for features in (X_train_scaled, X_test_scaled)
)

print("Predictions completed.")

# Print the first ten predictions and the matching true labels for a quick eyeball comparison
print("\nFirst 10 test predictions:", y_test_pred[:10])
print("First 10 actual test values:", y_test.to_numpy()[:10])

### Step 10: Evaluate Model Performance

In [None]:
# Accuracy on the data the model was fit on versus the held-out split
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

banner = "=" * 60
print(banner)
print("MODEL PERFORMANCE EVALUATION")
print(banner)
print(f"\nTraining Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")

# Test-set precision, recall, and F1; the macro average gives every class equal weight
macro = {'average': 'macro'}
precision = precision_score(y_test, y_test_pred, **macro)
recall = recall_score(y_test, y_test_pred, **macro)
f1 = f1_score(y_test, y_test_pred, **macro)

print(f"\nPrecision (macro): {precision:.4f}")
print(f"Recall (macro): {recall:.4f}")
print(f"F1-Score (macro): {f1:.4f}")

# Per-class breakdown with readable class labels
print("\n" + banner)
print("DETAILED CLASSIFICATION REPORT")
print(banner)
target_names = ['Cultivar 0', 'Cultivar 1', 'Cultivar 2']
report = classification_report(y_test, y_test_pred, target_names=target_names)
print(report)

### Step 11: Save the Trained Model and Scaler

In [None]:
# Output paths for the two artifacts (relative to the current working directory)
model_filename = 'wine_cultivar_model.pkl'
scaler_filename = 'scaler.pkl'

# Persist the fitted classifier
joblib.dump(model, model_filename)
print(f"Model saved as: {model_filename}")

# Persist the fitted scaler so inference can apply the same preprocessing
joblib.dump(scaler, scaler_filename)
print(f"Scaler saved as: {scaler_filename}")

# Confirm both files exist on disk and report their sizes
artifacts_present = all(
    os.path.exists(path) for path in (model_filename, scaler_filename)
)
if not artifacts_present:
    print("\n✗ Error: Files were not saved properly.")
else:
    print("\n✓ Both model and scaler files saved successfully!")
    print(f"  - Model file size: {os.path.getsize(model_filename)} bytes")
    print(f"  - Scaler file size: {os.path.getsize(scaler_filename)} bytes")

### Step 12: Test Model Loading (Verification)

In [None]:
# Reload both artifacts from disk, the same way a downstream consumer would.
# NOTE: joblib.load unpickles arbitrary Python objects; only load files from a trusted source.
loaded_model = joblib.load(model_filename)
loaded_scaler = joblib.load(scaler_filename)

print("Model and scaler loaded successfully!")

# Round-trip check: the reloaded scaler + model must reproduce the predictions
# made by the in-memory model (y_test_pred, from Step 9) on the entire test set.
# Without this, the success message below would print even if the saved
# artifacts were stale or mismatched.
reloaded_test_pred = loaded_model.predict(loaded_scaler.transform(X_test))
assert np.array_equal(reloaded_test_pred, y_test_pred), (
    "Reloaded model/scaler do not reproduce the in-memory test predictions"
)

# Show one concrete prediction using the first test sample
# (iloc[0:1] keeps a one-row DataFrame, which is the 2-D input the scaler expects)
sample_data = X_test.iloc[0:1]
sample_scaled = loaded_scaler.transform(sample_data)
sample_prediction = loaded_model.predict(sample_scaled)

print("\nVerification Test:")
print(f"Sample input: {sample_data.values[0]}")
print(f"Predicted cultivar: {sample_prediction[0]}")
print(f"Actual cultivar: {y_test.iloc[0]}")
print("\n✓ Model is ready for deployment!")

---
## Summary

This notebook successfully:
1. Loaded the Wine dataset from sklearn
2. Converted it to a Pandas DataFrame
3. Checked for missing values (none found)
4. Selected the 6 specified features
5. Split data into 80/20 train/test sets
6. Applied StandardScaler for feature scaling
7. Trained a Logistic Regression model
8. Evaluated performance with accuracy, precision, recall, and F1-score
9. Saved both the model and scaler for deployment

The model is now ready to be used in the Flask web application!