# <U>**Mini Project 6**</U>

## Model Evaluation, Optimization & Deployment

### Objective:
- Improve the trained model’s performance through hyperparameter tuning, compare multiple algorithms, and prepare it for deployment.

### Step 1 — Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib  # For saving the model

### Step 2 — Load Dataset

In [5]:
# Load the dataset from a CSV file; the path is resolved relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="cleaned_data_wrangling_project.CSV")


In [6]:
# Preview the first rows to sanity-check column names and values.
print("First 5 rows of the dataset:")
print(df.head())

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("\nDataset Info:")
df.info()

First 5 rows of the dataset:
  customer_name  order_date        sales  quantity_ordered returned?  \
0         Alice  2023-01-15  1000.000000               2.0       Yes   
1           Bob         NaN   850.000000               3.0        No   
2         alice         NaN   592.857143               1.0       Yes   
3       Charlie         NaN   550.000000               1.0       Yes   
4         David         NaN   900.000000               4.0        No   

   high_value_order  
0              True  
1              True  
2              True  
3              True  
4              True  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customer_name     10 non-null     object 
 1   order_date        1 non-null      object 
 2   sales             10 non-null     float64
 3   quantity_ordered  10 non-null     float64
 4   ret

In [7]:
# Report how many NaN entries each column contains
# (isna() is the canonical spelling of isnull()).
print("\nMissing values in dataset:")
missing_per_column = df.isna().sum()
print(missing_per_column)


Missing values in dataset:
customer_name       0
order_date          9
sales               0
quantity_ordered    0
returned?           0
high_value_order    0
dtype: int64


In [18]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the raw frame `df` stays untouched for later cells.
df_encoded = df.copy()

# Object-dtype columns are treated as categorical.
categorical_cols = df_encoded.select_dtypes(include=['object']).columns

# Fit one encoder PER column and keep each of them. Re-fitting a single shared
# LabelEncoder inside the loop would overwrite its learned classes on every
# iteration, leaving only the mapping of whichever column happened to come last.
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    df_encoded[col] = encoders[col].fit_transform(df_encoded[col])

# `le` is persisted later and used to encode 'returned?' at prediction time, so
# bind it explicitly to that column's encoder instead of relying on column
# order. A KeyError here means 'returned?' was not detected as categorical.
le = encoders['returned?']

df_encoded.head()


Unnamed: 0,customer_name,order_date,sales,quantity_ordered,returned?,high_value_order
0,0,0,1000.0,2.0,1,True
1,1,1,850.0,3.0,0,True
2,9,1,592.857143,1.0,1,True
3,2,1,550.0,1.0,1,True
4,3,1,900.0,4.0,0,True


### Step 3 — Train/Test Split

In [23]:
# Target column the classifier learns to predict.
y = df_encoded['high_value_order']

# Every remaining column is used as an input feature.
X = df_encoded.drop(columns=['high_value_order'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [24]:
# Show (rows, columns) for each side of the split.
for caption, split in (
    ("Training data shape:", X_train),
    ("Testing data shape:", X_test),
):
    print(caption, split.shape)

Training data shape: (8, 5)
Testing data shape: (2, 5)


### Step 4 — Model Training

In [25]:
# Baseline classifier using scikit-learn's default hyperparameters.
model = LogisticRegression()
# fit() is left as the cell's last expression so the fitted estimator is displayed.
model.fit(X=X_train, y=y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


### Step 5 — Model Evaluation

In [27]:
# Score the held-out rows with the fitted model.
y_pred = model.predict(X_test)

# Overall fraction of test rows predicted correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1 summary.
print("\nClassification Report:")
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

        True       1.00      1.00      1.00         2

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



### Step 6 — Predict on New Data & Interpret Results

In [30]:
# Features used for this model; customer_name and order_date are left out.
features = ['sales', 'quantity_ordered', 'returned?']

# Single definition of the 'returned?' encoding, shared by the training data
# and the new data below so the two cannot drift apart.
returned_map = {'No': 0, 'Yes': 1}

# Build the training matrix from the raw frame and encode 'returned?'.
X = df[features].copy()
X['returned?'] = X['returned?'].map(returned_map)

# Series.map() silently produces NaN for any label missing from the mapping;
# fail here with a clear message instead of deep inside the estimator.
if X['returned?'].isna().any():
    raise ValueError("Unexpected value in 'returned?'; expected 'No' or 'Yes'.")

y = df['high_value_order']

# Train-test split (20% held out, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model (LogisticRegression is already imported in the first cell)
model = LogisticRegression()
model.fit(X_train, y_train)

# Now for new data — keep the same feature columns, order and encoding
sample_data = pd.DataFrame({
    'sales': [200.0, 80.0],
    'quantity_ordered': [5, 2],
    'returned?': ['No', 'Yes']
})

sample_data['returned?'] = sample_data['returned?'].map(returned_map)

predictions = model.predict(sample_data)
pred_probs = model.predict_proba(sample_data)

print("Predictions:", predictions)
print("Prediction Probabilities:\n", pred_probs)


Predictions: [False False]
Prediction Probabilities:
 [[1.00000000e+00 5.41221224e-18]
 [1.00000000e+00 6.74467900e-25]]


In [31]:
# Two illustrative orders to score with the model trained in the previous cell.
new_data = pd.DataFrame({
    'sales': [150.0, 50.0],
    'quantity_ordered': [4, 1],
    'returned?': ['No', 'Yes']
})

# Apply the same No/Yes -> 0/1 encoding that was used when fitting the model.
new_data['returned?'] = new_data['returned?'].map({'No': 0, 'Yes': 1})

# Hard class labels and the per-class probabilities behind them.
predictions = model.predict(new_data)
pred_probs = model.predict_proba(new_data)

# Report one line per order, numbered from 1; the second predict_proba column
# is shown as the probability (presumably the True / high-value class).
for order_no, (label, probs) in enumerate(zip(predictions, pred_probs), start=1):
    print(f"Order {order_no}: High Value = {label}, Probability = {probs[1]:.2f}")


Order 1: High Value = False, Probability = 0.00
Order 2: High Value = False, Probability = 0.00


### Step 7 — Save the Trained Model

In [33]:
# Persist the fitted classifier to disk.
joblib.dump(model, 'high_value_order_model.pkl')

# Save the encoder if needed
joblib.dump(le, 'label_encoder.pkl')

print("Model and encoder saved successfully.")

# ---------------------------
# Later, load and reuse
# ---------------------------
# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load artifacts that you created yourself or otherwise trust.
loaded_model = joblib.load('high_value_order_model.pkl')
loaded_encoder = joblib.load('label_encoder.pkl')

# The model was trained with 'returned?' encoded as No -> 0, Yes -> 1.
# Confirm the reloaded encoder reproduces exactly that mapping before using it;
# otherwise the model would silently be fed mis-encoded inputs.
expected_classes = ['No', 'Yes']
actual_classes = list(getattr(loaded_encoder, 'classes_', []))
if actual_classes != expected_classes:
    raise ValueError(
        f"label_encoder.pkl encodes {actual_classes}, "
        f"but the model expects {expected_classes} for 'returned?'."
    )

# Example prediction with loaded model
new_data = pd.DataFrame({
    'sales': [200.0],
    'quantity_ordered': [5],
    'returned?': ['No']
})

# Encode 'returned?'
new_data['returned?'] = loaded_encoder.transform(new_data['returned?'])

prediction = loaded_model.predict(new_data)
print("Prediction from loaded model:", prediction[0])

Model and encoder saved successfully.
Prediction from loaded model: False
