In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
import joblib  # For saving/loading model
import os      # For file existence check

In [3]:

# --- 1. Load dataset ---
try:
    df = pd.read_excel("new.xlsx")
except FileNotFoundError:
    print("Error: The file 'new.xlsx' was not found.")
    exit()

# Drop unnecessary columns
if "Unnamed: 0" in df.columns:
    df = df.drop(columns=["Unnamed: 0"])

# Copy original dataframe
df_original = df.copy()

In [None]:
# --- 2. Handle categorical variables ---
cat_cols = df.select_dtypes(include=['object']).columns
print("Categorical Columns:", list(cat_cols))

label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

In [None]:
# --- 3. Split Features & Target ---
# Assuming 'status' is the target column
X = df.drop(columns=["status"])
y = df["status"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Handle missing values
imputer = SimpleImputer(strategy="constant", fill_value=0)
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

In [None]:
# --- 4. Train Gradient Boosting Classifier ---
print("\n--- Model Training: Gradient Boosting Classifier ---")
model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.0092,
    subsample=0.84555,
    max_depth=10,
    random_state=42
)
model.fit(X_train, y_train)
print("✅ Gradient Boosting Classifier trained.")


In [None]:
# --- 5. Evaluate ---
y_pred = model.predict(X_test)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
# --- 6. Save the trained model and LabelEncoders ---
print("\n--- Saving the trained model and LabelEncoders ---")
joblib.dump(model, "mango_model.pkl")
joblib.dump(label_encoders, "mango_label_encoders.pkl")
print("✅ Model and LabelEncoders saved successfully as 'mango_model.pkl' and 'mango_label_encoders.pkl'.")

In [None]:
# --- 7. Load the saved model and LabelEncoders ---
print("\n--- Loading the saved model and LabelEncoders ---")
if os.path.exists("mango_model.pkl") and os.path.exists("mango_label_encoders.pkl"):
    loaded_model = joblib.load("mango_model.pkl")
    loaded_label_encoders = joblib.load("mango_label_encoders.pkl")
    print("✅ Model and LabelEncoders loaded successfully.")
else:
    print("Error: Saved model or encoder files not found.")
    exit()


In [None]:
# --- 8. Test with new user input ---
print("\n--- Testing with User Input ---")
user_input_data = {}

for col in X.columns:
    if col in cat_cols:
        le = loaded_label_encoders[col]
        options = le.classes_
        print(f"\nSelect a value for '{col}':")
        for i, option in enumerate(options):
            print(f"[{i}] {option}")
        while True:
            try:
                choice_index = int(input("Enter the number corresponding to your choice: "))
                if 0 <= choice_index < len(options):
                    value = options[choice_index]
                    break
                else:
                    print("Invalid choice. Please enter a valid number.")
            except ValueError:
                print("Invalid input. Please enter a number.")
    else:
        value = input(f"Enter value for '{col}': ")
    user_input_data[col] = [value]

# Convert input to DataFrame
user_df = pd.DataFrame(user_input_data)

# Encode categorical columns
for col in cat_cols:
    if col in user_df.columns:
        le = loaded_label_encoders[col]
        user_df[col] = le.transform(user_df[col])

# Ensure numeric format
user_df = user_df.apply(pd.to_numeric)

# Prediction
prediction = loaded_model.predict(user_df)

print("\nPrediction for your input:")
print(f"The predicted status is: {prediction[0]}")


# import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
import joblib  # For saving/loading model
import os      # For file existence check

# --- 1. Load dataset ---
try:
    df = pd.read_excel("new.xlsx")
except FileNotFoundError:
    print("Error: The file 'new.xlsx' was not found.")
    exit()

# Drop unnecessary columns
if "Unnamed: 0" in df.columns:
    df = df.drop(columns=["Unnamed: 0"])

# Copy original dataframe
df_original = df.copy()

# --- 2. Handle categorical variables ---
cat_cols = df.select_dtypes(include=['object']).columns
print("Categorical Columns:", list(cat_cols))

label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# --- 3. Split Features & Target ---
# Assuming 'status' is the target column
X = df.drop(columns=["status"])
y = df["status"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Handle missing values
imputer = SimpleImputer(strategy="constant", fill_value=0)
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# --- 4. Train Gradient Boosting Classifier ---
print("\n--- Model Training: Gradient Boosting Classifier ---")
model = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.0092,
    subsample=0.84555,
    max_depth=10,
    random_state=42
)
model.fit(X_train, y_train)
print("✅ Gradient Boosting Classifier trained.")

# --- 5. Evaluate ---
y_pred = model.predict(X_test)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# --- 6. Save the trained model and LabelEncoders ---
print("\n--- Saving the trained model and LabelEncoders ---")
joblib.dump(model, "mango_model.pkl")
joblib.dump(label_encoders, "mango_label_encoders.pkl")
print("✅ Model and LabelEncoders saved successfully as 'mango_model.pkl' and 'mango_label_encoders.pkl'.")

# --- 7. Load the saved model and LabelEncoders ---
print("\n--- Loading the saved model and LabelEncoders ---")
if os.path.exists("mango_model.pkl") and os.path.exists("mango_label_encoders.pkl"):
    loaded_model = joblib.load("mango_model.pkl")
    loaded_label_encoders = joblib.load("mango_label_encoders.pkl")
    print("✅ Model and LabelEncoders loaded successfully.")
else:
    print("Error: Saved model or encoder files not found.")
    exit()

# --- 8. Test with new user input ---
print("\n--- Testing with User Input ---")
user_input_data = {}

for col in X.columns:
    if col in cat_cols:
        le = loaded_label_encoders[col]
        options = le.classes_
        print(f"\nSelect a value for '{col}':")
        for i, option in enumerate(options):
            print(f"[{i}] {option}")
        while True:
            try:
                choice_index = int(input("Enter the number corresponding to your choice: "))
                if 0 <= choice_index < len(options):
                    value = options[choice_index]
                    break
                else:
                    print("Invalid choice. Please enter a valid number.")
            except ValueError:
                print("Invalid input. Please enter a number.")
    else:
        value = input(f"Enter value for '{col}': ")
    user_input_data[col] = [value]

# Convert input to DataFrame
user_df = pd.DataFrame(user_input_data)

# Encode categorical columns
for col in cat_cols:
    if col in user_df.columns:
        le = loaded_label_encoders[col]
        user_df[col] = le.transform(user_df[col])

# Ensure numeric format
user_df = user_df.apply(pd.to_numeric)

# Prediction
prediction = loaded_model.predict(user_df)

print("\nPrediction for your input:")
print(f"The predicted status is: {prediction[0]}")


In [None]:
# --- 7. Load the saved model and LabelEncoders ---
print("\n--- Loading the saved model and LabelEncoders ---")
if os.path.exists("mango_model.pkl") and os.path.exists("mango_label_encoders.pkl"):
    loaded_model = joblib.load("mango_model.pkl")
    loaded_label_encoders = joblib.load("mango_label_encoders.pkl")
    print("✅ Model and LabelEncoders loaded successfully.")
else:
    print("Error: Saved model or encoder files not found.")
    exit()

# --- 8. Test with new user input ---
print("\n--- Testing with User Input ---")
user_input_data = {}

for col in X.columns:
    if col in cat_cols:
        le = loaded_label_encoders[col]
        options = le.classes_
        print(f"\nSelect a value for '{col}':")
        for i, option in enumerate(options):
            print(f"[{i}] {option}")
        while True:
            try:
                choice_index = int(input("Enter the number corresponding to your choice: "))
                if 0 <= choice_index < len(options):
                    value = options[choice_index]
                    break
                else:
                    print("Invalid choice. Please enter a valid number.")
            except ValueError:
                print("Invalid input. Please enter a number.")
    else:
        value = input(f"Enter value for '{col}': ")
    user_input_data[col] = [value]

# Convert input to DataFrame
user_df = pd.DataFrame(user_input_data)

# Encode categorical columns
for col in cat_cols:
    if col in user_df.columns:
        le = loaded_label_encoders[col]
        user_df[col] = le.transform(user_df[col])

# Ensure numeric format
user_df = user_df.apply(pd.to_numeric)

# Prediction
prediction = loaded_model.predict(user_df)

print("\nPrediction for your input:")
print(f"The predicted status is: {prediction[0]}")


In [5]:
pd.set_option('display.max_columns', None)

df

Unnamed: 0,appointment_id,customer_id,booking_date,booking_time,booking_datetime,created_at,service_type,staff_assigned,duration_mins,status,reschedule_count,lead_time_minutes,holiday_flag,weather,channel,price,rating,name,mobile,email,gender,age,city,joined_date,tags,primary_business,primary_service
0,APPT0000001,CUST000001,2025-05-22,11:00,2025-05-22T11:00:00,2025-05-13T22:34:00,Facial,Christopher Todd,90,completed,0,12266,0,Sunny,phone,1046.02,,Madison Sanders,7061110698,hayesjacqueline@hotmail.com,Female,63,North Zachary,2023-03-10,price-sensitive,Salon,Pedicure
1,APPT0000002,CUST000001,2024-08-01,09:30,2024-08-01T09:30:00,2024-07-04T11:10:00,Facial,Christina Santos,60,completed,0,40220,0,Rainy,walk-in,3787.70,,Madison Sanders,7061110698,hayesjacqueline@hotmail.com,Female,63,North Zachary,2023-03-10,price-sensitive,Salon,Pedicure
2,APPT0000003,CUST000001,2024-07-31,17:45,2024-07-31T17:45:00,2024-07-31T01:02:00,Manicure,Christina Porter,45,completed,0,1003,0,Sunny,online,922.17,,Madison Sanders,7061110698,hayesjacqueline@hotmail.com,Female,63,North Zachary,2023-03-10,price-sensitive,Salon,Pedicure
3,APPT0000004,CUST000001,2025-08-09,08:45,2025-08-09T08:45:00,2025-07-21T06:16:00,Facial,Debbie Lewis,15,completed,0,27509,1,Cloudy,online,1989.06,3.0,Madison Sanders,7061110698,hayesjacqueline@hotmail.com,Female,63,North Zachary,2023-03-10,price-sensitive,Salon,Pedicure
4,APPT0000005,CUST000002,2025-05-25,12:15,2025-05-25T12:15:00,2025-05-03T20:12:00,Engine Repair,Daniel Hughes,15,completed,0,31203,0,Cloudy,phone,4522.44,,Robert Johnson,9794380244,courtney49@gomez.org,Male,22,South Toddstad,2021-08-15,new,Auto Repair,Bike Service
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,APPT0011996,CUST002099,2023-11-05,10:30,2023-11-05T10:30:00,2023-10-20T07:13:00,Legal,Patricia Hughes,60,completed,0,23237,0,Rainy,phone,4956.64,5.0,James Sawyer,7955367736,millerholly@alexander.com,Female,20,New Richard,2022-05-02,new,Consultant,Legal
11996,APPT0011997,CUST000816,2024-01-05,15:45,2024-01-05T15:45:00,2023-12-31T19:13:00,Tax,Natalie Johnson,15,cancelled,0,6992,0,Cloudy,phone,4701.08,,Sean Buchanan,9629155066,ashley43@smith-page.com,Male,71,North Erictown,2023-03-23,loyal,Consultant,Career
11997,APPT0011998,CUST001332,2025-01-28,08:30,2025-01-28T08:30:00,2025-01-01T14:57:00,Facial,Patricia Harvey,60,completed,0,38493,0,Rainy,online,2039.91,4.0,Stephanie Morris,7013057603,valdezjohnny@gmail.com,Male,75,East Jonathanhaven,2023-11-15,price-sensitive,Salon,Manicure
11998,APPT0011999,CUST001209,2023-09-11,11:00,2023-09-11T11:00:00,2023-09-07T19:24:00,AC Repair,Sheila Edwards,60,completed,0,5256,0,Rainy,phone,765.32,1.0,Bradley Black,8774010592,fredtaylor@mcpherson.com,Male,38,Stewarttown,2021-11-01,loyal,Appliance Repair,Microwave Repair


In [8]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns



# --- Training the Model ---

# 1. Separate features (X) and target variable (y)
# Drop the original 'status' column and other non-numerical columns not used as features
columns_to_drop = ['appointment_id', 'customer_id', 'booking_date', 'booking_time',
                   'booking_datetime', 'created_at', 'staff_assigned', 'name', 
                   'mobile', 'email', 'city', 'joined_date', 'primary_business', 
                   'primary_service', 'status', 'gender', 'rating']

# Ensure the columns exist before dropping them
columns_to_drop = [col for col in columns_to_drop if col in raw_data.columns]
X = raw_data.drop(columns_to_drop + ['status_enc'], axis=1)
y = raw_data['status_enc']

# Ensure all columns in X are numeric
X = X.select_dtypes(include=[np.number])

print("--- Original Class Distribution ---")
print(y.value_counts())

# 2. Handle class imbalance using SMOTE
from imblearn.over_sampling import SMOTE

# Check if there's more than one class for SMOTE to work
if len(y.unique()) > 1:
    sm = SMOTE(random_state=101)
    X_res, y_res = sm.fit_resample(X, y)
    print("\n--- Resampled Class Distribution ---")
    print(Counter(y_res))
else:
    print("\nWarning: Only one class found in the target variable. SMOTE cannot be applied.")
    X_res, y_res = X, y

# 3. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.25, random_state=101)

# 4. Train a machine learning model (Random Forest Classifier)
clf = RandomForestClassifier(n_estimators=200, random_state=101, n_jobs=-1)

print("\n--- Training the model ---")
clf.fit(X_train, y_train)
print("Model training complete!")

# 5. Evaluate the model's performance on the test set
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy Score: {accuracy:.4f}")

print("\n--- Confusion Matrix ---")
print(confusion_matrix(y_test, y_pred))

print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred))

NameError: name 'raw_data' is not defined

In [7]:
pip install imblearn

Collecting imblearnNote: you may need to restart the kernel to use updated packages.

  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn, imblearn

   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   ---------------------------------------- 0/2 [imbalanced-learn]
   -------------------- ------------------- 1/2 [imblearn]
   ---------------------------------------- 2/2 [imblearn]

Successfully installed imbalanced-learn-0.12.4 imblearn-0.0
