In [None]:
# Generate Dataset
import pandas as pd
import random

# Function to simulate the service cost based on service type, vehicle type, parts replaced, and labor hours
def calculate_service_cost(vehicle_type, service_type, parts_replaced, labor_hours):
    """Estimate the total cost of a single service visit.

    The total is a base fee, looked up by service type and vehicle type,
    plus a flat charge per replaced part and an hourly labor charge.

    Args:
        vehicle_type: "Car" selects the car rate; any other value is
            charged at the bike rate.
        service_type: "Regular Maintenance", "Engine Repair" or
            "Tyre Replacement". Any other value gets a base fee of 0.
        parts_replaced: Number of parts replaced (charged 500 each).
        labor_hours: Hours of labor (charged 300 per hour).

    Returns:
        Base fee + parts charge + labor charge.
    """
    # (car rate, bike rate) for every recognised service type.
    base_fees = {
        "Regular Maintenance": (1000, 800),
        "Engine Repair": (3000, 1500),
        "Tyre Replacement": (10000, 3000),
    }

    # Unrecognised service types fall back to a zero base fee.
    car_fee, bike_fee = base_fees.get(service_type, (0, 0))
    base_fee = car_fee if vehicle_type == "Car" else bike_fee

    return base_fee + parts_replaced * 500 + labor_hours * 300

# Generate synthetic dataset
# Fixed seed so that re-running this cell regenerates exactly the same CSV.
random.seed(42)

data = []
vehicle_types = ['Car', 'Bike']
# These spellings must match the strings calculate_service_cost() compares
# against: an unrecognised service type is silently priced with a base cost of 0.
service_types = ['Regular Maintenance', 'Engine Repair', 'Tyre Replacement']

for _ in range(30000):  # Generating 30,000 records for a large dataset
    vehicle_type = random.choice(vehicle_types)
    vehicle_age = random.randint(1, 10)
    # Odometer is drawn between 5,000 and 10,000 per year of age, so it correlates with age
    odometer_reading = random.randint(vehicle_age * 5000, vehicle_age * 10000)
    service_type = random.choice(service_types)
    parts_replaced = random.randint(0, 5)  # Number of parts replaced
    labor_hours = random.uniform(1, 4)  # Labor hours can vary (fractional)
    service_cost = calculate_service_cost(vehicle_type, service_type, parts_replaced, labor_hours)

    data.append([vehicle_type, vehicle_age, odometer_reading, service_type, parts_replaced, labor_hours, service_cost])

# Create a DataFrame (column order matches the order of values appended above)
df_synthetic = pd.DataFrame(data, columns=['vehicle_type', 'vehicle_age', 'odometer_reading', 'service_type', 'parts_replaced', 'labor_hours', 'service_cost'])

# Save to CSV file
df_synthetic.to_csv('synthetic_service_data_large.csv', index=False)

# Display the first few rows of the synthetic data
print(df_synthetic.head())


In [None]:
# Linear Regression  --> Not a good model
import joblib
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load dataset
# NOTE(review): the dataset-generation cell writes 'synthetic_service_data_large.csv';
# confirm that 'service_data.csv' is the intended training file.
df = pd.read_csv('service_data.csv')

# Preprocessing: encode the two categorical columns as integer codes.
# Each column gets its own encoder so it can be saved and reused at prediction time.
label_encoder_vehicle_type = LabelEncoder()
label_encoder_service_type = LabelEncoder()

# Encode categorical features (overwrites the string columns in df)
df['vehicle_type'] = label_encoder_vehicle_type.fit_transform(df['vehicle_type'])
df['service_type'] = label_encoder_service_type.fit_transform(df['service_type'])

# Features (X) and target (y)
X = df[['vehicle_type', 'vehicle_age', 'odometer_reading', 'service_type', 'parts_replaced', 'labor_hours']]
y = df['service_cost']

# Train/test split: 20% held out, fixed random_state for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Model Evaluation (r2_score is imported from sklearn.metrics alongside mean_squared_error)
mse_lr = mean_squared_error(y_test, y_pred)
r2_lr = r2_score(y_test, y_pred)
rmse_lr = mse_lr ** 0.5  # Root Mean Squared Error (RMSE)

# Calculate Mean Absolute Percentage Error (MAPE)
mape = (abs(y_test - y_pred) / y_test).mean() * 100

# Calculate "Accuracy" as 100 - MAPE
accuracy = 100 - mape

# Print evaluation metrics, labelled with the model actually trained in this cell
print(f'Mean Squared Error (Linear Regression): {mse_lr}')
print(f'R² (Coefficient of Determination) for Linear Regression: {r2_lr}')
print(f'Root Mean Squared Error (RMSE) for Linear Regression: {rmse_lr}')
print(f'MAPE (Mean Absolute Percentage Error): {mape:.2f}%')
print(f'Model Accuracy: {accuracy:.2f}%')

# Save the trained model and encoders to files
joblib.dump(model, 'service_cost_predictor.pkl')
joblib.dump(label_encoder_vehicle_type, 'label_encoder_vehicle_type.pkl')
joblib.dump(label_encoder_service_type, 'label_encoder_service_type.pkl')

# Function to predict service cost
def predict_service_cost(vehicle_type, vehicle_age, odometer_reading, service_type, parts_replaced, labor_hours):
    """Predict a service cost with the model stored in 'service_cost_predictor.pkl'.

    The two label encoders and the model are re-loaded from their pickle
    files on every call.

    Args:
        vehicle_type: Raw vehicle type label, encoded with the saved encoder.
        vehicle_age: Vehicle age, passed to the model unchanged.
        odometer_reading: Odometer reading, passed to the model unchanged.
        service_type: Raw service type label, encoded with the saved encoder.
        parts_replaced: Number of parts replaced.
        labor_hours: Labor hours.

    Returns:
        The first (and only) element of the model's prediction for this row.
    """
    vehicle_encoder = joblib.load('label_encoder_vehicle_type.pkl')
    service_encoder = joblib.load('label_encoder_service_type.pkl')

    # The model was fit on encoded columns, so the raw labels are mapped the same way.
    vehicle_code = vehicle_encoder.transform([vehicle_type])[0]
    service_code = service_encoder.transform([service_type])[0]

    # Single-row frame whose column names and order mirror the training features.
    feature_names = ['vehicle_type', 'vehicle_age', 'odometer_reading', 'service_type', 'parts_replaced', 'labor_hours']
    feature_row = [vehicle_code, vehicle_age, odometer_reading, service_code, parts_replaced, labor_hours]
    features = pd.DataFrame([feature_row], columns=feature_names)

    regressor = joblib.load('service_cost_predictor.pkl')
    return regressor.predict(features)[0]

# Example: regular maintenance on a bike with age 10, odometer 80000,
# 3 parts replaced and 4 labor hours.
vehicle_type, service_type = 'Bike', 'Regular Maintenance'

cost = predict_service_cost(
    vehicle_type,
    vehicle_age=10,
    odometer_reading=80000,
    service_type=service_type,
    parts_replaced=3,
    labor_hours=4,
)
print(f"Predicted Service Cost: ₹{cost:.2f}")


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Read the training data.
# NOTE(review): the dataset-generation cell writes 'synthetic_service_data_large.csv';
# confirm that 'service_data.csv' is the intended input.
df = pd.read_csv('service_data.csv')

# One encoder per categorical column, kept separately so each can be pickled below.
label_encoder_vehicle_type = LabelEncoder()
label_encoder_service_type = LabelEncoder()

# Replace the string columns in df with their integer codes.
for column, encoder in (
    ('vehicle_type', label_encoder_vehicle_type),
    ('service_type', label_encoder_service_type),
):
    df[column] = encoder.fit_transform(df[column])

# Model inputs and the target column.
feature_columns = [
    'vehicle_type',
    'vehicle_age',
    'odometer_reading',
    'service_type',
    'parts_replaced',
    'labor_hours',
]
X = df[feature_columns]
y = df['service_cost']

# Hold out 20% of the rows; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Score the held-out rows.
y_pred = rf_model.predict(X_test)

# R² and RMSE are computed here but only MSE and the "accuracy" figure are printed.
mse_rf = mean_squared_error(y_test, y_pred)
r2_rf = r2_score(y_test, y_pred)
rmse_rf = mse_rf ** 0.5

# Mean absolute percentage error, and "accuracy" defined as 100 - MAPE.
mape = ((y_test - y_pred).abs() / y_test).mean() * 100
accuracy = 100 - mape

print(f'Mean Squared Error (Random Forest): {mse_rf}')
print(f'Model Accuracy: {accuracy:.2f}%')

# Persist the model and both encoders for predict_service_cost_rf().
joblib.dump(rf_model, 'service_cost_rf_predictor.pkl')
joblib.dump(label_encoder_vehicle_type, 'label_encoder_vehicle_type.pkl')
joblib.dump(label_encoder_service_type, 'label_encoder_service_type.pkl')

# Function to predict service cost using Random Forest model
def predict_service_cost_rf(vehicle_type, vehicle_age, odometer_reading, service_type, parts_replaced, labor_hours):
    """Predict a service cost with the model stored in 'service_cost_rf_predictor.pkl'.

    Both label encoders and the model are read from their pickle files each
    time the function is called.

    Args:
        vehicle_type: Raw vehicle type label; encoded before prediction.
        vehicle_age: Vehicle age, used as-is.
        odometer_reading: Odometer reading, used as-is.
        service_type: Raw service type label; encoded before prediction.
        parts_replaced: Number of parts replaced.
        labor_hours: Labor hours.

    Returns:
        The single predicted value for the given inputs.
    """
    encoders = {
        'vehicle_type': joblib.load('label_encoder_vehicle_type.pkl'),
        'service_type': joblib.load('label_encoder_service_type.pkl'),
    }

    # Keys are the training column names, in training order.
    record = {
        'vehicle_type': encoders['vehicle_type'].transform([vehicle_type])[0],
        'vehicle_age': vehicle_age,
        'odometer_reading': odometer_reading,
        'service_type': encoders['service_type'].transform([service_type])[0],
        'parts_replaced': parts_replaced,
        'labor_hours': labor_hours,
    }
    single_row = pd.DataFrame([list(record.values())], columns=list(record))

    forest = joblib.load('service_cost_rf_predictor.pkl')
    prediction = forest.predict(single_row)
    return prediction[0]

# Example: tyre replacement on a bike with age 6, odometer 24000,
# 2 parts replaced and 2 labor hours.
vehicle_type, service_type = 'Bike', 'Tyre Replacement'

cost = predict_service_cost_rf(
    vehicle_type,
    vehicle_age=6,
    odometer_reading=24000,
    service_type=service_type,
    parts_replaced=2,
    labor_hours=2,
)
print(f"Predicted Service Cost (Random Forest): ₹{cost:.2f}")


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Read the training data.
# NOTE(review): the dataset-generation cell writes 'synthetic_service_data_large.csv';
# confirm that 'service_data.csv' is the intended input.
df = pd.read_csv('service_data.csv')

# One encoder per categorical column, kept separately so each can be pickled below.
label_encoder_vehicle_type = LabelEncoder()
label_encoder_service_type = LabelEncoder()

# Replace the string columns in df with their integer codes.
for column, encoder in (
    ('vehicle_type', label_encoder_vehicle_type),
    ('service_type', label_encoder_service_type),
):
    df[column] = encoder.fit_transform(df[column])

# Model inputs and the target column.
feature_columns = [
    'vehicle_type',
    'vehicle_age',
    'odometer_reading',
    'service_type',
    'parts_replaced',
    'labor_hours',
]
X = df[feature_columns]
y = df['service_cost']

# Hold out 20% of the rows; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a gradient boosting regressor with 100 boosting stages.
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)

# Score the held-out rows.
y_pred = gb_model.predict(X_test)

# R² and RMSE are computed here but only MSE and the "accuracy" figure are printed.
mse_gb = mean_squared_error(y_test, y_pred)
r2_gb = r2_score(y_test, y_pred)
rmse_gb = mse_gb ** 0.5

# Mean absolute percentage error, and "accuracy" defined as 100 - MAPE.
mape = ((y_test - y_pred).abs() / y_test).mean() * 100
accuracy = 100 - mape

print(f'Mean Squared Error (Gradient Boosting): {mse_gb}')
print(f'Model Accuracy: {accuracy:.2f}%')

# Persist the model and both encoders for predict_service_cost_gb().
joblib.dump(gb_model, 'service_cost_gb_predictor.pkl')
joblib.dump(label_encoder_vehicle_type, 'label_encoder_vehicle_type.pkl')
joblib.dump(label_encoder_service_type, 'label_encoder_service_type.pkl')

# Function to predict service cost using Gradient Boosting model
def predict_service_cost_gb(vehicle_type, vehicle_age, odometer_reading, service_type, parts_replaced, labor_hours):
    """Predict a service cost with the model stored in 'service_cost_gb_predictor.pkl'.

    The label encoders and the model are loaded from disk on each call.

    Args:
        vehicle_type: Raw vehicle type label; converted with the saved encoder.
        vehicle_age: Vehicle age, forwarded unchanged.
        odometer_reading: Odometer reading, forwarded unchanged.
        service_type: Raw service type label; converted with the saved encoder.
        parts_replaced: Number of parts replaced.
        labor_hours: Labor hours.

    Returns:
        The first element of the model's prediction array.
    """
    type_encoder = joblib.load('label_encoder_vehicle_type.pkl')
    service_encoder = joblib.load('label_encoder_service_type.pkl')

    # transform() takes a sequence, so wrap each label and unwrap the single code.
    (type_code,) = type_encoder.transform([vehicle_type])[:1]
    (service_code,) = service_encoder.transform([service_type])[:1]

    # One-row frame using the training column names in training order.
    columns = ['vehicle_type', 'vehicle_age', 'odometer_reading', 'service_type', 'parts_replaced', 'labor_hours']
    values = [type_code, vehicle_age, odometer_reading, service_code, parts_replaced, labor_hours]
    query = pd.DataFrame([values], columns=columns)

    booster = joblib.load('service_cost_gb_predictor.pkl')
    return booster.predict(query)[0]

# Example: tyre replacement on a car with age 9, odometer 170000,
# 4 parts replaced and 2 labor hours.
vehicle_type, service_type = 'Car', 'Tyre Replacement'

cost = predict_service_cost_gb(
    vehicle_type,
    vehicle_age=9,
    odometer_reading=170000,
    service_type=service_type,
    parts_replaced=4,
    labor_hours=2,
)
print(f"Predicted Service Cost (Gradient Boosting): ₹{cost:.2f}")
