<a href="https://colab.research.google.com/github/Shivanshu04/Grouphousing_floor_pred/blob/main/4th_floor_convert_to_pmml_file_type.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the PMML exporter into the kernel's own environment.
# Pinned to the release this notebook was last run against so that a later
# sklearn2pmml release cannot silently change the export.
%pip install sklearn2pmml==0.98.1


Collecting sklearn2pmml
  Downloading sklearn2pmml-0.98.1.tar.gz (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill>=0.3.4 (from sklearn2pmml)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sklearn2pmml
  Building wheel for sklearn2pmml (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn2pmml: filename=sklearn2pmml-0.98.1-py3-none-any.whl size=7051306 sha256=c22f22503473cf0b08aa8f69aee479fe262d0e5ae5a2430cb225013b6b5a3100
  Stored in directory: /root/.cache/pip/wheels/7c/44/b2/48ffeecd45f409ea55fb0c10fa56023efb8432cb9deb679a82
Successfully built sklearn2pmml
Installing collected packages: dill, sklearn2pmml
Successfully installed dill-0.3.7 sklearn2pmml-0.98.1


In [None]:
import pandas as pd
from sklearn2pmml import sklearn2pmml, PMMLPipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
import numpy as np

# Define the file path to the dataset
file_path = '/content/4th_floor_with_estimated_durations_final_output.csv'

# Load the raw project records
data = pd.read_csv(file_path)

# Step 2: Data Preprocessing
# Parse both schedule dates; values that cannot be parsed become NaT instead of raising
for date_column in ('actual_commencement_date', 'estimated_finish_date'):
    data[date_column] = pd.to_datetime(data[date_column], errors='coerce')

# Durations in whole days. The reference point is the moment this cell runs,
# so these features (and any model fitted on them) differ from run to run.
current_date = datetime.now()
data['duration_until_estimated_finish'] = (data['estimated_finish_date'] - data['actual_commencement_date']).dt.days
data['duration_since_commencement'] = (current_date - data['actual_commencement_date']).dt.days
data['remaining_duration'] = (data['estimated_finish_date'] - current_date).dt.days

# A row whose start and finish dates coincide has a planned duration of 0, and
# the division then produces +/-inf. An infinite value would turn the column
# mean used for imputation below into inf as well, so it is converted to NaN
# here and imputed together with the other missing values.
data['progress_ratio'] = (
    data['duration_since_commencement'] / data['duration_until_estimated_finish']
).replace([np.inf, -np.inf], np.nan)

# Handling missing values in the 'current_stage' column:
# a completed project with no recorded stage is treated as handed over
data.loc[(data['current_stage'].isna()) & (data['Project_status'] == 'Completed'), 'current_stage'] = 'Handover'

# Step 3: Feature Engineering
# Calendar parts of the two schedule dates
data['year_of_commencement'] = data['actual_commencement_date'].dt.year
data['month_of_commencement'] = data['actual_commencement_date'].dt.month
data['year_of_estimated_finish'] = data['estimated_finish_date'].dt.year
data['month_of_estimated_finish'] = data['estimated_finish_date'].dt.month

# Positive when more days have elapsed than were planned
data['days_exceeding_estimated_duration'] = data['duration_since_commencement'] - data['duration_until_estimated_finish']

# 1 when the estimated finish date is already in the past.
# Computed before imputation, so a row with a missing remaining duration gets 0.
data['is_delayed'] = (data['remaining_duration'] < 0).astype(int)

# Replace the remaining missing numeric values with their column mean
# (non-numeric columns are left as they are)
data = data.fillna(data.mean(numeric_only=True))

# Performing label encoding on the 'current_stage' column
label_encoder = LabelEncoder()
data['current_stage'] = data['current_stage'].astype(str)  # any stage still missing becomes the literal string 'nan'
data['current_stage_encoded'] = label_encoder.fit_transform(data['current_stage'])

# Step 4: Data Splitting
# Selecting relevant features for the model
feature_columns = [
    'duration_until_estimated_finish', 'duration_since_commencement', 'remaining_duration',
    'progress_ratio', 'year_of_commencement', 'month_of_commencement',
    'year_of_estimated_finish', 'month_of_estimated_finish',
    'days_exceeding_estimated_duration', 'is_delayed'
]

# Defining the feature set and the target variable
X = data[feature_columns]
y = data['current_stage_encoded']

# Splitting the data into training and testing sets (80% training and 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Last expression of the cell: shown as a confirmation that every step above ran
"Data preprocessing and splitting completed successfully."


'Data preprocessing and splitting completed successfully.'

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Build the gradient boosting classifier (random_state pinned to 42) and
# fit it on the training split produced by train_test_split
gbm_model = GradientBoostingClassifier(random_state=42)
gbm_model.fit(X=X_train, y=y_train)


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Predict the encoded stage for every row of the held-out split
y_pred = gbm_model.predict(X_test)

# Share of test rows whose predicted stage equals the true stage
accuracy = accuracy_score(y_test, y_pred)

# Not every stage known to the encoder has to occur in the test split or in
# the predictions. Passing all encoder classes as target_names without a
# matching `labels` argument makes classification_report raise a ValueError,
# so the report is restricted to the codes that are actually present.
unique_labels = np.unique(np.concatenate((y_test, y_pred)))
class_report = classification_report(
    y_test,
    y_pred,
    labels=unique_labels,
    target_names=label_encoder.classes_[unique_labels],
)

accuracy, class_report

ValueError: ignored

In [None]:
# Class codes that occur in the test split or in the predictions; the report
# is limited to these so that labels and target names always line up
unique_labels = np.unique(np.concatenate((y_test, y_pred)))

# Getting the classification report with the correct labels
class_report = classification_report(y_test, y_pred, labels=unique_labels, target_names=label_encoder.classes_[unique_labels])

# The report is a multi-line text table. Printing renders it as a table; a
# bare expression in the middle of a cell would not be displayed at all.
print(f"Accuracy: {accuracy:.4f}")
print(class_report)

# Create a PMMLPipeline with the trained model
pipeline = PMMLPipeline([("classifier", gbm_model)])

# Export the pipeline to a PMML file
sklearn2pmml(pipeline, "gbm_model.pmml", with_repr=True)

In [None]:
# Getting the unique stage names in the main dataset and the materials data file
import pandas as pd

# Read the materials table; it is queried later through its 'Activity' column
materials_data = pd.read_csv("/content/Copy of stage_with materai.csv")

# Distinct stage labels on both sides, collected together so the two
# spellings can be compared
unique_stages_main_dataset, unique_stages_materials_data = (
    data['current_stage'].unique(),
    materials_data['Activity'].unique(),
)

unique_stages_main_dataset, unique_stages_materials_data


(array(['Handover',
        'Plumbing & Sanitary,Electrification Works_estimated_duration',
        'Plastering on outer sides_estimated_duration',
        'Painting and Finishing_estimated_duration',
        '3rd Floor slab casting_estimated_duration',
        'Electrical concealed, PVC Fitting, plastering at 1st-4th floor_estimated_duration',
        'nan', 'Brick work at 1st Floor _estimated_duration',
        '3rd floor Columns casting _estimated_duration',
        'Tiles work_estimated_duration',
        'Electrical concealed, PVC Fitting, plastering at ground floor_estimated_duration',
        '4th Floor slab casting_estimated_duration',
        'Brick work of 2nd to 4th Floor _estimated_duration',
        'Doors & Windows Fixing Furniture work_estimated_duration',
        'Ground Floor slab casting _estimated_duration',
        'Cleaning & survey_estimated_duration',
        '1st Floor slab casting_estimated_duration',
        'Excavation,leveling & P.C.C  for Basement  B1 _esti

In [None]:
# Suffix carried by the stage labels of the main dataset
STAGE_SUFFIX = '_estimated_duration'

# Removing the suffix from the stage names in the main dataset.
# regex=False makes this a literal replacement regardless of the pandas
# version's default for `regex`.
data['current_stage_cleaned'] = data['current_stage'].str.replace(STAGE_SUFFIX, '', regex=False)

# Mapping from encoded label -> cleaned stage name.
# The target the classifier was fitted on was produced by a LabelEncoder fitted
# on the raw `current_stage` strings. An encoder fitted on the cleaned names
# can assign different integers (stripping the suffix can change the sort
# order), so the codes are rebuilt from the raw names and only the displayed
# name is cleaned. This also keeps the cell safe to re-run.
raw_stage_classes = LabelEncoder().fit(data['current_stage'].astype(str)).classes_
label_to_stage_mapping = {
    code: str(raw_name).replace(STAGE_SUFFIX, '')
    for code, raw_name in enumerate(raw_stage_classes)
}

# Updating the label encoder to use the cleaned stage names.
# NOTE: these codes describe the cleaned names only; use label_to_stage_mapping
# to decode predictions of the already-trained model.
label_encoder = LabelEncoder()
data['current_stage_encoded'] = label_encoder.fit_transform(data['current_stage_cleaned'].astype(str))

# Cleaned unique stage names of the main dataset
cleaned_unique_stages_main_dataset = data['current_stage_cleaned'].unique()

In [None]:
from pandas.tseries.offsets import MonthEnd

# Updating the prediction function to provide predictions for the current and upcoming stages for each month

# Defining the prediction function
def predict_current_stage(inputs, *, start_date=None, model=None, materials=None,
                          stage_mapping=None, features=None):
    """
    Predict the most likely construction stage for today and for every following
    month end up to the estimated finish date, with suggested materials per stage.

    Args:
    inputs (dict): Must contain 'actual_commencement_date' and
        'estimated_finish_date' (anything pandas.to_datetime can parse).
    start_date (optional): First date to predict for. Defaults to "today";
        pass a fixed date to get reproducible output.
    model (optional): Fitted classifier exposing `predict_proba` and `classes_`.
        Defaults to the notebook-level `gbm_model`.
    materials (optional): DataFrame with the columns 'Activity' and
        'Materials (suggestions)'. Defaults to the notebook-level `materials_data`.
    stage_mapping (optional): Dict from encoded class label to stage name.
        Defaults to the notebook-level `label_to_stage_mapping`.
    features (optional): Ordered list of feature column names passed to the
        model. Defaults to the notebook-level `feature_columns`.

    Returns:
    list: One dict per predicted date with the keys "Date" (YYYY-MM-DD), "Stage",
        "Probability" and "Recommended Materials". Empty when the first date
        already lies after the estimated finish date.
    """
    # Fall back to the objects built earlier in the notebook
    model = gbm_model if model is None else model
    materials = materials_data if materials is None else materials
    stage_mapping = label_to_stage_mapping if stage_mapping is None else stage_mapping
    features = feature_columns if features is None else features

    # Creating a one-row data frame from the inputs
    input_data = pd.DataFrame([inputs])

    # Converting date columns to datetime objects
    input_data['actual_commencement_date'] = pd.to_datetime(input_data['actual_commencement_date'])
    input_data['estimated_finish_date'] = pd.to_datetime(input_data['estimated_finish_date'])
    commencement = input_data['actual_commencement_date']
    finish = input_data['estimated_finish_date']
    finish_date = finish.iloc[0]

    # Features that depend only on the two schedule dates are computed once
    input_data['duration_until_estimated_finish'] = (finish - commencement).dt.days
    input_data['year_of_commencement'] = commencement.dt.year
    input_data['month_of_commencement'] = commencement.dt.month
    input_data['year_of_estimated_finish'] = finish.dt.year
    input_data['month_of_estimated_finish'] = finish.dt.month

    # Creating a list to store the predictions for each month
    monthly_predictions = []

    # Walk from the start date to the estimated finish date. The first step
    # moves to the end of the current month, later steps to the next month end.
    current_date = pd.to_datetime("today") if start_date is None else pd.to_datetime(start_date)
    while current_date <= finish_date:
        # Features that depend on the date currently being evaluated
        input_data['duration_since_commencement'] = (current_date - commencement).dt.days
        input_data['remaining_duration'] = (finish - current_date).dt.days
        input_data['progress_ratio'] = input_data['duration_since_commencement'] / input_data['duration_until_estimated_finish']
        input_data['days_exceeding_estimated_duration'] = input_data['duration_since_commencement'] - input_data['duration_until_estimated_finish']
        input_data['is_delayed'] = (input_data['remaining_duration'] < 0).astype(int)

        # Probability of every class for the single input row
        predicted_probs = model.predict_proba(input_data[features])[0]

        # The columns of predict_proba follow model.classes_. The argmax is
        # therefore a position in classes_, not an encoded label; the two only
        # coincide when every label occurred in the training split.
        top_prediction_index = np.argmax(predicted_probs)
        top_label = model.classes_[top_prediction_index]
        top_stage = stage_mapping[top_label]
        top_probability = predicted_probs[top_prediction_index]

        # Stage names contain characters such as '&', '.' and ',', so the
        # lookup is a literal, case-insensitive substring match (regex=False)
        matches = materials.loc[
            materials['Activity'].str.contains(top_stage, case=False, na=False, regex=False),
            'Materials (suggestions)',
        ]
        recommended_materials = matches.values[0] if not matches.empty else "No materials suggested"

        # Adding the prediction for the current date to the list of monthly predictions
        monthly_predictions.append({
            "Date": current_date.strftime('%Y-%m-%d'),
            "Stage": top_stage,
            "Probability": top_probability,
            "Recommended Materials": recommended_materials
        })

        # Moving to the next month end
        current_date = current_date + MonthEnd(1)

    # Returning the results
    return monthly_predictions

# Testing the prediction function with a sample input
# (the start date literal previously began with a stray tab character)
test_input = {
    "actual_commencement_date": "2023-9-19",
    "estimated_finish_date": "2028-02-03",
}

# Keep the result: only the last expression of a cell is displayed, so a bare
# call placed before the export would be discarded
monthly_predictions = predict_current_stage(test_input)

# Export the pipeline to a PMML file
sklearn2pmml(pipeline, "gbm_model.pmml", with_repr=True)

# Show the month-by-month forecast as a table
pd.DataFrame(monthly_predictions)


In [None]:
# Install the PMML exporter into the kernel's own environment.
# Pinned to the release this notebook was last run against so that a later
# sklearn2pmml release cannot silently change the export.
%pip install sklearn2pmml==0.98.1



In [None]:

import pandas as pd
from sklearn2pmml import sklearn2pmml, PMMLPipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
import numpy as np

# Define the file path to the dataset
file_path = '/content/4th_floor_with_estimated_durations_final_output.csv'

# Load the raw project records
data = pd.read_csv(file_path)

# Step 2: Data Preprocessing
# Parse both schedule dates; values that cannot be parsed become NaT instead of raising
for date_column in ('actual_commencement_date', 'estimated_finish_date'):
    data[date_column] = pd.to_datetime(data[date_column], errors='coerce')

# Durations in whole days. The reference point is the moment this cell runs,
# so these features (and any model fitted on them) differ from run to run.
current_date = datetime.now()
data['duration_until_estimated_finish'] = (data['estimated_finish_date'] - data['actual_commencement_date']).dt.days
data['duration_since_commencement'] = (current_date - data['actual_commencement_date']).dt.days
data['remaining_duration'] = (data['estimated_finish_date'] - current_date).dt.days

# A row whose start and finish dates coincide has a planned duration of 0, and
# the division then produces +/-inf. An infinite value would turn the column
# mean used for imputation below into inf as well, so it is converted to NaN
# here and imputed together with the other missing values.
data['progress_ratio'] = (
    data['duration_since_commencement'] / data['duration_until_estimated_finish']
).replace([np.inf, -np.inf], np.nan)

# Handling missing values in the 'current_stage' column:
# a completed project with no recorded stage is treated as handed over
data.loc[(data['current_stage'].isna()) & (data['Project_status'] == 'Completed'), 'current_stage'] = 'Handover'

# Step 3: Feature Engineering
# Calendar parts of the two schedule dates
data['year_of_commencement'] = data['actual_commencement_date'].dt.year
data['month_of_commencement'] = data['actual_commencement_date'].dt.month
data['year_of_estimated_finish'] = data['estimated_finish_date'].dt.year
data['month_of_estimated_finish'] = data['estimated_finish_date'].dt.month

# Positive when more days have elapsed than were planned
data['days_exceeding_estimated_duration'] = data['duration_since_commencement'] - data['duration_until_estimated_finish']

# 1 when the estimated finish date is already in the past.
# Computed before imputation, so a row with a missing remaining duration gets 0.
data['is_delayed'] = (data['remaining_duration'] < 0).astype(int)

# Replace the remaining missing numeric values with their column mean
# (non-numeric columns are left as they are)
data = data.fillna(data.mean(numeric_only=True))

# Performing label encoding on the 'current_stage' column
label_encoder = LabelEncoder()
data['current_stage'] = data['current_stage'].astype(str)  # any stage still missing becomes the literal string 'nan'
data['current_stage_encoded'] = label_encoder.fit_transform(data['current_stage'])

# Step 4: Data Splitting
# Selecting relevant features for the model
feature_columns = [
    'duration_until_estimated_finish', 'duration_since_commencement', 'remaining_duration',
    'progress_ratio', 'year_of_commencement', 'month_of_commencement',
    'year_of_estimated_finish', 'month_of_estimated_finish',
    'days_exceeding_estimated_duration', 'is_delayed'
]

# Defining the feature set and the target variable
X = data[feature_columns]
y = data['current_stage_encoded']

# Splitting the data into training and testing sets (80% training and 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Bare string kept from the original cell; it is not the last expression here,
# so it produces no output
"Data preprocessing and splitting completed successfully."

from sklearn.ensemble import GradientBoostingClassifier

# Build the gradient boosting classifier (random_state pinned to 42) and
# fit it on the training split produced by train_test_split
gbm_model = GradientBoostingClassifier(random_state=42)
gbm_model.fit(X=X_train, y=y_train)

from sklearn.metrics import classification_report, accuracy_score

# Predict the encoded stage for every row of the held-out split
y_pred = gbm_model.predict(X_test)

# Share of test rows whose predicted stage equals the true stage
accuracy = accuracy_score(y_test, y_pred)

# Class codes that occur in the test split or in the predictions. They are
# computed here, from this cell's own y_test / y_pred, instead of relying on
# a `unique_labels` variable left behind by an earlier cell.
unique_labels = np.unique(np.concatenate((y_test, y_pred)))

# Getting the classification report with the correct labels (computed once)
class_report = classification_report(y_test, y_pred, labels=unique_labels, target_names=label_encoder.classes_[unique_labels])

# The report is a multi-line text table. Printing renders it as a table; a
# bare expression in the middle of a cell would not be displayed at all.
print(f"Accuracy: {accuracy:.4f}")
print(class_report)

# Create a PMMLPipeline with the trained model
pipeline = PMMLPipeline([("classifier", gbm_model)])

# Export the pipeline to a PMML file
sklearn2pmml(pipeline, "gbm_model.pmml", with_repr=True)

# Getting the unique stage names in the main dataset and the materials data file
import pandas as pd

# Read the materials table; it is queried later through its 'Activity' column
materials_data = pd.read_csv("/content/Copy of stage_with materai.csv")

# Distinct stage labels on both sides, collected together so the two
# spellings can be compared
unique_stages_main_dataset, unique_stages_materials_data = (
    data['current_stage'].unique(),
    materials_data['Activity'].unique(),
)

unique_stages_main_dataset, unique_stages_materials_data
# Suffix carried by the stage labels of the main dataset
STAGE_SUFFIX = '_estimated_duration'

# Removing the suffix from the stage names in the main dataset.
# regex=False makes this a literal replacement regardless of the pandas
# version's default for `regex`.
data['current_stage_cleaned'] = data['current_stage'].str.replace(STAGE_SUFFIX, '', regex=False)

# Mapping from encoded label -> cleaned stage name.
# The target the classifier was fitted on was produced by a LabelEncoder fitted
# on the raw `current_stage` strings. An encoder fitted on the cleaned names
# can assign different integers (stripping the suffix can change the sort
# order), so the codes are rebuilt from the raw names and only the displayed
# name is cleaned. This also keeps the statements safe to re-run.
raw_stage_classes = LabelEncoder().fit(data['current_stage'].astype(str)).classes_
label_to_stage_mapping = {
    code: str(raw_name).replace(STAGE_SUFFIX, '')
    for code, raw_name in enumerate(raw_stage_classes)
}

# Updating the label encoder to use the cleaned stage names.
# NOTE: these codes describe the cleaned names only; use label_to_stage_mapping
# to decode predictions of the already-trained model.
label_encoder = LabelEncoder()
data['current_stage_encoded'] = label_encoder.fit_transform(data['current_stage_cleaned'].astype(str))

# Cleaned unique stage names of the main dataset
cleaned_unique_stages_main_dataset = data['current_stage_cleaned'].unique()
from pandas.tseries.offsets import MonthEnd

# Updating the prediction function to provide predictions for the current and upcoming stages for each month

# Defining the prediction function
def predict_current_stage(inputs, *, start_date=None, model=None, materials=None,
                          stage_mapping=None, features=None):
    """
    Predict the most likely construction stage for today and for every following
    month end up to the estimated finish date, with suggested materials per stage.

    Args:
    inputs (dict): Must contain 'actual_commencement_date' and
        'estimated_finish_date' (anything pandas.to_datetime can parse).
    start_date (optional): First date to predict for. Defaults to "today";
        pass a fixed date to get reproducible output.
    model (optional): Fitted classifier exposing `predict_proba` and `classes_`.
        Defaults to the notebook-level `gbm_model`.
    materials (optional): DataFrame with the columns 'Activity' and
        'Materials (suggestions)'. Defaults to the notebook-level `materials_data`.
    stage_mapping (optional): Dict from encoded class label to stage name.
        Defaults to the notebook-level `label_to_stage_mapping`.
    features (optional): Ordered list of feature column names passed to the
        model. Defaults to the notebook-level `feature_columns`.

    Returns:
    list: One dict per predicted date with the keys "Date" (YYYY-MM-DD), "Stage",
        "Probability" and "Recommended Materials". Empty when the first date
        already lies after the estimated finish date.
    """
    # Fall back to the objects built earlier in the notebook
    model = gbm_model if model is None else model
    materials = materials_data if materials is None else materials
    stage_mapping = label_to_stage_mapping if stage_mapping is None else stage_mapping
    features = feature_columns if features is None else features

    # Creating a one-row data frame from the inputs
    input_data = pd.DataFrame([inputs])

    # Converting date columns to datetime objects
    input_data['actual_commencement_date'] = pd.to_datetime(input_data['actual_commencement_date'])
    input_data['estimated_finish_date'] = pd.to_datetime(input_data['estimated_finish_date'])
    commencement = input_data['actual_commencement_date']
    finish = input_data['estimated_finish_date']
    finish_date = finish.iloc[0]

    # Features that depend only on the two schedule dates are computed once
    input_data['duration_until_estimated_finish'] = (finish - commencement).dt.days
    input_data['year_of_commencement'] = commencement.dt.year
    input_data['month_of_commencement'] = commencement.dt.month
    input_data['year_of_estimated_finish'] = finish.dt.year
    input_data['month_of_estimated_finish'] = finish.dt.month

    # Creating a list to store the predictions for each month
    monthly_predictions = []

    # Walk from the start date to the estimated finish date. The first step
    # moves to the end of the current month, later steps to the next month end.
    current_date = pd.to_datetime("today") if start_date is None else pd.to_datetime(start_date)
    while current_date <= finish_date:
        # Features that depend on the date currently being evaluated
        input_data['duration_since_commencement'] = (current_date - commencement).dt.days
        input_data['remaining_duration'] = (finish - current_date).dt.days
        input_data['progress_ratio'] = input_data['duration_since_commencement'] / input_data['duration_until_estimated_finish']
        input_data['days_exceeding_estimated_duration'] = input_data['duration_since_commencement'] - input_data['duration_until_estimated_finish']
        input_data['is_delayed'] = (input_data['remaining_duration'] < 0).astype(int)

        # Probability of every class for the single input row
        predicted_probs = model.predict_proba(input_data[features])[0]

        # The columns of predict_proba follow model.classes_. The argmax is
        # therefore a position in classes_, not an encoded label; the two only
        # coincide when every label occurred in the training split.
        top_prediction_index = np.argmax(predicted_probs)
        top_label = model.classes_[top_prediction_index]
        top_stage = stage_mapping[top_label]
        top_probability = predicted_probs[top_prediction_index]

        # Stage names contain characters such as '&', '.' and ',', so the
        # lookup is a literal, case-insensitive substring match (regex=False)
        matches = materials.loc[
            materials['Activity'].str.contains(top_stage, case=False, na=False, regex=False),
            'Materials (suggestions)',
        ]
        recommended_materials = matches.values[0] if not matches.empty else "No materials suggested"

        # Adding the prediction for the current date to the list of monthly predictions
        monthly_predictions.append({
            "Date": current_date.strftime('%Y-%m-%d'),
            "Stage": top_stage,
            "Probability": top_probability,
            "Recommended Materials": recommended_materials
        })

        # Moving to the next month end
        current_date = current_date + MonthEnd(1)

    # Returning the results
    return monthly_predictions

# Testing the prediction function with a sample input
# (the start date literal previously began with a stray tab character)
test_input = {
    "actual_commencement_date": "2023-9-19",
    "estimated_finish_date": "2028-02-03",
}

# Keep the result: only the last expression of a cell is displayed, so a bare
# call placed before the export would be discarded
monthly_predictions = predict_current_stage(test_input)

# Export the pipeline to a PMML file
sklearn2pmml(pipeline, "gbm_model.pmml", with_repr=True)

# Show the month-by-month forecast as a table
pd.DataFrame(monthly_predictions)
