In [10]:
import sagemaker
import boto3
from sagemaker.session import s3_input, Session

In [11]:


def create_s3_bucket(bucket_name, region="ap-south-1"):
    s3 = boto3.client('s3', region_name=region)
    
    try:
        s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={'LocationConstraint': region})
        print(f"Bucket {bucket_name} created successfully in {region} region!")
    except Exception as e:
        print(f"Error creating bucket: {e}")

# Usage:
bucket_name = "demomlbucket202"  # Bucket names must be globally unique
create_s3_bucket(bucket_name)


Error creating bucket: An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


In [21]:
# Function to upload a file to the specified S3 bucket
def upload_file_to_s3(bucket_name, local_file_path, s3_file_key):
    s3 = boto3.client('s3')
    
    try:
        s3.upload_file(local_file_path, bucket_name, s3_file_key)
        print(f"File {local_file_path} uploaded successfully to {bucket_name}/{s3_file_key}!")
    except Exception as e:
        print(f"Error uploading file to S3: {e}")

# Upload a file to the created bucket
local_file_path = "4th_floor_with_estimated_durations_final_output.csv"
s3_file_key = "desired/data.csv"
upload_file_to_s3(bucket_name, local_file_path, s3_file_key)


File 4th_floor_with_estimated_durations_final_output.csv uploaded successfully to demomlbucket202/desired/data.csv!


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from datetime import datetime
import numpy as np

def load_data(file_path):
    try:
        return pd.read_csv(file_path)
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

def preprocess_data(data):
    try:
        # Convert date columns to datetime objects
        data['actual_commencement_date'] = pd.to_datetime(data['actual_commencement_date'], errors='coerce')
        data['estimated_finish_date'] = pd.to_datetime(data['estimated_finish_date'], errors='coerce')

        # Create new date-based features
        current_date = datetime.now()
        data['duration_until_estimated_finish'] = (data['estimated_finish_date'] - data['actual_commencement_date']).dt.days
        data['duration_since_commencement'] = (current_date - data['actual_commencement_date']).dt.days
        data['remaining_duration'] = (data['estimated_finish_date'] - current_date).dt.days
        data['progress_ratio'] = data['duration_since_commencement'] / data['duration_until_estimated_finish']

        # Handle missing values
        data.loc[(data['current_stage'].isna()) & (data['Project_status'] == 'Completed'), 'current_stage'] = 'Handover'
        
        # Further feature engineering
        data['year_of_commencement'] = data['actual_commencement_date'].dt.year
        data['month_of_commencement'] = data['actual_commencement_date'].dt.month
        data['year_of_estimated_finish'] = data['estimated_finish_date'].dt.year
        data['month_of_estimated_finish'] = data['estimated_finish_date'].dt.month
        data['days_exceeding_estimated_duration'] = data['duration_since_commencement'] - data['duration_until_estimated_finish']
        data['is_delayed'] = (data['remaining_duration'] < 0).astype(int)

        # Handle other missing values
        data.fillna(data.mean(numeric_only=True), inplace=True)

        return data
    except Exception as e:
        print(f"Error in preprocessing: {e}")
        return None

def encode_data(data):
    try:
        # Label Encoding
        label_encoder = LabelEncoder()
        data['current_stage'] = data['current_stage'].astype(str) # To handle any NaN values left
        data['current_stage_encoded'] = label_encoder.fit_transform(data['current_stage'])

        # OneHot Encoding (Optional based on the number of unique values in 'current_stage')
        if len(data['current_stage'].unique()) > 10: # This threshold can be adjusted
            onehot_encoder = OneHotEncoder()
            encoded_features = onehot_encoder.fit_transform(data[['current_stage']])
            data = pd.concat([data, pd.DataFrame(encoded_features.toarray(), columns=onehot_encoder.get_feature_names(['current_stage']))], axis=1)

        return data, label_encoder
    except Exception as e:
        print(f"Error in encoding: {e}")
        return None, None

def split_data(data):
    # Define relevant features for the model
    feature_columns = [
        'duration_until_estimated_finish', 'duration_since_commencement', 'remaining_duration',
        'progress_ratio', 'year_of_commencement', 'month_of_commencement',
        'year_of_estimated_finish', 'month_of_estimated_finish',
        'days_exceeding_estimated_duration', 'is_delayed'
    ]
    X = data[feature_columns]
    y = data['current_stage_encoded']

    # Split the data
    return train_test_split(X, y, test_size=0.2, random_state=42)

# Usage:

file_path = '4th_floor_with_estimated_durations_final_output.csv'
data = load_data(file_path)
if data is not None:
    data = preprocess_data(data)
    if data is not None:
        data, label_encoder = encode_data(data)
        if data is not None and label_encoder is not None:
            X_train, X_test, y_train, y_test = split_data(data)
            print("Data processing completed!")


Data processing completed!




In [17]:
from sklearn.ensemble import GradientBoostingClassifier

# Initializing and training the Gradient Boosting Classifier
gbm_model = GradientBoostingClassifier(random_state=42)
gbm_model.fit(X_train, y_train)

# Displaying a message to indicate that the model has been trained
"Gradient Boosting Model has been trained successfully."

'Gradient Boosting Model has been trained successfully.'

In [24]:
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

def evaluate_model(model, X_test, y_test, label_encoder):
    try:
        # Predicting the current stage on the testing set
        y_pred = model.predict(X_test)
        
        # Calculating the accuracy of the model
        accuracy = accuracy_score(y_test, y_pred)
        
        # Getting the unique labels present in the test set and predictions
        unique_labels = np.unique(np.concatenate((y_test, y_pred)))
        
        # Getting the classification report with the correct labels
        class_report = classification_report(y_test, y_pred, labels=unique_labels, target_names=label_encoder.classes_[unique_labels])
        
        return accuracy, class_report
    except Exception as e:
        print(f"Error evaluating model: {e}")
        return None, None

# Usage:

accuracy, class_report = evaluate_model(gbm_model, X_test, y_test, label_encoder)

if accuracy is not None and class_report is not None:
    print("Model Accuracy:", accuracy)
    print("\nClassification Report:\n", class_report)


Model Accuracy: 0.950207468879668

Classification Report:
                                                                                              precision    recall  f1-score   support

                                                  1st Floor slab casting_estimated_duration       1.00      0.67      0.80         3
                                               1st floor Columns casting_estimated_duration       0.50      0.50      0.50         2
                                                  2nd Floor slab casting_estimated_duration       1.00      0.88      0.93         8
                                                  3rd Floor slab casting_estimated_duration       1.00      1.00      1.00         7
                                              3rd floor Columns casting _estimated_duration       0.50      1.00      0.67         1
                                                  4th Floor slab casting_estimated_duration       1.00      1.00      1.00         5
         

In [28]:
# Function to upload a file to the specified S3 bucket
def upload_file_to_s3(bucket_name, local_file_path, s3_file_key):
    s3 = boto3.client('s3')
    
    try:
        s3.upload_file(local_file_path, bucket_name, s3_file_key)
        print(f"File {local_file_path} uploaded successfully to {bucket_name}/{s3_file_key}!")
    except Exception as e:
        print(f"Error uploading file to S3: {e}")

# Upload a file to the created bucket
local_file_path = "Copy of stage_with materai.csv"
s3_file_key = "desired/stages.csv"
upload_file_to_s3(bucket_name, local_file_path, s3_file_key)

File Copy of stage_with materai.csv uploaded successfully to demomlbucket202/desired/stages.csv!


In [57]:
# Getting the unique stage names in the main dataset and the materials data file
import pandas as pd

materials_data = pd.read_csv("Copy of stage_with materai.csv")
unique_stages_main_dataset = data['current_stage'].unique()
unique_stages_materials_data = materials_data['Activity'].unique()

unique_stages_main_dataset, unique_stages_materials_data

(array(['Handover',
        'Plumbing & Sanitary,Electrification Works_estimated_duration',
        'Plastering on outer sides_estimated_duration',
        'Painting and Finishing_estimated_duration',
        '3rd Floor slab casting_estimated_duration',
        'Electrical concealed, PVC Fitting, plastering at 1st-4th floor_estimated_duration',
        'nan', 'Brick work at 1st Floor _estimated_duration',
        '3rd floor Columns casting _estimated_duration',
        'Tiles work_estimated_duration',
        'Electrical concealed, PVC Fitting, plastering at ground floor_estimated_duration',
        '4th Floor slab casting_estimated_duration',
        'Brick work of 2nd to 4th Floor _estimated_duration',
        'Doors & Windows Fixing Furniture work_estimated_duration',
        'Ground Floor slab casting _estimated_duration',
        'Cleaning & survey_estimated_duration',
        '1st Floor slab casting_estimated_duration',
        'Excavation,leveling & P.C.C  for Basement  B1 _esti

In [30]:
# Removing the "_estimated_duration" suffix from the stage names in the main dataset
data['current_stage_cleaned'] = data['current_stage'].str.replace('_estimated_duration', '')

# Updating the label encoder to use the cleaned stage names
label_encoder = LabelEncoder()
data['current_stage_encoded'] = label_encoder.fit_transform(data['current_stage_cleaned'].astype(str))

# Creating a dictionary to map the encoded labels to the cleaned stage names
label_to_stage_mapping = dict(zip(range(len(label_encoder.classes_)), label_encoder.classes_))

# Displaying the cleaned unique stage names
cleaned_unique_stages_main_dataset = data['current_stage_cleaned'].unique()
cleaned_unique_stages_main_dataset

array(['Handover', 'Plumbing & Sanitary,Electrification Works',
       'Plastering on outer sides', 'Painting and Finishing',
       '3rd Floor slab casting',
       'Electrical concealed, PVC Fitting, plastering at 1st-4th floor',
       'nan', 'Brick work at 1st Floor ', '3rd floor Columns casting ',
       'Tiles work',
       'Electrical concealed, PVC Fitting, plastering at ground floor',
       '4th Floor slab casting', 'Brick work of 2nd to 4th Floor ',
       'Doors & Windows Fixing Furniture work',
       'Ground Floor slab casting ', 'Cleaning & survey',
       '1st Floor slab casting',
       'Excavation,leveling & P.C.C  for Basement  B1 ',
       '4th floor Columns casting ',
       'Brick work at Basement to Ground Floor ', 'Slab of B (bottom) ',
       'Electrical concealed, PVC Fitting, plastering at Basement',
       '2nd floor Columns casting ', '2nd Floor slab casting',
       'Raft footing, Column B1, Retaining wall Reinforcement ,Concrete pouring ',
       '1st flo

In [32]:
# Removing the "_estimated_duration" suffix from the stage names in the main dataset
data['current_stage_cleaned'] = data['current_stage'].str.replace('_estimated_duration', '')

# Updating the label encoder to use the cleaned stage names
label_encoder = LabelEncoder()
data['current_stage_encoded'] = label_encoder.fit_transform(data['current_stage_cleaned'].astype(str))

# Creating a dictionary to map the encoded labels to the cleaned stage names
label_to_stage_mapping = dict(zip(range(len(label_encoder.classes_)), label_encoder.classes_))

# Displaying the cleaned unique stage names
cleaned_unique_stages_main_dataset = data['current_stage_cleaned'].unique()
cleaned_unique_stages_main_dataset

array(['Handover', 'Plumbing & Sanitary,Electrification Works',
       'Plastering on outer sides', 'Painting and Finishing',
       '3rd Floor slab casting',
       'Electrical concealed, PVC Fitting, plastering at 1st-4th floor',
       'nan', 'Brick work at 1st Floor ', '3rd floor Columns casting ',
       'Tiles work',
       'Electrical concealed, PVC Fitting, plastering at ground floor',
       '4th Floor slab casting', 'Brick work of 2nd to 4th Floor ',
       'Doors & Windows Fixing Furniture work',
       'Ground Floor slab casting ', 'Cleaning & survey',
       '1st Floor slab casting',
       'Excavation,leveling & P.C.C  for Basement  B1 ',
       '4th floor Columns casting ',
       'Brick work at Basement to Ground Floor ', 'Slab of B (bottom) ',
       'Electrical concealed, PVC Fitting, plastering at Basement',
       '2nd floor Columns casting ', '2nd Floor slab casting',
       'Raft footing, Column B1, Retaining wall Reinforcement ,Concrete pouring ',
       '1st flo

In [38]:
# Updating the prediction function to use the cleaned stage names

# Defining the prediction function
def predict_current_stage(inputs):
    """
    Function to predict the current stage of a project and recommend materials.

    Args:
    inputs (dict): Dictionary containing the necessary inputs (start date, end date).

    Returns:
    dict: Dictionary containing the predicted stage and recommended materials.
    """
    # Creating a data frame from the inputs
    input_data = pd.DataFrame([inputs])

    # Converting date columns to datetime objects and creating new features
    input_data['actual_commencement_date'] = pd.to_datetime(input_data['actual_commencement_date'])
    input_data['estimated_finish_date'] = pd.to_datetime(input_data['estimated_finish_date'])
    current_date = datetime.now()
    input_data['duration_until_estimated_finish'] = (input_data['estimated_finish_date'] - input_data['actual_commencement_date']).dt.days
    input_data['duration_since_commencement'] = (current_date - input_data['actual_commencement_date']).dt.days
    input_data['remaining_duration'] = (input_data['estimated_finish_date'] - current_date).dt.days
    input_data['progress_ratio'] = input_data['duration_since_commencement'] / input_data['duration_until_estimated_finish']
    input_data['year_of_commencement'] = input_data['actual_commencement_date'].dt.year
    input_data['month_of_commencement'] = input_data['actual_commencement_date'].dt.month
    input_data['year_of_estimated_finish'] = input_data['estimated_finish_date'].dt.year
    input_data['month_of_estimated_finish'] = input_data['estimated_finish_date'].dt.month
    input_data['days_exceeding_estimated_duration'] = input_data['duration_since_commencement'] - input_data['duration_until_estimated_finish']
    input_data['is_delayed'] = (input_data['remaining_duration'] < 0).astype(int)

    # Selecting the relevant features
    input_features = input_data[feature_columns]

    # Making the prediction using the trained model
    predicted_label = gbm_model.predict(input_features)[0]

    # Getting the predicted stage and the recommended materials
    predicted_stage = label_to_stage_mapping[predicted_label]
    recommended_materials = materials_data.loc[materials_data['Activity'].str.contains(predicted_stage, case=False, na=False), 'Materials (suggestions)'].values[0]

    # Returning the results
    return {
        "Predicted Stage": predicted_stage,
        "Recommended Materials": recommended_materials
    }

# Testing the prediction function with a sample input
test_input = {
    "actual_commencement_date": "2022-03-22",
    "estimated_finish_date": "2024-12-31",
}

In [40]:
import pandas as pd
from datetime import datetime
def predict_current_stage(inputs, feature_columns):
    try:
        # Creating a data frame from the inputs
        input_data = pd.DataFrame([inputs])

        # Check if necessary columns are present
        for column in ['actual_commencement_date', 'estimated_finish_date']:
            if column not in input_data.columns:
                raise ValueError(f"Input is missing the {column} column.")

        # Convert date columns to datetime objects and create new features
        input_data['actual_commencement_date'] = pd.to_datetime(input_data['actual_commencement_date'])
        input_data['estimated_finish_date'] = pd.to_datetime(input_data['estimated_finish_date'])
        current_date = datetime.now()
        input_data['duration_until_estimated_finish'] = (input_data['estimated_finish_date'] - input_data['actual_commencement_date']).dt.days
        input_data['duration_since_commencement'] = (current_date - input_data['actual_commencement_date']).dt.days
        input_data['remaining_duration'] = (input_data['estimated_finish_date'] - current_date).dt.days
        input_data['progress_ratio'] = input_data['duration_since_commencement'] / input_data['duration_until_estimated_finish']
        input_data['year_of_commencement'] = input_data['actual_commencement_date'].dt.year
        input_data['month_of_commencement'] = input_data['actual_commencement_date'].dt.month
        input_data['year_of_estimated_finish'] = input_data['estimated_finish_date'].dt.year
        input_data['month_of_estimated_finish'] = input_data['estimated_finish_date'].dt.month
        input_data['days_exceeding_estimated_duration'] = input_data['duration_since_commencement'] - input_data['duration_until_estimated_finish']
        input_data['is_delayed'] = (input_data['remaining_duration'] < 0).astype(int)

        # Select the relevant features
        input_features = input_data[feature_columns]

        # Make the prediction using the trained model to get the probability of each stage
        predicted_probs = gbm_model.predict_proba(input_features)[0]

        # Get all possible stages and the recommended materials for each stage, ordered by probability
        predictions = []
        for i, prob in enumerate(predicted_probs):
            stage = label_to_stage_mapping.get(i, "Unknown Stage")
            recommended_materials = materials_data.loc[materials_data['Activity'].str.contains(stage, case=False, na=False), 'Materials (suggestions)']
            recommended_materials = recommended_materials.values[0] if not recommended_materials.empty else "No materials suggested"
            predictions.append({
                "Stage": stage,
                "Probability": prob,
                "Recommended Materials": recommended_materials
            })

        # Sort the predictions by probability in descending order
        predictions = sorted(predictions, key=lambda x: x['Probability'], reverse=True)

        # Return the results
        return predictions

    except Exception as e:
        print(f"Error predicting current stage: {e}")
        return []


In [41]:
feature_columns = [
    'duration_until_estimated_finish', 'duration_since_commencement', 'remaining_duration',
    'progress_ratio', 'year_of_commencement', 'month_of_commencement',
    'year_of_estimated_finish', 'month_of_estimated_finish',
    'days_exceeding_estimated_duration', 'is_delayed'
]


In [42]:
test_input = {
    "actual_commencement_date": "2023-01-01",
    "estimated_finish_date": "2023-12-31",
}

results = predict_current_stage(test_input, feature_columns)
print(results)


[{'Stage': 'Tiles work', 'Probability': 0.7562216961378654, 'Recommended Materials': 'Tiles'}, {'Stage': 'Painting and Finishing', 'Probability': 0.24309785840189183, 'Recommended Materials': 'paint, putty,primer'}, {'Stage': 'Plastering on outer sides', 'Probability': 0.00034507960045352233, 'Recommended Materials': 'cement, sand'}, {'Stage': 'Handover', 'Probability': 5.3713738533198076e-05, 'Recommended Materials': 'No materials suggested'}, {'Stage': 'Electrical concealed, PVC Fitting, plastering at ground floor', 'Probability': 4.9627991661802246e-05, 'Recommended Materials': 'cement, sand,  circuit pipe, Cpvc,&Pvc pipe'}, {'Stage': 'Cleaning & survey', 'Probability': 3.6924559934658875e-05, 'Recommended Materials': nan}, {'Stage': '3rd Floor slab casting', 'Probability': 3.123673008364434e-05, 'Recommended Materials': 'TMT bar ,cement, sand, aggregates'}, {'Stage': '4th Floor slab casting', 'Probability': 2.7388978265419757e-05, 'Recommended Materials': 'TMT bar ,cement, sand, ag

  recommended_materials = materials_data.loc[materials_data['Activity'].str.contains(stage, case=False, na=False), 'Materials (suggestions)']


In [43]:
from pandas.tseries.offsets import MonthEnd
import numpy as np
import pandas as pd

def predict_current_stage(inputs, feature_columns, gbm_model, label_to_stage_mapping, materials_data):
    try:
        # Creating a dataframe from the inputs
        input_data = pd.DataFrame([inputs])

        # Convert date columns to datetime objects
        input_data['actual_commencement_date'] = pd.to_datetime(input_data['actual_commencement_date'])
        input_data['estimated_finish_date'] = pd.to_datetime(input_data['estimated_finish_date'])

        # Initialize features that do not change within the loop
        input_data['duration_until_estimated_finish'] = (input_data['estimated_finish_date'] - input_data['actual_commencement_date']).dt.days
        input_data['year_of_commencement'] = input_data['actual_commencement_date'].dt.year
        input_data['month_of_commencement'] = input_data['actual_commencement_date'].dt.month
        input_data['year_of_estimated_finish'] = input_data['estimated_finish_date'].dt.year
        input_data['month_of_estimated_finish'] = input_data['estimated_finish_date'].dt.month

        # Create a list to store predictions for each month
        monthly_predictions = []

        # Loop over range of dates from the current date to the estimated finish date with a step size of one month
        current_date = pd.to_datetime("today")
        while current_date <= input_data['estimated_finish_date'].iloc[0]:
            # Update features based on the current date
            input_data['duration_since_commencement'] = (current_date - input_data['actual_commencement_date']).dt.days
            input_data['remaining_duration'] = (input_data['estimated_finish_date'] - current_date).dt.days
            input_data['progress_ratio'] = input_data['duration_since_commencement'] / input_data['duration_until_estimated_finish']
            input_data['days_exceeding_estimated_duration'] = input_data['duration_since_commencement'] - input_data['duration_until_estimated_finish']
            input_data['is_delayed'] = (input_data['remaining_duration'] < 0).astype(int)

            # Select relevant features
            input_features = input_data[feature_columns]

            # Predict using the trained model to get the probability of each stage
            predicted_probs = gbm_model.predict_proba(input_features)[0]

            # Get the most likely stage and the recommended materials
            top_prediction_index = np.argmax(predicted_probs)
            top_stage = label_to_stage_mapping[top_prediction_index]
            top_probability = predicted_probs[top_prediction_index]
            recommended_materials = materials_data.loc[materials_data['Activity'].str.contains(top_stage, case=False, na=False), 'Materials (suggestions)']
            recommended_materials = recommended_materials.values[0] if not recommended_materials.empty else "No materials suggested"

            # Append the prediction for the current date to the list of monthly predictions
            monthly_predictions.append({
                "Date": current_date.strftime('%Y-%m-%d'),
                "Stage": top_stage,
                "Probability": top_probability,
                "Recommended Materials": recommended_materials
            })

            # Move to the next month
            current_date = current_date + MonthEnd(1)

        return monthly_predictions

    except Exception as e:
        print(f"Error predicting current stage: {e}")
        return []

# Test the prediction function with a sample input
test_input = {
    "actual_commencement_date": "2023-08-21",
    "estimated_finish_date": "2024-12-03",
}

results = predict_current_stage(test_input, feature_columns, gbm_model, label_to_stage_mapping, materials_data)
print(results)


[{'Date': '2023-09-27', 'Stage': 'Raft footing, Column B1, Retaining wall Reinforcement ,Concrete pouring ', 'Probability': 0.9999997952300529, 'Recommended Materials': 'TMT bar ,cement, sand, aggregates'}, {'Date': '2023-09-30', 'Stage': 'Raft footing, Column B1, Retaining wall Reinforcement ,Concrete pouring ', 'Probability': 0.9999997910600721, 'Recommended Materials': 'TMT bar ,cement, sand, aggregates'}, {'Date': '2023-10-31', 'Stage': 'Ground Floor slab casting ', 'Probability': 0.9999986250059786, 'Recommended Materials': 'TMT bar ,cement, sand, aggregates ,Pvc pipes,circuit pipes,lightbox, fan box'}, {'Date': '2023-11-30', 'Stage': '2nd floor Columns casting ', 'Probability': 0.9999998738109648, 'Recommended Materials': 'TMT bar ,cement, sand, aggregates'}, {'Date': '2023-12-31', 'Stage': 'Brick work at Basement to Ground Floor ', 'Probability': 0.9999993383237755, 'Recommended Materials': ' blocks/bricks'}, {'Date': '2024-01-31', 'Stage': '3rd Floor slab casting', 'Probability

In [58]:
import numpy as np
import pandas as pd
from pandas.tseries.offsets import MonthEnd
import boto3
from io import StringIO

def predict_current_stage(inputs, feature_columns, gbm_model, label_to_stage_mapping, materials_data):
    try:
        # Creating a dataframe from the inputs
        input_data = pd.DataFrame([inputs])

        # Convert date columns to datetime objects
        input_data['actual_commencement_date'] = pd.to_datetime(input_data['actual_commencement_date'])
        input_data['estimated_finish_date'] = pd.to_datetime(input_data['estimated_finish_date'])

        # Initialize features that do not change within the loop
        input_data['duration_until_estimated_finish'] = (input_data['estimated_finish_date'] - input_data['actual_commencement_date']).dt.days
        input_data['year_of_commencement'] = input_data['actual_commencement_date'].dt.year
        input_data['month_of_commencement'] = input_data['actual_commencement_date'].dt.month
        input_data['year_of_estimated_finish'] = input_data['estimated_finish_date'].dt.year
        input_data['month_of_estimated_finish'] = input_data['estimated_finish_date'].dt.month

        # Create a list to store predictions for each month
        monthly_predictions = []

        # Loop over range of dates from the current date to the estimated finish date with a step size of one month
        current_date = pd.to_datetime("today")
        while current_date <= input_data['estimated_finish_date'].iloc[0]:
            # Update features based on the current date
            input_data['duration_since_commencement'] = (current_date - input_data['actual_commencement_date']).dt.days
            input_data['remaining_duration'] = (input_data['estimated_finish_date'] - current_date).dt.days
            input_data['progress_ratio'] = input_data['duration_since_commencement'] / input_data['duration_until_estimated_finish']
            input_data['days_exceeding_estimated_duration'] = input_data['duration_since_commencement'] - input_data['duration_until_estimated_finish']
            input_data['is_delayed'] = (input_data['remaining_duration'] < 0).astype(int)

            # Select relevant features
            input_features = input_data[feature_columns]

            # Predict using the trained model to get the probability of each stage
            predicted_probs = gbm_model.predict_proba(input_features)[0]

            # Get the most likely stage and the recommended materials
            top_prediction_index = np.argmax(predicted_probs)
            top_stage = label_to_stage_mapping[top_prediction_index]
            top_probability = predicted_probs[top_prediction_index]
            recommended_materials = materials_data.loc[materials_data['Activity'].str.contains(top_stage, case=False, na=False), 'Materials (suggestions)']
            recommended_materials = recommended_materials.values[0] if not recommended_materials.empty else "No materials suggested"

            # Append the prediction for the current date to the list of monthly predictions
            monthly_predictions.append({
                "Date": current_date.strftime('%Y-%m-%d'),
                "Stage": top_stage,
                "Probability": top_probability,
                "Recommended Materials": recommended_materials
            })

            # Move to the next month
            current_date = current_date + MonthEnd(1)

        return monthly_predictions

    except Exception as e:
        print(f"Error predicting current stage: {e}")
        return []

test_input = {
    "actual_commencement_date": "2023-06-28",
    "estimated_finish_date": "2024-12-03",
}

results = predict_current_stage(test_input, feature_columns, gbm_model, label_to_stage_mapping, materials_data)
df_results = pd.DataFrame(results)



In [59]:
csv_buffer = StringIO()
df_results.to_csv(csv_buffer, index=False)
def upload_buffer_to_s3(bucket_name, file_name, buffer):
    try:
        s3 = boto3.resource('s3')
        s3.Object(bucket_name, file_name).put(Body=buffer.getvalue())
        print(f"File {file_name} uploaded successfully to {bucket_name}.")
    except Exception as e:
        print(f"Error uploading buffer to S3: {e}")


In [60]:
bucket_name = "demomlbucket202"   # Replace with your S3 bucket name
file_name_in_s3 = "foryouvijitamam_results.csv"
upload_buffer_to_s3(bucket_name, file_name_in_s3, csv_buffer)


File foryouvijitamam_results.csv uploaded successfully to demomlbucket202.
