In [1]:
# CODTECH Internship Task-3
## End to End Data Science Project Deployment
#Name:** Shaunak Damodar Sinai Kunde  
#Internship:** CodTech IT Solutions Pvt Ltd  
#Task:** Data Preprocessing, Transformation & Loading (ETL) Pipeline

In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from datetime import timedelta
import numpy as np
import os

# --- 1. Data Loading and Preprocessing ---
print("Step 1: Loading and preprocessing data...")

# The file name is provided from the user's upload.
file_path = "C://Users//kunde//Desktop//Virtual internship//CodTech IT Solutions Pvt Ltd//CodTech IT Solutions Pvt Ltd Internship//Task-3 End to end Data Science Project - Copy//Goa Power Outage Report June 2025.xlsx"
try:
    df = pd.read_excel(file_path)
    print(f"Successfully loaded data from {file_path}")
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
    exit()

# Select the features and the target variable as per the request.
features = ['Town Name', 'Substation', 'Feeder Name', 'Rural/Urban']
target = 'Average_Hours_of_Steady_Supply'
df = df[features + [target]]

# Clean up 'Substation' and 'Feeder Name' which may contain newline characters
df['Substation'] = df['Substation'].str.replace('\n', ' ', regex=False).str.strip()
df['Feeder Name'] = df['Feeder Name'].str.replace('\n', ' ', regex=False).str.strip()

# Handle missing values in features with a placeholder
for col in features:
    df[col].fillna('Unknown', inplace=True)

# Convert the target variable (HH:MM:SS) to total seconds for the model to use.
# This makes it a numerical regression problem.
def time_to_seconds(time_str):
    if pd.isna(time_str):
        return 0
    try:
        # Pad with 0s if necessary to ensure HH:MM:SS format
        parts = [p.zfill(2) for p in str(time_str).split(':')]
        h, m, s = 0, 0, 0
        if len(parts) == 3:
            h, m, s = int(parts[0]), int(parts[1]), int(parts[2])
        elif len(parts) == 2:
            m, s = int(parts[0]), int(parts[1])
        elif len(parts) == 1:
            s = int(parts[0])
        return h * 3600 + m * 60 + s
    except (ValueError, IndexError):
        # Handle invalid time strings by returning 0 or an appropriate value
        return 0

df[target] = df[target].apply(time_to_seconds)

# Define X (features) and y (target)
X = df[features]
y = df[target]

# Identify categorical features for one-hot encoding
categorical_features = ['Town Name', 'Substation', 'Feeder Name', 'Rural/Urban']

# Create a preprocessor using ColumnTransformer with OneHotEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

# Fit the preprocessor on the data
X_processed = preprocessor.fit_transform(X)
print("Data preprocessing complete.")

# --- 2. Model Training and Evaluation ---
print("Step 2: Training the machine learning model...")

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Initialize and train a RandomForestRegressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model (optional but good practice)
score = model.score(X_test, y_test)
print(f"Model training complete. R^2 Score on test set: {score:.2f}")

# --- 3. Save the Model and Preprocessor ---
print("Step 3: Saving the model and preprocessor...")

# Save the trained model and the preprocessor to a .pkl file
joblib.dump(model, 'model.pkl')
joblib.dump(preprocessor, 'preprocessor.pkl')

print("Model saved to 'model.pkl' and preprocessor saved to 'preprocessor.pkl'.")
print("You can now use 'app.py' to host the model with Streamlit.")


Step 1: Loading and preprocessing data...
Successfully loaded data from C://Users//kunde//Desktop//Virtual internship//CodTech IT Solutions Pvt Ltd//CodTech IT Solutions Pvt Ltd Internship//Task-3 End to end Data Science Project - Copy//Goa Power Outage Report June 2025.xlsx
Data preprocessing complete.
Step 2: Training the machine learning model...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Unknown', inplace=True)


Model training complete. R^2 Score on test set: -0.06
Step 3: Saving the model and preprocessor...
Model saved to 'model.pkl' and preprocessor saved to 'preprocessor.pkl'.
You can now use 'app.py' to host the model with Streamlit.
