# In [None]:
import pickle

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Train a theft classifier: load the CSV, impute and encode features, tune a
# random forest with cross-validated grid search, and pickle the best
# scaler + classifier pipeline.

# Load dataset (the path is environment-specific; adjust it when running
# outside the session that provides /content/)
df = pd.read_csv('/content/electricity_new.csv')

# Step 1: Handle missing values
# Medians only exist for numeric columns. numeric_only=True keeps string
# columns (e.g. 'TimeOfDay') from raising a TypeError on pandas >= 2.0.
df = df.fillna(df.median(numeric_only=True))

# Step 2: Encode categorical variables
label_encoder = LabelEncoder()
if 'TimeOfDay' in df.columns:
    # The numeric median fill above does not touch a non-numeric column, so
    # fill its gaps with the most frequent category before encoding.
    time_of_day_mode = df['TimeOfDay'].mode()
    if not time_of_day_mode.empty:
        df['TimeOfDay'] = df['TimeOfDay'].fillna(time_of_day_mode.iloc[0])
    df['TimeOfDay'] = label_encoder.fit_transform(df['TimeOfDay'])
    # NOTE: label_encoder is not part of the pickled pipeline below; inference
    # code must apply the same 'TimeOfDay' encoding before calling predict.

# Step 3: Separate features and target variable
X = df.drop('Theft', axis=1)
y = df['Theft']

# Step 4: Split dataset into training and testing sets
# Split BEFORE any resampling so the test set contains only real rows and no
# synthetic sample is derived from a test row. stratify=y keeps the class
# ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 5: Address class imbalance (if any)
# The sampler is placed inside the pipeline (Step 6) instead of being applied
# up front: that way it is fit only on the training portion of each CV fold.
smote = SMOTE(random_state=42)

# Step 6: Create a pipeline with resampling, scaling and model training
# imblearn's Pipeline runs samplers during fit only; they are skipped when
# predicting or scoring.
pipeline = ImbPipeline([
    ('smote', smote),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Step 7: Hyperparameter tuning using GridSearchCV
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Report performance on the untouched test split (same 'f1' scorer as the search)
print(f"Held-out F1 score: {grid_search.score(X_test, y_test):.4f}")

# Best model from GridSearchCV
# Re-wrap the fitted scaler and classifier in a plain scikit-learn Pipeline so
# the saved artifact does not need imbalanced-learn to be loaded; the sampler
# plays no role at prediction time.
fitted_search_pipeline = grid_search.best_estimator_
best_pipeline = Pipeline([
    ('scaler', fitted_search_pipeline.named_steps['scaler']),
    ('classifier', fitted_search_pipeline.named_steps['classifier'])
])

# Step 8: Save the pipeline as a pickle file
with open('theft_prediction_pipeline.pkl', 'wb') as pipeline_file:
    pickle.dump(best_pipeline, pipeline_file)

print("Pipeline saved as a pickle file.")
