In [17]:
import pandas as pd

In [18]:
# Load the cleaned dataset from the 'data' directory
data = pd.read_csv('../data/cleaned_data.csv')

# Display the first few rows to confirm it loaded correctly
print(data.head())

# Get the list of columns to verify the cleaning
print(data.info())

    latitude  longitude  available  capacity  total  capacity_num   cpu_num  \
0  28.568238  77.219666        0.0      15.0    2.0          15.0  4.126608   
1  28.541995  77.260583        0.0       3.0    3.0           3.3  4.126608   
2  28.571189  77.259806        0.0      15.0    2.0          15.0  4.126608   
3  28.588991  77.253240        0.0      15.0    4.0          15.0  4.126608   
4  28.549427  77.254636        0.0      15.0    1.0          15.0  4.126608   

   supports_4w  supports_2w  n_vehicle_types  ...  vendor_name_Powerbank  \
0         True        False                1  ...                  False   
1         True         True                3  ...                  False   
2         True        False                1  ...                  False   
3         True        False                1  ...                  False   
4         True        False                1  ...                  False   

   vendor_name_Pvt. Ltd.  vendor_name_REIL  vendor_name_REVOS  \
0  

In [19]:
# Drop the target variable and any other columns you don't want to use as features
X = data.drop('power_type_DC', axis=1)
# The target variable you want to predict
y = data['power_type_DC']

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Initialize and train a Logistic Regression model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate and print evaluation metrics
print("Evaluation Metrics:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(f"Precision: {precision_score(y_test, y_pred):.2f}")
print(f"Recall: {recall_score(y_test, y_pred):.2f}")
print(f"F1-Score: {f1_score(y_test, y_pred):.2f}")

# Print the confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Evaluation Metrics:
Accuracy: 0.95
Precision: 0.77
Recall: 0.98
F1-Score: 0.86

Confusion Matrix:
[[437  23]
 [  2  79]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Separate numeric and categorical column names
numeric_features = ['latitude', 'longitude', 'available', 'capacity', 'total', 'capacity_num', 'cpu_num', 'n_vehicle_types']
categorical_features = ['supports_4w', 'supports_2w', 'vendor_name_BSES', 'vendor_name_BatterySmart',
                      'vendor_name_BluSmart', 'vendor_name_E-Fill Electric', 'vendor_name_EEE',
                      'vendor_name_EESL', 'vendor_name_ElectriVa', 'vendor_name_GensolCharge Pvt. Ltd.',
                      'vendor_name_HPCL', 'vendor_name_JBM Renewables', 'vendor_name_Jio-bp',
                      'vendor_name_PlugNgo', 'vendor_name_Powerbank', 'vendor_name_Pvt. Ltd.',
                      'vendor_name_REIL', 'vendor_name_REVOS', 'vendor_name_Smart E',
                      'vendor_name_Sun Mobility', 'vendor_name_TPDDL', 'vendor_name_Verdemobility',
                      'station_type_charging']

# Create a preprocessor to scale numeric features and leave categorical features untouched
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features)
    ],
    remainder='passthrough'
)

# Apply the preprocessor to your training and test data
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

# You can now train your models on the scaled data
# For example: model.fit(X_train_scaled, y_train)

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define a dictionary of classification models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000,random_state=42),
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=42),
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "AdaBoost Classifier": AdaBoostClassifier(random_state=42),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42)
}

# Loop through each model and evaluate its performance
for name, model in models.items():
    # Train the model
    model.fit(X_train_scaled, y_train)

    # Make predictions on the test set
    y_test_pred = model.predict(X_test_scaled)

    # Calculate and print metrics
    print(f"--- {name} ---")
    print(f"Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_test_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_test_pred):.4f}")
    print(f"F1-Score: {f1_score(y_test, y_test_pred):.4f}")
    print("-" * 30)

--- Logistic Regression ---
Accuracy: 0.9538
Precision: 0.7745
Recall: 0.9753
F1-Score: 0.8634
------------------------------
--- Decision Tree Classifier ---
Accuracy: 0.9020
Precision: 0.7000
Recall: 0.6049
F1-Score: 0.6490
------------------------------
--- Random Forest Classifier ---
Accuracy: 0.9464
Precision: 0.7766
Recall: 0.9012
F1-Score: 0.8343
------------------------------
--- K-Neighbors Classifier ---
Accuracy: 0.9538
Precision: 0.7917
Recall: 0.9383
F1-Score: 0.8588
------------------------------
--- AdaBoost Classifier ---
Accuracy: 0.9593
Precision: 0.7980
Recall: 0.9753
F1-Score: 0.8778
------------------------------
--- Gradient Boosting Classifier ---
Accuracy: 0.9538
Precision: 0.7917
Recall: 0.9383
F1-Score: 0.8588
------------------------------


In [24]:
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Load the cleaned dataset
data = pd.read_csv('../data/cleaned_data.csv')

# Define features (X) and target (y)
X = data.drop('power_type_DC', axis=1)
y = data['power_type_DC']

# Identify numeric and categorical features (same as before)
numeric_features = ['latitude', 'longitude', 'available', 'capacity', 'total', 'capacity_num', 'cpu_num', 'n_vehicle_types']
categorical_features = ['supports_4w', 'supports_2w', 'vendor_name_BSES', 'vendor_name_BatterySmart',
                      'vendor_name_BluSmart', 'vendor_name_E-Fill Electric', 'vendor_name_EEE',
                      'vendor_name_EESL', 'vendor_name_ElectriVa', 'vendor_name_GensolCharge Pvt. Ltd.',
                      'vendor_name_HPCL', 'vendor_name_JBM Renewables', 'vendor_name_Jio-bp',
                      'vendor_name_PlugNgo', 'vendor_name_Powerbank', 'vendor_name_Pvt. Ltd.',
                      'vendor_name_REIL', 'vendor_name_REVOS', 'vendor_name_Smart E',
                      'vendor_name_Sun Mobility', 'vendor_name_TPDDL', 'vendor_name_Verdemobility',
                      'station_type_charging']

# Apply the preprocessor and scale the entire dataset
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features)
    ],
    remainder='passthrough'
)
X_scaled = preprocessor.fit_transform(X)

# Train the final AdaBoost model on the entire scaled dataset
final_model = AdaBoostClassifier(random_state=42)
final_model.fit(X_scaled, y)

print("Final AdaBoost model has been trained on the full dataset.")

Final AdaBoost model has been trained on the full dataset.


In [25]:
import joblib
import os

# Create a 'models' directory if it doesn't exist
if not os.path.exists('models'):
    os.makedirs('models')

# Save the trained model to a file
joblib.dump(final_model, 'models/socket_prediction/artifacts/adaboost_model.pkl')

# It's also critical to save the preprocessor!
# You must use the same preprocessor to transform new data before making predictions.
joblib.dump(preprocessor, 'models/preprocessor.pkl')

print("Model and preprocessor have been successfully saved.")

FileNotFoundError: [Errno 2] No such file or directory: 'models/socket_prediction/artifacts/adaboost_model.pkl'