In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.multioutput import MultiOutputClassifier

# Load the dataset
df = pd.read_csv(r"C:\Users\ragul\OneDrive\Desktop\hack\AI_Revenue_Leakage_Detection\model\Telecom\dataset\telecom_billing_dataset.csv")

# Separate features and target variables
X = df.drop(columns=['Anomaly_type', 'Leakage'])
y = df[['Anomaly_type', 'Leakage']].copy()

# Handle any potential missing values in target columns
y['Anomaly_type'] = y['Anomaly_type'].fillna('Unknown')
y['Leakage'] = y['Leakage'].fillna('Unknown')

# Convert date columns to datetime objects and extract features
for col in ['Billing_date', 'Plan_start_date', 'Plan_end_date']:
    X[col] = pd.to_datetime(X[col], dayfirst=True)
    X[col + '_year'] = X[col].dt.year
    X[col + '_month'] = X[col].dt.month
    X[col + '_day'] = X[col].dt.day
X = X.drop(columns=['Billing_date', 'Plan_start_date', 'Plan_end_date'])

# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns


# Preprocessing pipelines for numerical and categorical features
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Create a preprocessor object using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Encode the target labels
le_anomaly = LabelEncoder()
le_leakage = LabelEncoder()
y.loc[:, 'Anomaly_type'] = le_anomaly.fit_transform(y['Anomaly_type'])
y.loc[:, 'Leakage'] = le_leakage.fit_transform(y['Leakage'])


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
base_model = RandomForestClassifier(n_estimators=100, random_state=42)
model = MultiOutputClassifier(base_model)


# Create the full pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', model)])

# --- FIX IS HERE ---
# Train the model using y_train.values to pass a NumPy array
pipeline.fit(X_train, y_train.values)
# --- END OF FIX ---


# Make predictions
y_pred = pipeline.predict(X_test)

# Convert predictions back to original labels for interpretation
y_pred_df = pd.DataFrame(y_pred, columns=['Anomaly_type', 'Leakage'])
y_pred_df['Anomaly_type'] = le_anomaly.inverse_transform(y_pred_df['Anomaly_type'])
y_test_df = pd.DataFrame(y_test).reset_index(drop=True)
y_test_df['Anomaly_type'] = le_anomaly.inverse_transform(y_test['Anomaly_type'])
y_pred_df['Leakage'] = le_leakage.inverse_transform(y_pred_df['Leakage'])
y_test_df['Leakage'] = le_leakage.inverse_transform(y_test['Leakage'])

# Evaluate the model for 'Anomaly_type'
print("Evaluation for 'Anomaly_type':")
print(classification_report(y_test_df['Anomaly_type'], y_pred_df['Anomaly_type']))
print("Accuracy for 'Anomaly_type':", accuracy_score(y_test_df['Anomaly_type'], y_pred_df['Anomaly_type']))


# Evaluate the model for 'Leakage'
print("\nEvaluation for 'Leakage':")
print(classification_report(y_test_df['Leakage'], y_pred_df['Leakage']))
print("Accuracy for 'Leakage':", accuracy_score(y_test_df['Leakage'], y_pred_df['Leakage']))

ValueError: Unknown label type: unknown. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier

# Load the dataset
df = pd.read_csv(r"C:\Users\ragul\OneDrive\Desktop\hack\AI_Revenue_Leakage_Detection\model\Telecom\dataset\telecom_billing_dataset.csv")
df['Invoice_Num_Int'] = df['Invoice_number'].str.replace("INV", "").astype(int)
df = df.sort_values(by='Invoice_Num_Int').reset_index(drop=True)

# Step 3: Create Is_Duplicate flag (check previous/next after sorting)
df['Is_Duplicate'] = (
    (df['Invoice_number'] == df['Invoice_number'].shift(1)) | 
    (df['Invoice_number'] == df['Invoice_number'].shift(-1))
).astype(int)
# --- ROBUST DATA PREPARATION ---
# Separate features (X) and targets (y)
X = df.drop(columns=['Anomaly_type', 'Leakage'])
y = df[['Anomaly_type', 'Leakage']].copy()

# 1. Handle potential missing values in target columns first
y['Anomaly_type'] = y['Anomaly_type'].fillna('Unknown')
y['Leakage'] = y['Leakage'].fillna('Unknown')

# 2. Ensure target columns are of string type before encoding
y['Anomaly_type'] = y['Anomaly_type'].astype(str)
y['Leakage'] = y['Leakage'].astype(str)
# --- END OF PREPARATION ---

# Feature engineering for date columns
for col in ['Billing_date', 'Plan_start_date', 'Plan_end_date']:
    X[col] = pd.to_datetime(X[col], dayfirst=True)
    X[col + '_year'] = X[col].dt.year
    X[col + '_month'] = X[col].dt.month
    X[col + '_day'] = X[col].dt.day
X = X.drop(columns=['Billing_date', 'Plan_start_date', 'Plan_end_date'])

# Identify categorical and numerical features after date processing
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# Create preprocessing pipelines
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Encode the target labels
le_anomaly = LabelEncoder()
le_leakage = LabelEncoder()
y['Anomaly_type'] = le_anomaly.fit_transform(y['Anomaly_type'])
y['Leakage'] = le_leakage.fit_transform(y['Leakage'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- DEFINE THE XGBOOST MODEL ---
# XGBoost is a powerful gradient boosting model
base_model = XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False, random_state=42)
model = MultiOutputClassifier(base_model)
# --- END OF MODEL DEFINITION ---

# Create and train the full pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', model)])

pipeline.fit(X_train, y_train)

# Make and evaluate predictions
y_pred = pipeline.predict(X_test)

# Inverse transform labels for clear reporting
y_pred_df = pd.DataFrame(y_pred, columns=['Anomaly_type', 'Leakage'])
y_pred_df['Anomaly_type'] = le_anomaly.inverse_transform(y_pred_df['Anomaly_type'])
y_test_df = pd.DataFrame(y_test.values, columns=['Anomaly_type', 'Leakage'])
y_test_df['Anomaly_type'] = le_anomaly.inverse_transform(y_test_df['Anomaly_type'])
y_pred_df['Leakage'] = le_leakage.inverse_transform(y_pred_df['Leakage'])
y_test_df['Leakage'] = le_leakage.inverse_transform(y_test_df['Leakage'])


# --- DISPLAY RESULTS ---
print("--- Evaluation for 'Anomaly_type' ---")
print(classification_report(y_test_df['Anomaly_type'], y_pred_df['Anomaly_type']))
print(f"Accuracy for 'Anomaly_type': {accuracy_score(y_test_df['Anomaly_type'], y_pred_df['Anomaly_type']):.4f}")

print("\n--- Evaluation for 'Leakage' ---")
print(classification_report(y_test_df['Leakage'], y_pred_df['Leakage']))
print(f"Accuracy for 'Leakage': {accuracy_score(y_test_df['Leakage'], y_pred_df['Leakage']):.4f}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- Evaluation for 'Anomaly_type' ---
                   precision    recall  f1-score   support

Duplicate entries       1.00      1.00      1.00       125
   Excess payment       1.00      1.00      1.00       137
 Extra data usage       1.00      0.99      1.00       106
  Missing charges       1.00      0.99      1.00       153
       No anomaly       0.99      1.00      0.99      1198
    Under payment       1.00      1.00      1.00       174
   Usage mismatch       1.00      0.88      0.94       107

         accuracy                           0.99      2000
        macro avg       1.00      0.98      0.99      2000
     weighted avg       0.99      0.99      0.99      2000

Accuracy for 'Anomaly_type': 0.9925

--- Evaluation for 'Leakage' ---
              precision    recall  f1-score   support

          No       0.99      1.00      0.99      1198
         Yes       1.00      0.98      0.99       802

    accuracy                           0.99      2000
   macro avg       0.9

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier

# Load the dataset
df = pd.read_csv(r"C:\Users\ragul\OneDrive\Desktop\hack\AI_Revenue_Leakage_Detection\model\Telecom\dataset\telecom_billing_dataset.csv")

# --- ROBUST DATA PREPARATION ---
X = df.drop(columns=['Anomaly_type', 'Leakage'])
y = df[['Anomaly_type', 'Leakage']].copy()

# 1. Handle potential missing values in target columns
y['Anomaly_type'] = y['Anomaly_type'].fillna('Unknown')
y['Leakage'] = y['Leakage'].fillna('Unknown')

# 2. Ensure target columns are of string type before encoding
y['Anomaly_type'] = y['Anomaly_type'].astype(str)
y['Leakage'] = y['Leakage'].astype(str)

# --- FIX: REMOVE LEAKY FEATURES ---
# These columns are too direct and "leak" information about the target
X = X.drop(columns=['Billed_amount', 'Paid_amount', 'Balance_amount'])
# --- END OF FIX ---


# Feature engineering for date columns
for col in ['Billing_date', 'Plan_start_date', 'Plan_end_date']:
    X[col] = pd.to_datetime(X[col], dayfirst=True)
    X[col + '_year'] = X[col].dt.year
    X[col + '_month'] = X[col].dt.month
    X[col + '_day'] = X[col].dt.day
X = X.drop(columns=['Billing_date', 'Plan_start_date', 'Plan_end_date'])

# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# Create preprocessing pipelines
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Encode the target labels
le_anomaly = LabelEncoder()
le_leakage = LabelEncoder()
y['Anomaly_type'] = le_anomaly.fit_transform(y['Anomaly_type'])
y['Leakage'] = le_leakage.fit_transform(y['Leakage'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the XGBoost model
base_model = XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False, random_state=42)
model = MultiOutputClassifier(base_model)

# Create and train the full pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', model)])

pipeline.fit(X_train, y_train)

# Make and evaluate predictions
y_pred = pipeline.predict(X_test)

# Inverse transform labels for clear reporting
y_pred_df = pd.DataFrame(y_pred, columns=['Anomaly_type', 'Leakage'])
y_pred_df['Anomaly_type'] = le_anomaly.inverse_transform(y_pred_df['Anomaly_type'])
y_test_df = pd.DataFrame(y_test.values, columns=['Anomaly_type', 'Leakage'])
y_test_df['Anomaly_type'] = le_anomaly.inverse_transform(y_test_df['Anomaly_type'])
y_pred_df['Leakage'] = le_leakage.inverse_transform(y_pred_df['Leakage'])
y_test_df['Leakage'] = le_leakage.inverse_transform(y_test_df['Leakage'])

# --- DISPLAY RESULTS ---
print("--- Evaluation for 'Anomaly_type' ---")
print(classification_report(y_test_df['Anomaly_type'], y_pred_df['Anomaly_type']))
print(f"Accuracy for 'Anomaly_type': {accuracy_score(y_test_df['Anomaly_type'], y_pred_df['Anomaly_type']):.4f}")

print("\n--- Evaluation for 'Leakage' ---")
print(classification_report(y_test_df['Leakage'], y_pred_df['Leakage']))
print(f"Accuracy for 'Leakage': {accuracy_score(y_test_df['Leakage'], y_pred_df['Leakage']):.4f}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- Evaluation for 'Anomaly_type' ---
                   precision    recall  f1-score   support

Duplicate entries       1.00      1.00      1.00       112
   Excess payment       0.33      0.03      0.06       129
 Extra data usage       1.00      0.98      0.99       128
  Missing charges       0.19      0.02      0.04       166
       No anomaly       0.80      0.98      0.89      1203
    Under payment       1.00      1.00      1.00       152
   Usage mismatch       1.00      0.96      0.98       110

         accuracy                           0.84      2000
        macro avg       0.76      0.71      0.71      2000
     weighted avg       0.77      0.84      0.79      2000

Accuracy for 'Anomaly_type': 0.8440

--- Evaluation for 'Leakage' ---
              precision    recall  f1-score   support

          No       0.81      0.95      0.87      1203
         Yes       0.89      0.67      0.76       797

    accuracy                           0.84      2000
   macro avg       0.8

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier

# --- IMPORTANT: Import from imblearn ---
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline


# Load the dataset
df = pd.read_csv(r"C:\Users\ragul\OneDrive\Desktop\hack\AI_Revenue_Leakage_Detection\model\Telecom\dataset\telecom_billing_dataset.csv")

# --- DATA PREPARATION ---
X = df.drop(columns=['Anomaly_type', 'Leakage'])
y = df[['Anomaly_type', 'Leakage']].copy()

# Handle potential missing values and ensure string type
y['Anomaly_type'] = y['Anomaly_type'].fillna('Unknown').astype(str)
y['Leakage'] = y['Leakage'].fillna('Unknown').astype(str)

# Remove leaky features as before
X = X.drop(columns=['Billed_amount', 'Paid_amount', 'Balance_amount'])

# Feature engineering for date columns
for col in ['Billing_date', 'Plan_start_date', 'Plan_end_date']:
    X[col] = pd.to_datetime(X[col], dayfirst=True)
    X[col + '_year'] = X[col].dt.year
    X[col + '_month'] = X[col].dt.month
    X[col + '_day'] = X[col].dt.day
X = X.drop(columns=['Billing_date', 'Plan_start_date', 'Plan_end_date'])

# --- LABEL POWERSET TRANSFORMATION ---
# Combine the two target columns into a single target for SMOTE
y_combined = y['Anomaly_type'] + '_' + y['Leakage']
le_combined = LabelEncoder()
y_encoded = le_combined.fit_transform(y_combined)


# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# Create preprocessing pipelines for features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# --- CREATE PIPELINE WITH SMOTE AND XGBOOST ---
# Note: We are using the Pipeline from imblearn
model = XGBClassifier(objective='multi:softprob', eval_metric='mlogloss', use_label_encoder=False, random_state=42)
smote = SMOTE(random_state=42)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('smote', smote),
                           ('classifier', model)])

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred_encoded = pipeline.predict(X_test)

# --- REVERSE THE TRANSFORMATION FOR EVALUATION ---
# Convert predictions back to the combined string labels
y_pred_combined = le_combined.inverse_transform(y_pred_encoded)

# Split the combined labels back into two separate columns
y_pred_split = pd.DataFrame(y_pred_combined, columns=['combined']).combined.str.split('_', expand=True)
y_pred_anomaly = y_pred_split[0]
y_pred_leakage = y_pred_split[1]

# Do the same for the true test labels
y_test_combined = le_combined.inverse_transform(y_test)
y_test_split = pd.DataFrame(y_test_combined, columns=['combined']).combined.str.split('_', expand=True)
y_test_anomaly = y_test_split[0]
y_test_leakage = y_test_split[1]


# --- DISPLAY RESULTS ---
print("--- Evaluation for 'Anomaly_type' ---")
print(classification_report(y_test_anomaly, y_pred_anomaly))
print(f"Accuracy for 'Anomaly_type': {accuracy_score(y_test_anomaly, y_pred_anomaly):.4f}")

print("\n--- Evaluation for 'Leakage' ---")
print(classification_report(y_test_leakage, y_pred_leakage))
print(f"Accuracy for 'Leakage': {accuracy_score(y_test_leakage, y_pred_leakage):.4f}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- Evaluation for 'Anomaly_type' ---
                   precision    recall  f1-score   support

Duplicate entries       1.00      1.00      1.00       120
   Excess payment       0.00      0.00      0.00       120
 Extra data usage       1.00      0.98      0.99       120
  Missing charges       0.20      0.01      0.01       160
       No anomaly       0.80      1.00      0.89      1200
    Under payment       1.00      1.00      1.00       160
   Usage mismatch       1.00      0.92      0.96       120

         accuracy                           0.85      2000
        macro avg       0.71      0.70      0.69      2000
     weighted avg       0.76      0.85      0.79      2000

Accuracy for 'Anomaly_type': 0.8520

--- Evaluation for 'Leakage' ---
              precision    recall  f1-score   support

          No       0.80      1.00      0.89      1200
         Yes       0.99      0.64      0.78       800

    accuracy                           0.85      2000
   macro avg       0.9