In [21]:
import pandas as pd
import numpy as np

# Load your dataset
data = pd.read_csv("/content/new_labeled_dataset.csv")  # Replace with actual dataset path

# Drop unnecessary columns
data = data.drop(columns=["Version", "Item Code", "Item Name","Category","Item ID", "Buyer ID", "Transaction ID", "Overall Revenue",
                          "Return Frequency", "Return Percentage", "Refund Ratio"], errors='ignore')


# Convert Date to datetime & derive 'Return Timing'
data['Date'] = pd.to_datetime(data['Date'], format='%Y-%m-%d')  # Adjust format if needed
current_date = pd.Timestamp.today()
data['Return Timing'] = (current_date - data['Date']).dt.days

# Drop the original Date column (since we now have 'Return Timing')
data = data.drop(columns=['Date'], errors='ignore')

# Ensure there are no string columns left (for model compatibility)
print(data.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70052 entries, 0 to 70051
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Final Quantity               70052 non-null  int64  
 1   Total Revenue                70052 non-null  float64
 2   Price Reductions             70052 non-null  float64
 3   Refunds                      70052 non-null  float64
 4   Final Revenue                70052 non-null  float64
 5   Sales Tax                    70052 non-null  float64
 6   Refunded Item Count          70052 non-null  int64  
 7   Purchased Item Count         70052 non-null  int64  
 8   Return Timing                70052 non-null  int64  
 9   Purchased to Refunded Ratio  70052 non-null  float64
 10  Discount Ratio               70052 non-null  float64
 11  Spam                         70052 non-null  int64  
dtypes: float64(7), int64(5)
memory usage: 6.4 MB
None


In [22]:
data.head()

Unnamed: 0,Final Quantity,Total Revenue,Price Reductions,Refunds,Final Revenue,Sales Tax,Refunded Item Count,Purchased Item Count,Return Timing,Purchased to Refunded Ratio,Discount Ratio,Spam
0,1,74.17,0.0,0.0,74.17,14.83,0,1,2118,0.0,0.0,0
1,-1,0.0,0.0,-79.17,-79.17,-15.83,-1,0,2177,-1000000.0,0.0,1
2,-1,0.0,0.0,-74.17,-74.17,-14.83,-1,0,2255,-1000000.0,0.0,1
3,1,79.17,0.0,0.0,79.17,15.83,0,1,2160,0.0,0.0,0
4,1,74.17,0.0,0.0,74.17,14.83,0,1,2257,0.0,0.0,0


In [23]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Define features and target
X = data.drop(columns=['Spam'])  # Features
y = data['Spam']  # Target variable (1 = Spam, 0 = Not Spam)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Train a basic Logistic Regression model
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.81      1.00      0.90      9631
           1       0.99      0.50      0.66      4380

    accuracy                           0.84     14011
   macro avg       0.90      0.75      0.78     14011
weighted avg       0.87      0.84      0.82     14011



In [47]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Split the data into features and target
X = data.drop(columns=['Spam'])  # Drop 'Spam' column to use as target
y = data['Spam']  # Target is the 'Spam' column

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to balance the class distribution by oversampling the minority class
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Apply Tomek Links to remove borderline samples
tomek = TomekLinks()
X_train_resampled, y_train_resampled = tomek.fit_resample(X_train_resampled, y_train_resampled)


In [48]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Initialize model
hgb_model = HistGradientBoostingClassifier(max_iter=100)

# Fit model
hgb_model.fit(X_train_resampled, y_train_resampled)

# Predict
y_hgb = hgb_model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_hgb))


              precision    recall  f1-score   support

           0       0.83      0.96      0.89      9675
           1       0.85      0.56      0.67      4336

    accuracy                           0.83     14011
   macro avg       0.84      0.76      0.78     14011
weighted avg       0.83      0.83      0.82     14011



In [49]:
from sklearn.metrics import confusion_matrix

conf_matrix = confusion_matrix(y_test, y_hgb)
print(conf_matrix)


[[9252  423]
 [1926 2410]]


In [53]:
import pandas as pd

# Example input data (ensure the input data has the same columns as the training data)
new_data = {
    'Final Quantity': [1],
    'Total Revenue': [74.17],
    'Price Reductions': [0],
    'Refunds': [0],
    'Final Revenue': [74.17],
    'Sales Tax': [0],
    'Refunded Item Count': [0],
    'Purchased Item Count': [1],
    'Return Timing': [5],
    'Purchased to Refunded Ratio': [1.0],
    'Discount Ratio': [0.0]
}

# Convert the input data into a DataFrame
input_df = pd.DataFrame(new_data)

# If you have any preprocessing steps (e.g., encoding), apply them here to input_df
# Example: if you need to scale your data, use the same scaler used during training
# input_df = scaler.transform(input_df)

# Predict using the trained model (hgb_model is the trained Histogram-Based Gradient Boosting model)
prediction = hgb_model.predict(input_df)

# Display the result (1 for spam, 0 for not spam)
if prediction[0] == 1:
    print("The input data is classified as SPAM.")
else:
    print("The input data is classified as NOT SPAM.")


The input data is classified as NOT SPAM.
