In [None]:
import pandas as pd
import numpy as np

data = pd.read_csv("/content/new_labeled_dataset.csv")

data = data.drop(columns=["Version", "Item Code", "Item Name","Category","Item ID", "Buyer ID", "Transaction ID", "Overall Revenue",
                          "Return Frequency", "Return Percentage", "Refund Ratio"], errors='ignore')

data['Date'] = pd.to_datetime(data['Date'], format='%Y-%m-%d')
current_date = pd.Timestamp.today()
data['Return Timing'] = (current_date - data['Date']).dt.days

data = data.drop(columns=['Date'], errors='ignore')

print(data.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70052 entries, 0 to 70051
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Final Quantity               70052 non-null  int64  
 1   Total Revenue                70052 non-null  float64
 2   Price Reductions             70052 non-null  float64
 3   Refunds                      70052 non-null  float64
 4   Final Revenue                70052 non-null  float64
 5   Sales Tax                    70052 non-null  float64
 6   Refunded Item Count          70052 non-null  int64  
 7   Purchased Item Count         70052 non-null  int64  
 8   Return Timing                70052 non-null  int64  
 9   Purchased to Refunded Ratio  70052 non-null  float64
 10  Discount Ratio               70052 non-null  float64
 11  Spam                         70052 non-null  int64  
dtypes: float64(7), int64(5)
memory usage: 6.4 MB
None


In [None]:
data.head()

Unnamed: 0,Final Quantity,Total Revenue,Price Reductions,Refunds,Final Revenue,Sales Tax,Refunded Item Count,Purchased Item Count,Return Timing,Purchased to Refunded Ratio,Discount Ratio,Spam
0,1,74.17,0.0,0.0,74.17,14.83,0,1,2120,0.0,0.0,0
1,-1,0.0,0.0,-79.17,-79.17,-15.83,-1,0,2179,-1000000.0,0.0,1
2,-1,0.0,0.0,-74.17,-74.17,-14.83,-1,0,2257,-1000000.0,0.0,1
3,1,79.17,0.0,0.0,79.17,15.83,0,1,2162,0.0,0.0,0
4,1,74.17,0.0,0.0,74.17,14.83,0,1,2259,0.0,0.0,0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = data.drop(columns=['Spam'])
y = data['Spam']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.81      1.00      0.90      9631
           1       0.99      0.50      0.66      4380

    accuracy                           0.84     14011
   macro avg       0.90      0.75      0.78     14011
weighted avg       0.87      0.84      0.82     14011



In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Split the data into features and target
X = data.drop(columns=['Spam'])
y = data['Spam']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Apply Tomek Links to remove borderline samples
tomek = TomekLinks()
X_train_resampled, y_train_resampled = tomek.fit_resample(X_train_resampled, y_train_resampled)


In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier
hgb_model = HistGradientBoostingClassifier(max_iter=100)
hgb_model.fit(X_train_resampled, y_train_resampled)
y_hgb = hgb_model.predict(X_test)
print(classification_report(y_test, y_hgb))


              precision    recall  f1-score   support

           0       0.83      0.95      0.88      9675
           1       0.83      0.56      0.67      4336

    accuracy                           0.83     14011
   macro avg       0.83      0.75      0.78     14011
weighted avg       0.83      0.83      0.82     14011



In [None]:
from sklearn.metrics import confusion_matrix

conf_matrix = confusion_matrix(y_test, y_hgb)
print(conf_matrix)


In [None]:
import pickle

# Save the trained model
with open('return_verification_model.pkl', 'wb') as file:
    pickle.dump(hgb_model, file)


In [13]:
pip install pymongo

Collecting pymongo
  Downloading pymongo-4.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading pymongo-4.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.7.0 pymongo-4.11


In [14]:
import pandas as pd
from pymongo import MongoClient
import pickle

# Load the pre-trained model from the pickle file
with open('return_verification_model.pkl', 'rb') as f:
    model = pickle.load(f)

# MongoDB Atlas connection details
uri = "mongodb+srv://padmajaatms:Khacks@cluster0.bqj6t.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
client = MongoClient(uri)
db = client.Users
collection = db.Khacks

# Function to fetch user data and make predictions
def predict_spam(user_id):
    user_data = collection.find_one({"user_id": user_id})

    if not user_data:
        return "User ID not found in the database."

    # Extract the required fields for the model
    input_data = {
        'Final Quantity': [user_data['Final Quantity']],
        'Total Revenue': [user_data['Total Revenue']],
        'Price Reductions': [user_data['Price Reductions']],
        'Refunds': [user_data['Refunds']],
        'Final Revenue': [user_data['Final Revenue']],
        'Sales Tax': [user_data['Sales Tax']],
        'Refunded Item Count': [user_data['Refunded Item Count']],
        'Purchased Item Count': [user_data['Purchased Item Count']],
        'Return Timing': [user_data['Return Timing']],
        'Purchased to Refunded Ratio': [user_data['Purchased to Refunded Ratio']],
        'Discount Ratio': [user_data['Discount Ratio']]
    }

    # Convert the input data into a DataFrame
    input_df = pd.DataFrame(input_data)

    # Make a prediction using the pre-trained model
    prediction = model.predict(input_df)

    # Return the result
    if prediction[0] == 1:
        return f"User ID {user_id} is classified as SPAM."
    else:
        return f"User ID {user_id} is classified as NOT SPAM."

# Example usage
user_id = int(input("Enter User ID: "))  # Take user ID as input
result = predict_spam(user_id)
print(result)

Enter User ID: 1069
User ID 1069 is classified as NOT SPAM.
