In [None]:
#importing required packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

In [None]:
# Read the online-foods CSV into a DataFrame and preview its first rows
csv_path = "/content/drive/MyDrive/OBtest/onlinefoods.csv"
data = pd.read_csv(csv_path)
data.head()

In [38]:
# Encode categorical variables since Standard scaler doesn't accept strings.
# Every column gets its own LabelEncoder, stored in `label_encoders`, so the
# integer codes of any column can be turned back into the original labels with
# label_encoders[column].inverse_transform(...).
categorical_columns = [
    'Gender',
    'Marital Status',
    'Occupation',
    'Educational Qualifications',
    'Unnamed: 12',
]

label_encoders = {}  # column name -> encoder fitted on that column
for column in categorical_columns:
    # `label_encoder` is rebound on each pass; once the loop finishes it is the
    # encoder fitted on the last column in the list.
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder

In [41]:
# Separate the label from the predictors.
# y is the 'Monthly Income' column; X is everything else except the label
# itself and three columns that are not used as features.
y = data['Monthly Income']
X = data.drop(columns=['Monthly Income', 'Pin code', 'Output', 'Feedback'])

In [42]:
# Sanity check: report any feature columns that still have object dtype
remaining_strings = X.select_dtypes(include='object').columns
if len(remaining_strings) > 0:
    print("Remaining string values:", remaining_strings)


In [43]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Initialize classifiers
# All models use scikit-learn defaults, except that the MLP is given a fixed
# seed: its weights are initialised randomly, so without a seed its accuracy
# (and therefore possibly the "best model" chosen later) changes between runs.
# The seed matches the one used for the train/test split.
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Multi-layer Perceptron": MLPClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(),
    "Naive Bayes": GaussianNB()
}

# Train and evaluate classifiers
results = {}  # model name -> accuracy on the held-out test set
roc = {}  # never filled in this cell; kept in case other cells reference it
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)  # train on the scaled training split
    y_pred = clf.predict(X_test)  # predict labels for the scaled test split
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"\n{name} ==> Accuracy: {accuracy}\n")
    # Per-class precision / recall / F1 for this model
    print(classification_report(y_test, y_pred))


In [None]:
# Show the accuracy recorded for each classifier, keyed by model name
print(results)

In [None]:
# Pick the name of the classifier with the highest recorded accuracy
# (on a tie, the model that was inserted into `results` first wins)
best_model = max(results, key=lambda model_name: results[model_name])
print(f"Best Model: {best_model}")

In [None]:
#Visualize features
# Look up the fitted estimator once and take the feature names before
# branching, so every branch below can label its bars.
best_clf = classifiers[best_model]
feature_names = X.columns

if best_model == "Multi-layer Perceptron":
    # MLP doesn't have coef_. Use the absolute weights of the connections
    # between the input layer and the first hidden layer, summed per feature.
    feature_importance = np.abs(best_clf.coefs_[0]).sum(axis=1)
    importance_label = "Importance"
elif hasattr(best_clf, "coef_"):
    # coef_ holds one row of coefficients per class (a single row for a binary
    # problem). Average the absolute values over the rows so that every class
    # contributes, instead of plotting only the first row.
    feature_importance = np.abs(best_clf.coef_).mean(axis=0)
    importance_label = "Coefficient Magnitude"
else:
    # Models such as k-NN, Gaussian naive Bayes or a non-linear SVC expose no
    # coefficients; report that instead of failing with an AttributeError.
    feature_importance = None
    print(f"{best_model} does not expose coefficients; no feature importance plot available.")

if feature_importance is not None:
    plt.figure(figsize=(10, 6))
    plt.barh(feature_names, feature_importance)
    plt.xlabel(importance_label)
    plt.ylabel("Feature")
    plt.title("Feature Importance")
    plt.show()


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

# K-means and both scores below work on Euclidean distances, so features with
# a large numeric range would dominate the result. Standardise the features
# first, as is done for the classifiers. A separate scaler object is used so
# the `scaler` fitted on the training split is left untouched.
cluster_scaler = StandardScaler()
X_cluster = cluster_scaler.fit_transform(X)

# Initialize KMeans clustering algorithm
# n_init is set explicitly because the library default differs between
# scikit-learn releases; pinning it keeps the result stable across versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

# Fit KMeans to the standardised data
kmeans.fit(X_cluster)

# Cluster label assigned to each row
cluster_labels = kmeans.labels_

# Evaluate clustering performance in the same (standardised) space that was
# used for fitting
silhouette = silhouette_score(X_cluster, cluster_labels)
davies_bouldin = davies_bouldin_score(X_cluster, cluster_labels)

print(f"Silhouette Score: {silhouette}")
print(f"Davies-Bouldin Index: {davies_bouldin}")
