In [1]:
import pandas as pd
import numpy as np
import warnings
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Read the raw CSV and keep the untouched frame around as ``dataset1``.
dataset1 = pd.read_csv("new.csv")

# One-hot encode the categorical columns on a copy; ``drop_first=True`` drops
# the first level of each encoded column.
df2 = pd.get_dummies(dataset1.copy(), drop_first=True)

# The target column must still be present after encoding; fail loudly if not.
if 'Price (in USD)' not in df2.columns:
    raise ValueError("Column 'Price (in USD)' not found in the dataset.")

# Independent variables: everything except the target. Dependent: the target.
indep_X = df2.drop('Price (in USD)', axis=1)
dep_Y = df2['Price (in USD)']

# Feature Selection using SelectKBest
def selectkbest(X, y, k):
    """Keep the ``k`` features of ``X`` that score highest on the chi-squared test.

    Args:
        X: Feature table; must expose ``.columns`` (e.g. a pandas DataFrame).
        y: Target values aligned with the rows of ``X``.
        k: Number of features to keep, passed straight to ``SelectKBest``.

    Returns:
        A ``(X_new, selected_features)`` pair: the reduced feature matrix
        produced by the selector and the column labels that were retained.
    """
    chi2_selector = SelectKBest(score_func=chi2, k=k)
    chi2_selector.fit(X, y)
    reduced = chi2_selector.transform(X)
    # get_support() yields a boolean mask over the input columns.
    kept_mask = chi2_selector.get_support()
    return reduced, X.columns[kept_mask]

# Function to split and scale data
def split_scalar(X, y):
    """Split the data 80/20 and standardize the features.

    The split uses ``test_size=0.2`` and ``random_state=0``. The scaler is
    fitted on the training portion only and then applied to both portions.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` where both feature sets have
        been transformed by the fitted ``StandardScaler``.
    """
    train_features, test_features, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0
    )
    fitted_scaler = StandardScaler().fit(train_features)
    return (
        fitted_scaler.transform(train_features),
        fitted_scaler.transform(test_features),
        y_train,
        y_test,
    )

# Function for model evaluation
def cm_prediction(classifier, X_test, y_test):
    """Score a classifier on held-out data.

    Args:
        classifier: Object exposing ``predict``; it is called on ``X_test``.
        X_test: Features to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        A 6-tuple ``(classifier, accuracy, report, X_test, y_test, cm)``:
        the classifier passed in, the accuracy score, the text classification
        report, the two test inputs unchanged, and the confusion matrix.
    """
    predictions = classifier.predict(X_test)
    matrix = confusion_matrix(y_true=y_test, y_pred=predictions)
    score = accuracy_score(y_true=y_test, y_pred=predictions)
    summary = classification_report(y_true=y_test, y_pred=predictions)
    return classifier, score, summary, X_test, y_test, matrix

# Define ML Algorithms
def logistic(X_train, y_train, X_test, y_test):
    """Fit a logistic-regression classifier and evaluate it on the test split.

    The model is built with ``random_state=0``. Returns the tuple produced by
    ``cm_prediction`` for the fitted model.
    """
    model = LogisticRegression(random_state=0).fit(X_train, y_train)
    return cm_prediction(model, X_test, y_test)

def svm_linear(X_train, y_train, X_test, y_test):
    """Fit a linear-kernel support vector classifier and evaluate it.

    The model is built with ``kernel='linear'`` and ``random_state=0``.
    Returns the tuple produced by ``cm_prediction`` for the fitted model.
    """
    model = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
    return cm_prediction(model, X_test, y_test)

def svm_NL(X_train, y_train, X_test, y_test):
    """Fit a non-linear (RBF-kernel) support vector classifier and evaluate it.

    The model is built with ``kernel='rbf'`` and ``random_state=0``.
    Returns the tuple produced by ``cm_prediction`` for the fitted model.
    """
    model = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)
    return cm_prediction(model, X_test, y_test)

def Navie(X_train, y_train, X_test, y_test):
    """Fit a Gaussian naive Bayes classifier and evaluate it on the test split.

    Returns the tuple produced by ``cm_prediction`` for the fitted model.
    """
    model = GaussianNB().fit(X_train, y_train)
    return cm_prediction(model, X_test, y_test)

def knn(X_train, y_train, X_test, y_test):
    """Fit a k-nearest-neighbours classifier and evaluate it on the test split.

    Uses 5 neighbours with the Minkowski metric at ``p=2``. Returns the tuple
    produced by ``cm_prediction`` for the fitted model.
    """
    model = KNeighborsClassifier(
        n_neighbors=5,
        metric='minkowski',
        p=2,
    ).fit(X_train, y_train)
    return cm_prediction(model, X_test, y_test)

def Decision(X_train, y_train, X_test, y_test):
    """Fit a decision-tree classifier and evaluate it on the test split.

    The tree is built with ``criterion='entropy'`` and ``random_state=0``.
    Returns the tuple produced by ``cm_prediction`` for the fitted model.
    """
    model = DecisionTreeClassifier(
        criterion='entropy',
        random_state=0,
    ).fit(X_train, y_train)
    return cm_prediction(model, X_test, y_test)

def random(X_train, y_train, X_test, y_test):
    """Fit a random-forest classifier and evaluate it on the test split.

    The forest is built with 10 trees, ``criterion='entropy'`` and
    ``random_state=0``. Returns the tuple produced by ``cm_prediction`` for
    the fitted model.

    NOTE(review): this function shares its name with the standard-library
    ``random`` module; importing that module in the same namespace would
    clash with it.
    """
    model = RandomForestClassifier(
        n_estimators=10,
        criterion='entropy',
        random_state=0,
    ).fit(X_train, y_train)
    return cm_prediction(model, X_test, y_test)

# Function to store results
def selectk_Classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf):
    """Collect per-model accuracies into a one-row summary table.

    Each argument supplies the value(s) for one model's column; the resulting
    DataFrame has a single row labelled ``'ChiSquare'``.

    Returns:
        A pandas DataFrame with columns ``Logistic``, ``SVMl``, ``SVMnl``,
        ``KNN``, ``Navie``, ``Decision`` and ``Random``, in that order.
    """
    column_labels = ('Logistic', 'SVMl', 'SVMnl', 'KNN', 'Navie', 'Decision', 'Random')
    column_values = (acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf)
    table = dict(zip(column_labels, column_values))
    return pd.DataFrame(table, index=['ChiSquare'])

# Feature selection: keep the 6 best-scoring columns.
# NOTE(review): the selector is fitted on the full dataset before the
# train/test split below, so test rows influence which features are kept.
X_selected, selected_columns = selectkbest(indep_X, dep_Y, 6)

# Split and scale data
X_train, X_test, y_train, y_test = split_scalar(X_selected, dep_Y)

# One accuracy list per model; each receives a single entry below.
acclog = []
accsvml = []
accsvmnl = []
accknn = []
accnav = []
accdes = []
accrf = []

# Run models in a fixed order, pairing each trainer with its accuracy list.
_model_runs = (
    (logistic, acclog),
    (svm_linear, accsvml),
    (svm_NL, accsvmnl),
    (knn, accknn),
    (Navie, accnav),
    (Decision, accdes),
    (random, accrf),
)
for _train_and_score, _accuracy_log in _model_runs:
    # classifier / Accuracy / report are rebound on every pass, so after the
    # loop they hold the values from the last model (the random forest).
    classifier, Accuracy, report, _, _, _ = _train_and_score(X_train, y_train, X_test, y_test)
    _accuracy_log.append(Accuracy)

# Store final results
result = selectk_Classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf)
print(result)

           Logistic      SVMl     SVMnl       KNN     Navie  Decision   Random
ChiSquare   0.21831  0.288732  0.239437  0.267606  0.239437  0.309859  0.28169
