In [4]:
!python setup.py build_ext --inplace
import numpy as np
import pandas as pd

# Load the train/test CSVs from hard-coded absolute paths.
# NOTE(review): '/content/...' looks like a Colab-style location — confirm before running elsewhere.
train_data = pd.read_csv('/content/train.csv', encoding='latin-1')
test_data = pd.read_csv('/content/test.csv', encoding='latin-1')

# Sanity check: row counts of both frames.
print("Train Data Length:", len(train_data))
print("Test Data Length:", len(test_data))

# Preview the first rows of each frame.
print("Train Data:")
print(train_data.head())
print("\nTest Data:")
print(test_data.head())
# Split the training frame into feature columns and the 'price_range' target.
# (The previous `X_train = pd.DataFrame()` placeholder was dead code: it was
# overwritten on the very next line.)
X_train = train_data.drop('price_range', axis=1)
y_train = train_data['price_range']

# The test frame has an 'id' column instead of a target; keep it aside.
X_test = test_data.drop('id', axis=1)
test_id = test_data['id']

print("Unique values in y_train:", pd.unique(y_train))
# Collapse the four target values into a binary label: 0 and 3 -> 0, 1 and 2 -> 1.
# NOTE(review): grouping the two extreme values together is unusual — confirm this is intended.
mapping = {0: 0, 1: 1, 2: 1, 3: 0}
y_train_mapped = y_train.map(mapping).astype(int)

# The PSO search below draws from NumPy's global RNG; seed it so that a
# "Restart & Run All" reproduces the same selected features and scores.
SEED = 42
np.random.seed(SEED)

# PSO hyperparameters, read as module-level globals by the PSO functions.
num_particles = 20  # swarm size
max_iter = 100      # number of PSO iterations
w = 0.5             # inertia weight applied to the previous velocity
c1 = 2              # pull towards the particle's own best position
c2 = 2              # pull towards the swarm's best position

def initialize_particles(num_particles, num_features):
    """Draw a random binary swarm.

    Returns an array of shape ``(num_particles, num_features)`` whose entries
    are sampled from ``{0, 1}`` with ``np.random.choice``; a 1 marks a
    selected feature.
    """
    swarm_shape = (num_particles, num_features)
    return np.random.choice([0, 1], size=swarm_shape)

def fitness(particle, X_train, y_train_mapped):
    """Score a particle by the accuracy of a tree trained on its features.

    ``particle`` is a 0/1 mask over the columns of ``X_train``. A default
    ``CustomDecisionTreeClassifier`` is fitted on the selected columns and
    scored on the same rows it was fitted on. A particle that selects no
    column scores 0.
    """
    chosen_columns = X_train.columns[particle == 1]
    if chosen_columns.empty:
        return 0

    tree = CustomDecisionTreeClassifier()
    tree.fit(X_train[chosen_columns].values, y_train_mapped)
    return tree.score(X_train[chosen_columns].values, y_train_mapped)

def update_particle(particle, velocity, best_particle, global_best_particle):
    """Perform one binary-PSO step for a single particle.

    The new velocity combines the old velocity (scaled by the module-level
    ``w``) with attraction towards the particle's own best (``c1``) and the
    swarm's best (``c2``), each scaled by a fresh uniform random number. Each
    bit of the new position is then set to 1 with probability
    ``sigmoid(velocity)``.

    Returns the tuple ``(new_particle, new_velocity)``.
    """
    r1, r2 = np.random.random(size=2)
    inertia = w * velocity
    cognitive = c1 * r1 * (best_particle - particle)
    social = c2 * r2 * (global_best_particle - particle)
    velocity = inertia + cognitive + social

    uniform_draws = np.random.random(len(particle))
    switch_on_probability = 1 / (1 + np.exp(-velocity))
    particle = np.where(uniform_draws < switch_on_probability, 1, 0)
    return particle, velocity

def pso_feature_selection(X_train, y_train_mapped, X_test):
    """Select a feature subset with binary particle swarm optimisation.

    Swarm size and iteration count come from the module-level
    ``num_particles`` and ``max_iter``. Each particle is a 0/1 mask over the
    columns of ``X_train`` and is scored with ``fitness``.

    Parameters
    ----------
    X_train : DataFrame of candidate feature columns.
    y_train_mapped : labels passed through to ``fitness``.
    X_test : DataFrame containing at least the same columns as ``X_train``.

    Returns
    -------
    (selected_features, X_test_selected) : the column labels chosen by the
    best particle found, and ``X_test`` restricted to those columns.
    """
    num_features = X_train.shape[1]
    particles = initialize_particles(num_particles, num_features)
    velocity = np.zeros((num_particles, num_features))

    # Copy the row: particles[0] is a view into `particles`, and the swarm is
    # overwritten in place below. Without the copy the recorded global best
    # would silently change while global_best_fitness stayed stale.
    global_best_particle = particles[0].copy()
    global_best_fitness = fitness(global_best_particle, X_train, y_train_mapped)

    best_particles = particles.copy()
    # Cache each particle's best score instead of re-fitting a tree on its
    # personal best every iteration. Starting at -inf makes the first pass
    # record the initial positions, matching the original initial state.
    best_fitnesses = [float('-inf')] * num_particles

    for _ in range(max_iter):
        for i, particle in enumerate(particles):
            particle_fitness = fitness(particle, X_train, y_train_mapped)
            if particle_fitness > global_best_fitness:
                global_best_fitness = particle_fitness
                global_best_particle = particle.copy()
            if particle_fitness > best_fitnesses[i]:
                best_fitnesses[i] = particle_fitness
                best_particles[i] = particle.copy()
        for i, particle in enumerate(particles):
            particles[i], velocity[i] = update_particle(particle, velocity[i], best_particles[i], global_best_particle)

    selected_features = X_train.columns[global_best_particle == 1]
    X_test_selected = X_test[selected_features]
    return selected_features, X_test_selected

# Run PSO over the raw feature columns and keep the chosen subset for both frames.
# NOTE(review): selected_features_train, X_train_selected and X_test_selected are all
# reassigned by later cells (the statistical-feature PSO run and the LDA cell), so of the
# names set here only y_train appears to be consumed downstream — confirm this run is needed.
selected_features_train, X_test_selected = pso_feature_selection(X_train, y_train_mapped, X_test)

X_train_selected = X_train[selected_features_train].values.astype(float)
X_test_selected = X_test_selected.values.astype(float)
# Rebinds y_train from the original 'price_range' Series to the binary-mapped int array.
y_train = y_train_mapped.values.astype(int)

python3: can't open file '/content/setup.py': [Errno 2] No such file or directory
Train Data Length: 2000
Test Data Length: 1000
Train Data:
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263      1716  2603    11     2          9   
3        131        6  ...       1216      1786  2769    1

In [3]:
import numpy as np
from sklearn.metrics import accuracy_score

class CustomDecisionTreeClassifier:
    """Decision tree classifier grown greedily on weighted Gini impurity.

    The fitted tree is stored in ``self.tree`` as nested tuples
    ``(feature_index, threshold, left_subtree, right_subtree)``; a leaf is
    the predicted class label itself. Labels must be non-negative integers
    because class counts are computed with ``np.bincount``.
    """

    def __init__(self, max_depth=3):
        # Maximum number of splits along any root-to-leaf path.
        self.max_depth = max_depth
        # Set by fit(); None until then.
        self.tree = None

    def fit(self, X, y):
        """Grow the tree from ``X`` (cast to float32) and labels ``y`` (cast to int32)."""
        self.tree = self._build_tree(np.array(X, dtype=np.float32), np.array(y, dtype=np.int32))

    def _build_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``X``/``y``.

        Returns the majority class as a leaf when the depth limit is reached,
        the node is pure, or no usable split exists; otherwise returns a
        ``(feature, threshold, left, right)`` tuple.
        """
        if depth == self.max_depth or len(np.unique(y)) == 1:
            return np.bincount(y).argmax()
        best_feature, best_threshold = self._find_best_split(X, y)
        if best_feature == -1:
            return np.bincount(y).argmax()

        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        # Guard against a degenerate split, e.g. a float32 midpoint that
        # rounds onto the larger of two adjacent values. Recursing into an
        # empty child would make np.bincount(...).argmax() raise.
        if not left_indices.any() or not right_indices.any():
            return np.bincount(y).argmax()

        left_tree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_tree = self._build_tree(X[right_indices], y[right_indices], depth + 1)
        return (best_feature, best_threshold, left_tree, right_tree)

    def _find_best_split(self, X, y):
        """Scan every feature for the threshold with the lowest weighted Gini.

        Returns ``(feature_index, threshold)``, or ``(-1, -1.0)`` when no
        feature has two distinct values to split between.
        """
        best_feature = -1
        best_threshold = -1.0
        best_gini = float('inf')
        m, n = X.shape

        # Count arrays are indexed by label value, so they must span
        # 0..max(y). Sizing them by the number of distinct labels raised
        # IndexError for nodes whose labels are not 0..k-1 (e.g. {1, 2}).
        total_class_count = np.bincount(y).astype(np.int64)
        left_class_count = np.zeros_like(total_class_count)
        right_class_count = total_class_count.copy()

        for feature in range(n):
            sorted_indices = np.argsort(X[:, feature])
            sorted_X = X[sorted_indices, feature]
            sorted_y = y[sorted_indices]

            # Start with every sample on the right-hand side.
            left_class_count[:] = 0
            right_class_count[:] = total_class_count
            total_left = 0
            total_right = m

            for i in range(1, m):
                # Move sample i-1 from the right partition to the left one.
                c = sorted_y[i - 1]
                left_class_count[c] += 1
                right_class_count[c] -= 1
                total_left += 1
                total_right -= 1

                # Equal neighbouring values cannot be separated by a threshold.
                if sorted_X[i] == sorted_X[i - 1]:
                    continue

                gini_left = 1.0 - np.sum((left_class_count / total_left) ** 2)
                gini_right = 1.0 - np.sum((right_class_count / total_right) ** 2)
                gini = (total_left * gini_left + total_right * gini_right) / m

                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature
                    # Split halfway between the two neighbouring values.
                    best_threshold = (sorted_X[i] + sorted_X[i - 1]) / 2

        return best_feature, best_threshold

    def predict(self, X):
        """Return an array with the predicted label for each row of ``X`` (cast to float32)."""
        return np.array([self._predict_sample(sample, self.tree) for sample in np.array(X, dtype=np.float32)])

    def _predict_sample(self, sample, tree):
        """Walk ``tree`` for one sample until a leaf (non-tuple) is reached."""
        if isinstance(tree, tuple):
            feature, threshold, left_tree, right_tree = tree
            if sample[feature] <= threshold:
                return self._predict_sample(sample, left_tree)
            else:
                return self._predict_sample(sample, right_tree)
        else:
            return tree

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against ``y`` via ``accuracy_score``."""
        return accuracy_score(np.array(y), self.predict(np.array(X)))


In [5]:
from scipy.linalg import eigh
import sympy as sp
import numpy as np
def calculate_eigenvalues(cov_matrix):
    """Solve the characteristic equation ``det(x*I - cov_matrix) = 0`` symbolically.

    Builds the determinant with sympy, solves it for ``x`` and returns the
    solutions wrapped in a NumPy array, in whatever order ``sp.solve``
    yields them.
    """
    lam = sp.symbols('x')
    scaled_identity = sp.eye(*cov_matrix.shape) * lam
    characteristic_matrix = scaled_identity - cov_matrix
    characteristic_poly = sp.Matrix(characteristic_matrix).det()
    roots = sp.solve(characteristic_poly, lam)
    return np.array(roots)

def calculate_eigenvectors(cov_matrix):
    """Return the eigenvector matrix that ``np.linalg.eigh`` computes for ``cov_matrix``.

    Only the second element of the ``eigh`` result is returned; the
    eigenvalues are discarded.
    """
    decomposition = np.linalg.eigh(cov_matrix)
    return decomposition[1]

class LDA:
    """Linear discriminant analysis usable as a projection and as a classifier.

    ``fit`` solves the generalized eigenproblem ``Sb v = lambda Sw v`` with
    ``scipy.linalg.eigh`` and stores:

    * ``dvecs``   - discriminant directions, sorted by decreasing eigenvalue
    * ``weights`` / ``bias`` - per-class linear scoring function
    * ``classes`` - the sorted unique labels seen during fitting
    """

    def __init__(self, n_components=None):
        # Number of directions kept by transform(); None means "decide in fit()".
        self.n_components = n_components

    def fit(self, X, y):
        """Estimate discriminant directions and class scoring weights.

        ``X`` is an (n_samples, n_features) array-like and ``y`` holds one
        label per sample.
        """
        # Work on float arrays. With integer X, np.zeros_like(X) below would be
        # an integer buffer and the within-class deviations would be truncated.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        n_samples, n_features = X.shape
        classes, cls_counts = np.unique(y, return_counts=True)
        self.classes = classes
        priors = cls_counts / n_samples  # fraction of the samples in each class

        X_cls_mean = np.array([X[y == cls].mean(axis=0) for cls in classes])  # per-class feature means
        between_cls_deviation = X_cls_mean - X.mean(axis=0)  # class mean minus overall mean

        within_cls_deviation = np.zeros_like(X)
        for cls_idx, cls in enumerate(classes):
            indices = np.where(y == cls)[0]
            within_cls_deviation[indices] = X[indices] - X_cls_mean[cls_idx]  # sample minus its class mean

        # `*` and `@` share precedence and associate left to right, so each
        # class's deviation column is scaled by its prior before the product.
        Sb = priors * between_cls_deviation.T @ between_cls_deviation  # between-class scatter
        Sw = within_cls_deviation.T @ within_cls_deviation / n_samples  # within-class scatter
        evals, evecs = eigh(Sb, Sw)  # generalized eigenvalues and eigenvectors
        self.dvecs = evecs[:, np.argsort(evals)[::-1]]  # largest eigenvalue first

        self.weights = X_cls_mean @ self.dvecs @ self.dvecs.T
        self.bias = np.log(priors) - 0.5 * np.diag(X_cls_mean @ self.weights.T)

        if self.n_components is None:
            self.n_components = min(classes.size - 1, n_features)

    def transform(self, X):
        """Project ``X`` onto the first ``n_components`` discriminant directions."""
        return X @ self.dvecs[:, : self.n_components]

    def predict(self, X_test):
        """Return the class label with the highest linear score for each row."""
        scores = X_test @ self.weights.T + self.bias
        # argmax gives a position in the sorted class list; map it back to the
        # label so labels other than 0..k-1 are reported correctly.
        return self.classes[np.argmax(scores, axis=1)]

In [6]:
def compute_statistical_features(X_train):
    """Summarise every row of ``X_train`` with five statistics.

    The statistics are taken along ``axis=1`` (across the columns of each
    row) and returned as a DataFrame with the columns ``mean``, ``std_dev``,
    ``median``, ``minimum`` and ``maximum``. For empty input an error message
    is printed and ``None`` is returned.
    """
    if len(X_train) == 0:
        print("Error: Empty input data.")
        return None

    # Insertion order of this dict fixes the column order of the result.
    row_statistics = {
        'mean': np.mean(X_train, axis=1),
        'std_dev': np.std(X_train, axis=1),
        'median': np.median(X_train, axis=1),
        'minimum': np.min(X_train, axis=1),
        'maximum': np.max(X_train, axis=1),
    }
    stacked = np.column_stack(tuple(row_statistics.values()))
    return pd.DataFrame(stacked, columns=list(row_statistics))


# Build the per-row summary statistics for the training frame ...
X_train_features = compute_statistical_features(X_train)

# ... and for the test frame.
test_features = compute_statistical_features(X_test)

# Re-run PSO over the statistical-feature columns; this overwrites the
# selected_features_train produced by the earlier run on the raw columns.
selected_features_train, test_features_selected = pso_feature_selection(X_train_features, y_train_mapped, test_features)



In [7]:
import math
# Restrict both feature tables to the columns chosen by the statistical-feature PSO run.
X_train_selected = X_train_features[selected_features_train].values
X_test_selected = test_features_selected.values
# NOTE(review): the mapped target only takes the values 0 and 1, for which LDA.fit would
# default n_components to classes - 1 = 1; confirm that keeping 3 directions is intended.
lda = LDA(n_components=3)
lda.fit(X_train_selected, y_train_mapped)
X_train_lda = lda.transform(X_train_selected)
X_test_lda = lda.transform(X_test_selected)  # not used again in the visible cells

# Fit a deeper tree (max_depth=10) on the LDA-projected training data.
model = CustomDecisionTreeClassifier(max_depth=10)
model.fit(X_train_lda, y_train)

# Predictions on the same rows the model was fitted on.
y_pred_train = model.predict(X_train_lda)

def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Both arguments may be any array-like of equal length.
    """
    # Convert first: `==` on two plain lists compares the lists as a whole and
    # yields a single bool, which would make the result 0.0 or 1.0 only.
    return np.mean(np.asarray(y_true) == np.asarray(y_pred))

# Scored on the same rows the tree was fitted on, so this is a training
# accuracy rather than an estimate of performance on unseen data.
train_accuracy = accuracy(y_train, y_pred_train)
print(f"Training Accuracy: {train_accuracy:.2f}")

Training Accuracy: 0.84
