In [414]:
import pandas as pd
from scipy.sparse import csr_matrix 
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import numpy as np

In [415]:
# Read the raw training file with no header row; each record is parsed from
# column 0 by a later cell.
data = pd.read_csv("train.txt", header=None)

In [416]:
# Empty accumulators populated by the cells that follow.
features, class_labels = [], []                 # filled by the record-parsing loop
data1, row_indices, col_indices = [], [], []    # values / row ids / column ids for csr_matrix

In [417]:
# Parse every record: the first integer is the class label, the remaining
# integers are the column indices of that record's features.
# Both lists are rebuilt from scratch so that re-running this cell does not
# append the same rows a second time.
class_labels = []
features = []
for line in data[0]:
    # str.split() with no argument splits on any run of whitespace (tabs and
    # repeated spaces included) and never yields empty tokens.
    data_point = [int(token) for token in line.split()]
    class_labels.append(data_point[0])
    features.append(data_point[1:])

In [418]:
# Flatten the per-row feature-index lists into the three parallel lists that
# csr_matrix takes: a value, a row id and a column id per entry.
# The lists are rebuilt from scratch so re-running this cell cannot append the
# same entries twice.
row_indices = [row for row, indices in enumerate(features) for _ in indices]
col_indices = [col for indices in features for col in indices]
data1 = [1] * len(col_indices)

num_rows = len(features)
# Taking the max over the flattened column list tolerates rows that have no
# features; default=-1 gives zero columns when there are no entries at all.
num_columns = max(col_indices, default=-1) + 1


In [419]:
# Training feature matrix in CSR form: one row per record, one column per
# feature index, assembled from the (value, (row, column)) lists built above.
sparse_matrix = csr_matrix(
    (data1, (row_indices, col_indices)),
    shape=(num_rows, num_columns),
)

# Tried oversampling and undersampling (experiments left commented out below)

In [420]:
# oversampler = RandomOverSampler(sampling_strategy='minority')
# X_resampled, y_resampled = oversampler.fit_resample(sparse_matrix, class_labels)
# smote = SMOTE()
# X_resampled, y_resampled = smote.fit_resample(sparse_matrix, class_labels)
# undersampler = RandomUnderSampler(sampling_strategy='majority')
# X_resampled, y_resampled = undersampler.fit_resample(sparse_matrix, class_labels)

# over = SMOTE(sampling_strategy=0.5)
# under = RandomUnderSampler(sampling_strategy=0.5)
# steps = [('o', over), ('u', under)]
# pipeline = Pipeline(steps=steps)
# # transform the dataset
# X, y = pipeline.fit_resample(sparse_matrix, class_labels)

In [421]:
# Hold out 20% of the records, stratified on the class label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    sparse_matrix,
    class_labels,
    test_size=0.2,
    random_state=42,
    stratify=class_labels,
)

In [422]:
# Keep the 350 columns with the highest chi-squared score. The selector is
# fitted on the training split only and then applied unchanged to the
# hold-out split.
k_best = SelectKBest(score_func=chi2, k=350)
k_best.fit(X_train, y_train)

X_train_new = k_best.transform(X_train)
X_test_new = k_best.transform(X_test)


# KNN

In [423]:

# k-nearest-neighbours baseline (k = 20) on the chi2-selected features.
neigh = KNeighborsClassifier(n_neighbors=20)
neigh.fit(X_train_new, y_train)
y_pred = neigh.predict(X_test_new)

# The metric is an F1 score, so it is named and printed as one; the cell
# previously labelled this value "Accuracy".
# Note: f1_score is called with its default averaging here, whereas the
# decision-tree and MLP cells pass average='macro'.
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1}")

Accuracy: 0.4


# Naive Bayes

In [424]:
# Gaussian naive Bayes baseline. The selected sparse features are converted
# with .toarray() before being handed to the classifier.
dense_train = X_train_new.toarray()
dense_test = X_test_new.toarray()

clf = GaussianNB()
clf.fit(dense_train, y_train)
y_pred = clf.predict(dense_test)

# f1_score with its default averaging, printed to two decimals.
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f"F1 Score: {f1:.2f}")

F1 Score: 0.55


# Decision Tree

In [425]:
# Decision-tree baseline with a fixed seed, trained on the chi2-selected features.
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train_new, y_train)

# Score the hold-out predictions with macro-averaged F1.
y_pred = dt_classifier.predict(X_test_new)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
print(f"F1 Score: {f1}")

F1 Score: 0.7775249652382759


# MLP with bucketing technique

In [426]:
# Ensemble of MLPs for the imbalanced labels: every model is trained on all
# rows labelled 1 plus a different random bucket of rows labelled 0, and
# predicts the hold-out split. The per-model predictions are collected in
# all_predictions for the voting cell below.
y_train = np.array(y_train)
active_indices = np.where(y_train == 1)[0]
inactive_indices = np.where(y_train == 0)[0]

num_models = 10
# Bucket size is the same for every model, so compute it once.
num_inactive_samples = min(150, len(inactive_indices))

# Seeded generator: the bucket draw was previously unseeded, so the ensemble
# (and its score) changed on every run even though the MLP itself is seeded.
rng = np.random.default_rng(42)

all_predictions = []
used_inactive_indices = np.empty(0, dtype=inactive_indices.dtype)

for i in range(num_models):
    # Prefer label-0 rows that no earlier model has trained on. setdiff1d
    # returns them sorted, so the draw depends only on the seed.
    unused_inactive_indices = np.setdiff1d(inactive_indices, used_inactive_indices)

    # Not enough fresh rows left for a full bucket: draw from all label-0 rows.
    if len(unused_inactive_indices) < num_inactive_samples:
        unused_inactive_indices = inactive_indices

    random_inactive_indices = rng.choice(unused_inactive_indices, num_inactive_samples, replace=False)
    used_inactive_indices = np.concatenate([used_inactive_indices, random_inactive_indices])

    balanced_indices = np.concatenate([active_indices, random_inactive_indices])
    X_balanced = X_train[balanced_indices]
    y_balanced = y_train[balanced_indices]

    # Fit a fresh selector per model instead of refitting the shared k_best
    # and overwriting X_train_new / X_test_new: the KNN, naive Bayes and
    # decision-tree cells read those names, and clobbering them here changed
    # their inputs whenever they were re-run after this cell.
    selector = SelectKBest(score_func=chi2, k=k_best.k)
    X_balanced_selected = selector.fit_transform(X_balanced, y_balanced)

    model = MLPClassifier(hidden_layer_sizes=(64, 32), alpha=1e-5, activation='relu', solver='adam', random_state=42)
    model.fit(X_balanced_selected, y_balanced)

    all_predictions.append(model.predict(selector.transform(X_test)))




In [427]:
# Combine the ensemble with an OR vote: a hold-out row is labelled 1 as soon
# as any single model predicts 1 for it.
stacked_predictions = np.asarray(all_predictions)
y_pred = stacked_predictions.any(axis=0).astype(int)
print(y_pred)

[0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0
 0 0 0 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0
 1 0 0 0 1 0 1 1 0 0 0 1]


In [428]:

# Macro-averaged F1 of the combined ensemble prediction on the hold-out split.
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
print(f"F1 Score: {f1}")

F1 Score: 0.7207235798159315


# Testing

In [383]:
# Read the raw test file with no header row; each record is parsed from
# column 0 by a later cell.
data_test = pd.read_csv("test.txt", header=None)

In [384]:
# Empty accumulators for the test set, populated by the cells that follow.
test_features = []                                          # per-record feature-index lists
test_data, test_row_indices, test_col_indices = [], [], []  # values / row ids / column ids

In [385]:
# Parse every test record into its list of integer feature indices (test
# records carry no leading class label, so the whole row is kept).
# The list is rebuilt from scratch so that re-running this cell does not
# append the same rows a second time.
test_features = []
for line in data_test[0]:
    # str.split() with no argument splits on any run of whitespace (tabs and
    # repeated spaces included) and never yields empty tokens.
    test_features.append([int(token) for token in line.split()])

In [386]:
num_rows = len(test_features)
# The column count is taken from the TRAINING features so that the test
# matrix has the same width as the matrix the selectors are fitted on.
# Rows with no features are skipped so they cannot break the max().
num_columns = max((max(indices) for indices in features if indices), default=-1) + 1

# Rebuild the three parallel lists from scratch so re-running this cell cannot
# append the same entries twice.
test_data = []
test_row_indices = []
test_col_indices = []
for i, indices in enumerate(test_features):
    # A column index at or beyond the training width has no counterpart in
    # the training matrix and would make csr_matrix reject the given shape,
    # so such entries are dropped.
    in_range = [col for col in indices if col < num_columns]
    test_data.extend([1] * len(in_range))  # every stored value is 1
    test_row_indices.extend([i] * len(in_range))
    test_col_indices.extend(in_range)


In [387]:
# Test feature matrix in CSR form, assembled from the (value, (row, column))
# lists built above.
test_sparse_matrix = csr_matrix(
    (test_data, (test_row_indices, test_col_indices)),
    shape=(num_rows, num_columns),
)

In [389]:
# Same bucketed MLP ensemble as in the validation section, but each model
# predicts the unlabeled test matrix. Every model is trained on all rows
# labelled 1 plus a different random bucket of rows labelled 0 from X_train.
y_train = np.array(y_train)
active_indices = np.where(y_train == 1)[0]
inactive_indices = np.where(y_train == 0)[0]

num_models = 10
# Bucket size is the same for every model, so compute it once.
num_inactive_samples = min(150, len(inactive_indices))

# Seeded generator: the bucket draw was previously unseeded, so the submitted
# predictions changed on every run even though the MLP itself is seeded.
rng = np.random.default_rng(42)

all_predictions = []
used_inactive_indices = np.empty(0, dtype=inactive_indices.dtype)

for i in range(num_models):
    # Prefer label-0 rows that no earlier model has trained on. setdiff1d
    # returns them sorted, so the draw depends only on the seed.
    unused_inactive_indices = np.setdiff1d(inactive_indices, used_inactive_indices)

    # Not enough fresh rows left for a full bucket: draw from all label-0 rows.
    if len(unused_inactive_indices) < num_inactive_samples:
        unused_inactive_indices = inactive_indices

    random_inactive_indices = rng.choice(unused_inactive_indices, num_inactive_samples, replace=False)
    used_inactive_indices = np.concatenate([used_inactive_indices, random_inactive_indices])

    balanced_indices = np.concatenate([active_indices, random_inactive_indices])
    X_balanced = X_train[balanced_indices]
    y_balanced = y_train[balanced_indices]

    # Fit a fresh selector per model instead of refitting the shared k_best
    # and overwriting X_train_new, which other cells read.
    selector = SelectKBest(score_func=chi2, k=k_best.k)
    X_balanced_selected = selector.fit_transform(X_balanced, y_balanced)

    # The redundant `mlp_classifier` alias from the original assignment is dropped.
    model = MLPClassifier(hidden_layer_sizes=(64, 32), alpha=1e-5, activation='relu', solver='adam', random_state=42)
    model.fit(X_balanced_selected, y_balanced)

    all_predictions.append(model.predict(selector.transform(test_sparse_matrix)))


In [390]:
# Combine the ensemble with an AND vote: a test row is labelled 1 only when
# every model predicts 1 for it.
# NOTE(review): the validation section combines its ensemble with np.any (an
# OR vote), so the F1 reported there does not measure this rule — confirm
# which vote is intended for the submission.
y_test_pred = np.all(all_predictions, axis=0).astype(int)


In [392]:
file_name = "formatfile.txt"

# Write the final predictions, one per line, in test-set order.
with open(file_name, 'w') as out:
    out.writelines(f"{prediction}\n" for prediction in y_test_pred)

In [391]:
# Sanity check: how many predictions were produced for the test set.
print(len(y_test_pred))

350
