In [38]:
from datetime import datetime
from pathlib import Path

import mafese
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from mafese import get_dataset
from mafese.wrapper.mha import MhaSelector
from scipy import sparse
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC




In [39]:
# Read the combined dataset from CSV; the file's first column becomes the row index.
data_array = pd.read_csv(
    "combined_data.csv",
    index_col=0,
)

In [40]:
# Keep only the rows whose "Grand Final Place" differs from 27.
# NOTE(review): why place 27 is excluded is not evident from this notebook - confirm.
is_not_place_27 = data_array["Grand Final Place"].ne(27)
data_array = data_array[is_not_place_27]


In [41]:
# Columns handled as categorical features (one-hot encoded further down).
categorical_columns = [
    'Country Name',
    'Country Code',
    'Year',
    'Song',
    'Artist',
    'Language',
]
# Every remaining column is treated as numeric; since "Grand Final Place" is
# not listed above, the target column is part of this set as well.
numeric_cols = data_array.columns.difference(categorical_columns)

In [42]:
# Fill missing values in the numeric columns with that column's median.
numeric_frame = data_array[numeric_cols]
numeric_medians = numeric_frame.median()
data_array[numeric_cols] = numeric_frame.fillna(numeric_medians)


In [None]:
# Remove every column that consists solely of missing values, then display
# the resulting frame as the cell output.
all_missing_mask = data_array.isna().all()
empty_columns = data_array.columns[all_missing_mask]
data_array = data_array.drop(columns=empty_columns)
data_array

In [None]:
# Show how many rows fall on each "Grand Final Place" value.
place_counts = data_array["Grand Final Place"].value_counts()
place_counts

In [45]:
# One-hot encode the categorical columns, dropping the first level of each
# so that the indicator columns of one feature are not fully redundant.
data_encoded = pd.get_dummies(
    data_array,
    columns=categorical_columns,
    drop_first=True,
)


In [46]:
# Split the encoded frame into the prediction target and the feature matrix.
#
# Optional sub-sampling for quick experiments (disabled):
#     data_array = data_array.sample(500)
# NOTE: `data_encoded` has already been built from `data_array` at this point,
# so enabling the line here would have no effect; it would need to run before
# the one-hot encoding cell.
target = data_encoded["Grand Final Place"].to_numpy()

# Request a float matrix explicitly: with recent pandas versions the dummy
# columns are boolean, and mixing them with float columns would otherwise
# produce an object-dtype array.
non_targets = data_encoded.drop(columns=["Grand Final Place"]).to_numpy(dtype=float)


In [47]:
# Wrap the feature matrix and target vector in a mafese `Data` container;
# the splitting and label-encoding steps below operate on this object.
data = mafese.Data(
    non_targets,
    target,
)


In [48]:
# Reserve 20% of the samples for testing. With inplace=True the split is kept
# on `data` (the following cells read data.X_train / data.X_test / data.y_train / data.y_test).
data.split_train_test(
    test_size=0.2,
    inplace=True,
)

In [49]:
# DataFrame wrappers around the train/test feature matrices, created before
# the scaling step below.
X_train_df, X_test_df = (
    pd.DataFrame(feature_matrix) for feature_matrix in (data.X_train, data.X_test)
)

In [50]:
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted transformation is then applied to both splits.
standard_scaler = StandardScaler()
standard_scaler.fit(data.X_train)

data.X_train = standard_scaler.transform(data.X_train)
data.X_test = standard_scaler.transform(data.X_test)


In [51]:
# Encode the class labels. `encode_label` hands back the encoded training
# labels together with the encoder object (`scaler_y`), which is then reused
# to transform the test labels.
encoded_train_labels, scaler_y = data.encode_label(data.y_train)
data.y_train = encoded_train_labels
data.y_test = scaler_y.transform(data.y_test)

In [52]:
# Feature selector from mafese's metaheuristic wrapper module, created with
# the library's default settings (no arguments are passed).
feat_selector = MhaSelector()

In [53]:
# Weight pair forwarded as `fit_weights` to the feature selector's fit call
# and recorded in the run log at the end of the notebook.
weights = (
    0.9,
    0.1,
)

In [None]:
# Run the feature-selection search on the scaled training data with the
# weights defined above; verbose=True asks the selector to report progress.
feat_selector.fit(
    data.X_train,
    data.y_train,
    fit_weights=weights,
    verbose=True,
)

In [None]:
# Number of features the selector kept.
n_selected_features = len(feat_selector.selected_feature_indexes)
print(n_selected_features)

In [None]:
# Indexes of the features the selector kept.
selected_indexes = feat_selector.selected_feature_indexes
print(selected_indexes)


In [57]:
# Reduce both splits to the selected features (train first, then test).
X_train_selected, X_test_selected = (
    feat_selector.transform(features) for features in (data.X_train, data.X_test)
)

In [None]:
# Inspect the class distribution of the encoded training labels.
unique_classes, class_counts = np.unique(data.y_train, return_counts=True)
for caption, values in (
    ("Unique classes in y_train:", unique_classes),
    ("Counts of each class in y_train:", class_counts),
):
    print(caption, values)

In [59]:
# Hyper-parameter search space for the SVC grid search.
#
# `gamma` is the kernel coefficient of the 'rbf' and 'poly' kernels and is
# ignored by the 'linear' kernel. Keeping everything in a single dict would
# refit the identical linear model once per gamma value, so the linear kernel
# gets its own sub-grid without `gamma`. GridSearchCV accepts a list of dicts
# and explores each sub-grid in turn.
param_grid = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 100],      # Regularization parameter
    },
    {
        'kernel': ['rbf', 'poly'],
        'C': [0.1, 1, 10, 100],      # Regularization parameter
        'gamma': ['scale', 'auto', 0.01, 0.1, 1],  # Kernel coefficient
    },
]

In [35]:
# Exhaustive search over `param_grid` for an SVC, scored by accuracy with
# 2-fold cross-validation; verbose=2 prints per-fit progress.
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=2,
    verbose=2,
)


In [60]:
# Run the grid search on the selected training features and encoded labels.
grid_search.fit(
    X=X_train_selected,
    y=data.y_train,
)


In [None]:
# Winning parameter combination and its mean cross-validation score.
best_params, best_score = grid_search.best_params_, grid_search.best_score_

In [None]:
# Report the grid-search outcome.
for caption, value in (
    ("Best parameters found: ", best_params),
    ("Best cross-validation score: ", best_score),
):
    print(caption, value)


In [None]:
# Take the best estimator found by the grid search and predict the labels
# of the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(
    X_test_selected,
)

In [None]:
# Score the predictions on the held-out test split. Precision and recall are
# averaged over classes, weighted by each class's support.
y_true = data.y_test
test_accuracy = accuracy_score(y_true, y_pred)
test_precision = precision_score(y_true, y_pred, average='weighted')
test_recall = recall_score(y_true, y_pred, average='weighted')

for caption, value in (
    ("Test Accuracy: ", test_accuracy),
    ("Test Precision: ", test_precision),
    ("Test Recall: ", test_recall),
):
    print(caption, value)


In [186]:
# Let the feature selector evaluate a fresh SVC on `data` for the metric
# codes "AS", "PS" and "RS".
# NOTE(review): presumably accuracy / precision / recall scores - confirm against the mafese docs.
results = feat_selector.evaluate(
    estimator=SVC(),
    data=data,
    metrics=["AS", "PS", "RS"],
)


In [None]:
# Print each evaluation metric on its own line as "<name>: <value>".
for metric_name in results.keys():
    metric_value = results[metric_name]
    print(f"{metric_name}: {metric_value}")


In [188]:
# Append a plain-text record of this run to the shared log file: timestamp,
# selector weights, the selector's evaluation metrics, and the grid-search /
# test-set scores computed above.
output = f"""
Run at: {datetime.now()}
Weights = {weights}
Outputs:
"""
for key in results.keys():
    output += f"\t{key}: {results[key]}\n"

# This segment has to be an f-string as well; as a plain string the
# placeholders would be written to the log literally instead of their values.
output += f"""
Best parameters: {best_params}
Best cross-validation score: {best_score}
Test Accuracy: {test_accuracy}
Test Precision: {test_precision}
Test Recall: {test_recall}
"""

# Make sure the target directory exists so a long run is not lost to a
# FileNotFoundError on a fresh checkout.
output_path = Path("outputs/outputs.txt")
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "a", encoding="UTF-8") as f:
    f.write(output)