In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from scipy.stats import uniform
from scipy.io import loadmat

from sklearn.svm import SVC
from sklearn.metrics import (balanced_accuracy_score, roc_auc_score, accuracy_score,
                             confusion_matrix, roc_curve, mean_squared_error, r2_score)
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# The following lines adjust the granularity of reporting.
pd.options.display.max_rows = 10
pd.options.display.float_format = "{:.1f}".format


In [3]:


def convert_matrix_to_long_row(matrix):
    # Ensure it's a DataFrame
    if isinstance(matrix, np.ndarray):
        matrix = pd.DataFrame(matrix)

    # Get upper triangle of the matrix (excluding the diagonal)
    upper_triangle = matrix.where(np.triu(np.ones(matrix.shape), k=1).astype(bool))

    # Drop NaN values and get unique values
    unique_correlations = upper_triangle.stack().dropna().unique()

    return pd.DataFrame(unique_correlations).T




In [4]:
def generate_column_headers(matrix):
    rows, cols = matrix.shape
    headers = [f'{i}throw_{j}thcolumn' for i in range(rows) for j in range(cols) if i < j]
    return headers


In [5]:

def process_folder(source_folder, output_file):
    long_rows = []
    all_headers = []
    # Iterate over items in the source folder
    for filename in os.listdir(source_folder):
        file_path = os.path.join(source_folder, filename)
        # Print filename if needed
        # print(filename)

        # Edge cases: check if id is 12 bytes or 11
        len_id = 12  # default
        if filename[4:4+len_id][-1] == "_":
            print(filename[4:4+len_id])
            len_id = 11

        # Check if it is a file (and not a directory) and ends with .tsv
        if os.path.isfile(file_path) and filename.endswith('.tsv'):
            try:
                # Load the matrix
                matrix = pd.read_csv(file_path, sep='\t', header=None)

                # Generate column headers for this matrix
                headers = ['participant_id'] + generate_column_headers(matrix)
                all_headers = headers  # Assume all files have the same matrix shape

                # Convert the matrix to a long row of unique correlations
                long_row = convert_matrix_to_long_row(matrix)

                # Add participant ID
                extracted_id = filename[4:4+len_id]
                long_row.insert(0, "participant_id", extracted_id)

                # Append the long row as a new row in the results
                long_rows.append(long_row)

            except Exception as e:
                print(f"Error processing file {filename}: {e}")
        else:
            print(f"Skipping {filename} (not a file or not a TSV)")

    # Write out the file
    out_df = pd.concat(long_rows)
    out_df.columns = headers
    out_df.to_csv(output_file, sep='\t', header=True, index=False)
    return out_df




In [6]:

# Define source folders and output files (update paths as needed)
source_folder_test = '/content/drive/MyDrive/widsdatathon2025-university/test_tsv/test_tsv'
output_file_test = '/content/drive/MyDrive/widsdatathon2025-university/test_correlations.tsv'

source_folder_train = '/content/drive/MyDrive/widsdatathon2025-university/train_tsv/train_tsv'
output_file_train = '/content/drive/MyDrive/widsdatathon2025-university/train_correlations.tsv'



In [7]:
# Process test and train folders
test_big = process_folder(source_folder_test, output_file_test)




In [8]:

train_big = process_folder(source_folder_train, output_file_train)



NDARVB04TJA_
NDARJ257ZU2_
NDARVB04TJA_


In [9]:
# Checking lengths of the train/test big dataframes to make sure there were no errors in uploads/processing
print("# rows, train: ", len(train_big))  # should be: 1104 rows
print("# rows, test: ", len(test_big))    # should be: 474 rows




# rows, train:  1705
# rows, test:  474


In [10]:
# Read metadata from drive
train_short = pd.read_csv("/content/drive/MyDrive/widsdatathon2025-university/metadata/training_metadata.csv")
test_short  = pd.read_csv("/content/drive/MyDrive/widsdatathon2025-university/metadata/test_metadata.csv")


In [11]:
# Checking the size of our short dataframes
print("train_short shape:", train_short.shape)  # e.g., (1104, 4)
print("test_short shape:", test_short.shape)    # e.g., (474, 4)


train_short shape: (1104, 14)
test_short shape: (474, 13)


In [12]:

# Merge the big correlation data with the metadata
train = pd.merge(train_big, train_short, on='participant_id', how='outer')
test = pd.merge(test_big, test_short, on='participant_id', how='outer')


In [13]:
# Print the first few rows and shapes of the merged dataframes
print(train.head())
print(test.head())
print("Train shape:", train.shape)
print("Test shape:", test.shape)



  participant_id  0throw_1thcolumn  0throw_2thcolumn  0throw_3thcolumn  \
0   NDARAA306NT2               0.5               0.5               0.4   
1   NDARAA504CRN               0.1               0.7               0.7   
2   NDARAA536PTU               0.1               0.5               0.3   
3   NDARAB055BPR               0.3               0.6               0.1   
4   NDARAB458VK9               0.3               0.6               0.7   

   0throw_4thcolumn  0throw_5thcolumn  0throw_6thcolumn  0throw_7thcolumn  \
0               0.4               0.7               0.4               0.5   
1               0.5               0.7               0.6               0.5   
2               0.7               0.7               0.6               0.5   
3               0.2               0.5               0.5               0.7   
4               0.5               0.7               0.7               0.5   

   0throw_8thcolumn  0throw_9thcolumn  ...  bmi               ethnicity  \
0              -0

In [14]:

# Prepare features and targets
# For training features, drop the 'age' column (target)
X_train = train.drop(columns=['age'])
X_test = test.copy()  # test data does not include the target

In [15]:
# Define targets
y_train = train['age']
# Note: y_test is not used for submission; test metadata might have an age column for evaluation locally.
# Here we assume we predict for test and submit the participant IDs and predicted ages.


In [16]:

# Check for NaN values in X_train and X_test
print("NaN values in train:", pd.isna(X_train).sum().sum())
print("NaN values in test:", pd.isna(X_test).sum().sum())


NaN values in train: 2549
NaN values in test: 230


In [17]:
# List of categorical columns to one-hot encode
categorical_cols = ['sex', 'study_site', 'ethnicity', 'race', 'handedness', 'parent_1_education', 'parent_2_education']



In [18]:

# Fill missing values in categorical columns with "Unknown"
X_train.loc[:, categorical_cols] = X_train.loc[:, categorical_cols].fillna("Unknown")
X_test.loc[:, categorical_cols] = X_test.loc[:, categorical_cols].fillna("Unknown")


In [19]:
# Identify numerical columns from X_test (assumed to be same in X_train)
numerical_cols = X_test.select_dtypes(include=[np.number]).columns

# Impute numerical columns with the mean value
imputer_num = SimpleImputer(strategy='mean')
X_train.loc[:, numerical_cols] = imputer_num.fit_transform(X_train[numerical_cols])
X_test.loc[:, numerical_cols] = imputer_num.transform(X_test[numerical_cols])




In [20]:
# Ensure categorical columns are strings
X_train.loc[:, categorical_cols] = X_train.loc[:, categorical_cols].astype(str)
X_test.loc[:, categorical_cols] = X_test.loc[:, categorical_cols].astype(str)



In [21]:
# Apply One-Hot Encoding to the categorical columns
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train[categorical_cols])
X_test_encoded = encoder.transform(X_test[categorical_cols])

# Convert encoded columns into DataFrames
X_train_encoded_df = pd.DataFrame(X_train_encoded.todense(),
                                  columns=encoder.get_feature_names_out(categorical_cols))
X_test_encoded_df = pd.DataFrame(X_test_encoded.todense(),
                                 columns=encoder.get_feature_names_out(categorical_cols))


In [22]:
# Drop original categorical columns from X_train and X_test
X_train_numerical = X_train.drop(categorical_cols, axis=1).reset_index(drop=True)
X_test_numerical = X_test.drop(categorical_cols, axis=1).reset_index(drop=True)


In [23]:
# Combine the encoded categorical columns with the numerical columns
X_train_final = pd.concat([X_train_encoded_df, X_train_numerical], axis=1)
X_test_final = pd.concat([X_test_encoded_df, X_test_numerical], axis=1)


In [24]:
# Exclude any remaining non-numeric columns (if any)
X_train_final = X_train_final.select_dtypes(exclude=['object'])
X_test_final = X_test_final.select_dtypes(exclude=['object'])


In [25]:
# Ensure X_train_final aligns with y_train by reindexing if necessary
X_train_final = X_train_final.loc[y_train.index]


In [26]:
# -------------------------------
# MODELING: RIDGE REGRESSION WITH HYPERPARAMETER TUNING
# -------------------------------


In [27]:


# Set up the Ridge regressor
ridge = Ridge(random_state=42)

# Define the parameter distribution for alpha
param_dist = {'alpha': uniform(loc=0.01, scale=100)}

# Set up 5-fold cross-validation within RandomizedSearchCV
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(estimator=ridge,
                                   param_distributions=param_dist,
                                   n_iter=50,
                                   cv=cv,
                                   scoring='neg_mean_squared_error',  # Using negative MSE as the score
                                   random_state=42,
                                   n_jobs=-1)

# Fit the hyperparameter search on the training data
random_search.fit(X_train_final, y_train)

# Print the best hyperparameters and best cross-validation score
print("Best Hyperparameters:", random_search.best_params_)
best_cv_score = -random_search.best_score_  # Convert to positive MSE
print("Best CV Mean Squared Error: {:.2f}".format(best_cv_score))

# Retrieve the best model from the search
best_model = random_search.best_estimator_

# 9mins


Best Hyperparameters: {'alpha': 36.64618432936917}
Best CV Mean Squared Error: 1.89


In [28]:
# Evaluate the model on the full training set
y_train_pred = best_model.predict(X_train_final)
train_mse = mean_squared_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)
print("Training MSE: {:.2f}".format(train_mse))
print("Training R^2 Score: {:.2f}".format(train_r2))



Training MSE: 0.02
Training R^2 Score: 1.00


In [29]:
# Make predictions on the test set
y_pred = best_model.predict(X_test_final)

# Ensure predictions are non-negative (age is always positive)
y_pred = np.clip(y_pred, a_min=0, a_max=None)



In [30]:


# For demonstration, print participant IDs with their predicted age from the test set
for j, pred in enumerate(y_pred):
    print(test['participant_id'].iloc[j], pred)

# -------------------------------
# CREATE SUBMISSION FILE
# -------------------------------



NDARAA075AMK 8.252824570146142
NDARAA948VFH 9.143727250022382
NDARAC462DZH 7.318647908892123
NDARAC853CR6 6.254633210115355
NDARAD703XA2 9.371603129051241
NDARAE199TDD 6.992844483141456
NDARAF535XK6 13.819197157677596
NDARAG340ERT 12.66872757364018
NDARAH304ED7 10.322825277320598
NDARAH503YG1 8.89206219327546
NDARAH793FBF 5.519384712577361
NDARAJ366ZFA 9.594494802075324
NDARAL897CYV 11.634473769404174
NDARAM357BUA 11.921254887831815
NDARAP049KXJ 8.4764585490415
NDARAP785CTE 9.937376497165747
NDARAR025WX4 9.858642227005348
NDARAR238RZ8 10.7624685231956
NDARAT299YRR 9.4799703923181
NDARAV187GJ5 10.291865040666943
NDARAV894XWD 12.751846330986112
NDARAW179AYF 10.2581558390566
NDARAX283MAK 10.68002977330296
NDARAY298THW 9.046100656061935
NDARBD328NUQ 10.334422376119935
NDARBF042LDM 9.638730770593831
NDARBF183RFB 12.380070150638598
NDARBG574KF4 8.663185239306266
NDARBJ375VP4 9.666096194201023
NDARBK106KRH 11.621212515456111
NDARBL242L4H 10.681352617675413
NDARBN365EV3 12.22595011761192
NDARB

In [34]:
# Create submission DataFrame with two columns: participant_id and age
submission = pd.DataFrame({
    'participant_id': test['participant_id'],  # Use the ID column from the test DataFrame
    'age': np.round(y_pred, 2)  # Round age predictions to 2 decimal places
})

In [35]:
# Ensure participant IDs are stored as strings
submission['participant_id'] = submission['participant_id'].astype(str)


In [36]:
# Save the submission file to CSV
submission_file = '/content/drive/MyDrive/widsdatathon2025-university/kagglekonquest_ridge_regression_with_two_cols.csv'
submission.to_csv(submission_file, index=False)
print(f"Submission saved to {submission_file}")

Submission saved to /content/drive/MyDrive/widsdatathon2025-university/kagglekonquest_ridge_regression_with_two_cols.csv
