In [13]:
import numpy as np
import pandas as pd
import seaborn as sns

import os
import matplotlib.pyplot as plt

import sklearn
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, accuracy_score, confusion_matrix, roc_curve
from scipy.stats import zscore, pearsonr, uniform
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV

from scipy.io import loadmat

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

In [15]:
# Loading in old (unrevised) dataframes
file_path_trainC = "/Users/mayapatel/UCLA_WiDs_Team-21/data/TRAIN/TRAIN_CATEGORICAL_METADATA.xlsx"
train_cat = pd.read_excel(file_path_trainC)

file_path_trainFCM = "/Users/mayapatel/UCLA_WiDs_Team-21/data/TRAIN/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv"
train_FCM = pd.read_csv(file_path_trainFCM)

file_path_trainQ = "/Users/mayapatel/UCLA_WiDs_Team-21/data/TRAIN/TRAIN_QUANTITATIVE_METADATA.xlsx"
train_Quant = pd.read_excel(file_path_trainQ)

file_path_trainS = file_path_trainQ = "/Users/mayapatel/UCLA_WiDs_Team-21/data/TRAIN/TRAINING_SOLUTIONS.xlsx"
train_Solutions = pd.read_excel(file_path_trainS)

## Preprocessing

In [17]:
# Set up for one hot encoding
for col in train_cat.select_dtypes(include='int').columns:
    train_cat[col] = train_cat[col].astype('category')

In [18]:
# Creating a list of all of the columns except the first
columns_to_encode = train_cat.columns[1:].tolist()

# Print the columns to encode
print("Columns to encode:", columns_to_encode)

Columns to encode: ['Basic_Demos_Enroll_Year', 'Basic_Demos_Study_Site', 'PreInt_Demos_Fam_Child_Ethnicity', 'PreInt_Demos_Fam_Child_Race', 'MRI_Track_Scan_Location', 'Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P1_Occ', 'Barratt_Barratt_P2_Edu', 'Barratt_Barratt_P2_Occ']


In [19]:
# encoding categorical data
train_encoded = pd.get_dummies(train_cat[columns_to_encode], drop_first=True)
train_encoded = train_encoded.applymap(lambda x: 1 if x is True else (0 if x is False else x))

  train_encoded = train_encoded.applymap(lambda x: 1 if x is True else (0 if x is False else x))


In [20]:
print(train_cat.columns)

Index(['participant_id', 'Basic_Demos_Enroll_Year', 'Basic_Demos_Study_Site',
       'PreInt_Demos_Fam_Child_Ethnicity', 'PreInt_Demos_Fam_Child_Race',
       'MRI_Track_Scan_Location', 'Barratt_Barratt_P1_Edu',
       'Barratt_Barratt_P1_Occ', 'Barratt_Barratt_P2_Edu',
       'Barratt_Barratt_P2_Occ'],
      dtype='object')


In [21]:
columns_to_encode

['Basic_Demos_Enroll_Year',
 'Basic_Demos_Study_Site',
 'PreInt_Demos_Fam_Child_Ethnicity',
 'PreInt_Demos_Fam_Child_Race',
 'MRI_Track_Scan_Location',
 'Barratt_Barratt_P1_Edu',
 'Barratt_Barratt_P1_Occ',
 'Barratt_Barratt_P2_Edu',
 'Barratt_Barratt_P2_Occ']

In [22]:
# Combine encoded columns with the rest of the DataFrame
cat_train_final = pd.concat([train_cat.drop(columns=columns_to_encode), train_encoded], axis=1)

# ensure it looks correct
cat_train_final.head()

Unnamed: 0,participant_id,PreInt_Demos_Fam_Child_Ethnicity,Basic_Demos_Enroll_Year_2016,Basic_Demos_Enroll_Year_2017,Basic_Demos_Enroll_Year_2018,Basic_Demos_Enroll_Year_2019,Basic_Demos_Enroll_Year_2020,Basic_Demos_Study_Site_2,Basic_Demos_Study_Site_3,Basic_Demos_Study_Site_4,...,Barratt_Barratt_P2_Edu_21,Barratt_Barratt_P2_Occ_5,Barratt_Barratt_P2_Occ_10,Barratt_Barratt_P2_Occ_15,Barratt_Barratt_P2_Occ_20,Barratt_Barratt_P2_Occ_25,Barratt_Barratt_P2_Occ_30,Barratt_Barratt_P2_Occ_35,Barratt_Barratt_P2_Occ_40,Barratt_Barratt_P2_Occ_45
0,UmrK0vMLopoR,0.0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
1,CPaeQkhcjg7d,1.0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,Nb4EetVPm3gs,1.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,p4vPhVu91o4b,0.0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,M09PXs7arQ5E,0.0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


### Train and Test Dataframes

In [24]:
# load in test categorical dataframe

file_path_testC = "/Users/mayapatel/UCLA_WiDs_Team-21/data/TEST/TEST_CATEGORICAL.xlsx"
test_cat = pd.read_excel(file_path_testC)
test_cat.head()

Unnamed: 0,participant_id,Basic_Demos_Enroll_Year,Basic_Demos_Study_Site,PreInt_Demos_Fam_Child_Ethnicity,PreInt_Demos_Fam_Child_Race,MRI_Track_Scan_Location,Barratt_Barratt_P1_Edu,Barratt_Barratt_P1_Occ,Barratt_Barratt_P2_Edu,Barratt_Barratt_P2_Occ
0,Cfwaf5FX7jWK,2022,4,0.0,0.0,4,21.0,30.0,18.0,30.0
1,vhGrzmvA3Hjq,2023,4,0.0,0.0,4,21.0,45.0,,30.0
2,ULliyEXjy4OV,2022,4,0.0,0.0,4,21.0,40.0,18.0,40.0
3,LZfeAb1xMtql,2022,4,0.0,0.0,3,21.0,45.0,21.0,45.0
4,EnFOUv0YK1RG,2022,4,2.0,0.0,4,18.0,0.0,21.0,45.0


In [25]:
# convert our int variables to categories
for col in test_cat.select_dtypes(include='int').columns:
    test_cat[col] = test_cat[col].astype('category')

# Encode categorical variables in test
test_encoded = pd.get_dummies(test_cat[columns_to_encode], drop_first=True)
test_encoded = test_encoded.applymap(lambda x: 1 if x is True else (0 if x is False else x))

# Ensure test_encoded has the same columns as train_encoded
missing_cols = set(train_encoded.columns) - set(test_encoded.columns)
for col in missing_cols:
    test_encoded[col] = 0  # Add missing columns with 0 values

# Ensure test_encoded columns are in the same order as train_encoded
test_encoded = test_encoded.reindex(columns=train_encoded.columns, fill_value=0)

# Combine encoded columns with the rest of the DataFrame
cat_test_final = pd.concat([test_cat.drop(columns=columns_to_encode), test_encoded], axis=1)

cat_test_final.head()

  test_encoded = test_encoded.applymap(lambda x: 1 if x is True else (0 if x is False else x))


Unnamed: 0,participant_id,PreInt_Demos_Fam_Child_Ethnicity,Basic_Demos_Enroll_Year_2016,Basic_Demos_Enroll_Year_2017,Basic_Demos_Enroll_Year_2018,Basic_Demos_Enroll_Year_2019,Basic_Demos_Enroll_Year_2020,Basic_Demos_Study_Site_2,Basic_Demos_Study_Site_3,Basic_Demos_Study_Site_4,...,Barratt_Barratt_P2_Edu_21,Barratt_Barratt_P2_Occ_5,Barratt_Barratt_P2_Occ_10,Barratt_Barratt_P2_Occ_15,Barratt_Barratt_P2_Occ_20,Barratt_Barratt_P2_Occ_25,Barratt_Barratt_P2_Occ_30,Barratt_Barratt_P2_Occ_35,Barratt_Barratt_P2_Occ_40,Barratt_Barratt_P2_Occ_45
0,Cfwaf5FX7jWK,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,vhGrzmvA3Hjq,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ULliyEXjy4OV,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,LZfeAb1xMtql,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,EnFOUv0YK1RG,2.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Merging Dataframes

In [27]:
train_cat_FCM = pd.merge(cat_train_final, train_FCM, on = 'participant_id')

In [28]:
train_df = pd.merge(train_cat_FCM, train_Quant, on = 'participant_id')

train_df.head()

Unnamed: 0,participant_id,PreInt_Demos_Fam_Child_Ethnicity,Basic_Demos_Enroll_Year_2016,Basic_Demos_Enroll_Year_2017,Basic_Demos_Enroll_Year_2018,Basic_Demos_Enroll_Year_2019,Basic_Demos_Enroll_Year_2020,Basic_Demos_Study_Site_2,Basic_Demos_Study_Site_3,Basic_Demos_Study_Site_4,...,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,UmrK0vMLopoR,0.0,1,0,0,0,0,0,0,0,...,0,6,1,5,0,5,1,0,10,
1,CPaeQkhcjg7d,1.0,0,0,0,1,0,0,1,0,...,0,18,6,8,7,8,10,4,5,
2,Nb4EetVPm3gs,1.0,1,0,0,0,0,0,0,0,...,1,14,2,8,5,7,6,4,9,8.239904
3,p4vPhVu91o4b,0.0,0,0,1,0,0,0,1,0,...,6,24,4,16,9,10,8,4,6,
4,M09PXs7arQ5E,0.0,0,0,0,1,0,0,1,0,...,1,18,4,11,4,10,7,3,9,8.940679


**Merge Test Dataframes**

In [31]:
file_path_testFCM = "/Users/mayapatel/UCLA_WiDs_Team-21/data/TEST/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv"
test_FCM = pd.read_csv(file_path_testFCM)

file_path_testQ = "/Users/mayapatel/UCLA_WiDs_Team-21/data/TEST/TEST_QUANTITATIVE_METADATA.xlsx"
test_Quant = pd.read_excel(file_path_testQ)

test_cat_FCM = pd.merge(cat_test_final, test_FCM, on = 'participant_id')

test_df = pd.merge(test_cat_FCM, test_Quant, on = 'participant_id')

# ensure it looks accurate
test_df.head()

Unnamed: 0,participant_id,PreInt_Demos_Fam_Child_Ethnicity,Basic_Demos_Enroll_Year_2016,Basic_Demos_Enroll_Year_2017,Basic_Demos_Enroll_Year_2018,Basic_Demos_Enroll_Year_2019,Basic_Demos_Enroll_Year_2020,Basic_Demos_Study_Site_2,Basic_Demos_Study_Site_3,Basic_Demos_Study_Site_4,...,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,Cfwaf5FX7jWK,0.0,0,0,0,0,0,0,0,0,...,2.0,12.0,3.0,9.0,2.0,7.0,3.0,0.0,8.0,8.992813
1,vhGrzmvA3Hjq,0.0,0,0,0,0,0,0,0,0,...,2.0,16.0,8.0,5.0,7.0,3.0,11.0,3.0,9.0,12.324093
2,ULliyEXjy4OV,0.0,0,0,0,0,0,0,0,0,...,1.0,7.0,1.0,6.0,1.0,5.0,1.0,0.0,9.0,7.770933
3,LZfeAb1xMtql,0.0,0,0,0,0,0,0,0,0,...,4.0,15.0,4.0,10.0,8.0,6.0,5.0,1.0,6.0,9.304814
4,EnFOUv0YK1RG,2.0,0,0,0,0,0,0,0,0,...,2.0,18.0,6.0,12.0,5.0,10.0,6.0,0.0,10.0,8.26135


In [32]:
len(test_df.columns)

19973

### Handling NA Values

In [42]:
# check how many NA values we have
print(train_df.isna().sum())

participant_id                        0
PreInt_Demos_Fam_Child_Ethnicity     11
Basic_Demos_Enroll_Year_2016          0
Basic_Demos_Enroll_Year_2017          0
Basic_Demos_Enroll_Year_2018          0
                                   ... 
SDQ_SDQ_Hyperactivity                 0
SDQ_SDQ_Internalizing                 0
SDQ_SDQ_Peer_Problems                 0
SDQ_SDQ_Prosocial                     0
MRI_Track_Age_at_Scan               360
Length: 19973, dtype: int64


**First, we will do it for the PreInt_Demos_Fam_Child_Ethnicity column**

In [44]:
# Accessing all rows where PreInt_Demos_Fam_Child_Ethnicity is null, will predict with this data
rows_with_null_PiDFCE = train_df[train_df['PreInt_Demos_Fam_Child_Ethnicity'].isna()]

test_rows_with_null_PiDFCE = test_df[test_df['PreInt_Demos_Fam_Child_Ethnicity'].isna()]

# Filling in null values of other cols with mean from full training data
rows_with_null_PiDFCE.fillna({'MRI_Track_Age_at_Scan':train_df['MRI_Track_Age_at_Scan'].mean()}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rows_with_null_PiDFCE.fillna({'MRI_Track_Age_at_Scan':train_df['MRI_Track_Age_at_Scan'].mean()}, inplace = True)


In [46]:
# Filling the testing data null values, besides PreInt_Demos_Fam_Child_Ethnicity column
for col in test_rows_with_null_PiDFCE.columns:
    if col != 'PreInt_Demos_Fam_Child_Ethnicity' and test_rows_with_null_PiDFCE[col].isna().sum() > 0:  # Skip PreInt_Demos_Fam_Child_Ethnicity and check if column has NaN values
        if test_rows_with_null_PiDFCE[col].dtype in ['float64', 'int64']:  # Ensure it's numeric
            test_rows_with_null_PiDFCE[col] = test_rows_with_null_PiDFCE[col].fillna(test_df[col].mean())  # Avoid inplace
        else:
            print(f"Skipping non-numeric column: {col}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_rows_with_null_PiDFCE[col] = test_rows_with_null_PiDFCE[col].fillna(test_df[col].mean())  # Avoid inplace


In [48]:
test_rows_with_null_PiDFCE.isna().sum()

participant_id                      0
PreInt_Demos_Fam_Child_Ethnicity    3
Basic_Demos_Enroll_Year_2016        0
Basic_Demos_Enroll_Year_2017        0
Basic_Demos_Enroll_Year_2018        0
                                   ..
SDQ_SDQ_Hyperactivity               0
SDQ_SDQ_Internalizing               0
SDQ_SDQ_Peer_Problems               0
SDQ_SDQ_Prosocial                   0
MRI_Track_Age_at_Scan               0
Length: 19973, dtype: int64

In [49]:
# Creating training dataset to predict null values

# Dataframe without null values in PiDFCE column
train_PiDFCE_df = train_df[~train_df.index.isin(rows_with_null_PiDFCE.index)]

# Fill in other null column with mean
train_PiDFCE_df.fillna({'MRI_Track_Age_at_Scan':train_df['MRI_Track_Age_at_Scan'].mean()}, inplace = True)

# Xtrain = all columns besides train d
# Ytrain = pidfce from train_df.drop rows with null pidfce

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_PiDFCE_df.fillna({'MRI_Track_Age_at_Scan':train_df['MRI_Track_Age_at_Scan'].mean()}, inplace = True)


In [50]:
train_PiDFCE_df.head()

Unnamed: 0,participant_id,PreInt_Demos_Fam_Child_Ethnicity,Basic_Demos_Enroll_Year_2016,Basic_Demos_Enroll_Year_2017,Basic_Demos_Enroll_Year_2018,Basic_Demos_Enroll_Year_2019,Basic_Demos_Enroll_Year_2020,Basic_Demos_Study_Site_2,Basic_Demos_Study_Site_3,Basic_Demos_Study_Site_4,...,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
0,UmrK0vMLopoR,0.0,1,0,0,0,0,0,0,0,...,0,6,1,5,0,5,1,0,10,11.245678
1,CPaeQkhcjg7d,1.0,0,0,0,1,0,0,1,0,...,0,18,6,8,7,8,10,4,5,11.245678
2,Nb4EetVPm3gs,1.0,1,0,0,0,0,0,0,0,...,1,14,2,8,5,7,6,4,9,8.239904
3,p4vPhVu91o4b,0.0,0,0,1,0,0,0,1,0,...,6,24,4,16,9,10,8,4,6,11.245678
4,M09PXs7arQ5E,0.0,0,0,0,1,0,0,1,0,...,1,18,4,11,4,10,7,3,9,8.940679


In [51]:
train_PiDFCE_df.shape

(1202, 19973)

In [52]:
#Takes 9-10 minutes to run this cell
# Calculate the correlation matrix
# correlation_matrix = train_PiDFCE_df.drop('participant_id', axis=1).corr()

# # Extract the correlation values for column 'a'
# correlations = correlation_matrix['PreInt_Demos_Fam_Child_Ethnicity']

# # Step 3: Sort the correlations and pick the top 10, excluding column 'a' itself
# top_10_columns = correlations.drop('PreInt_Demos_Fam_Child_Ethnicity').abs().sort_values(ascending=False).head(10)

# print(top_10_columns)

PreInt_Demos_Fam_Child_Race_2 0.293147\
PreInt_Demos_Fam_Child_Race_10 0.258507\
Barratt_Barratt_P2_Occ_45 0.151849\
PreInt_Demos_Fam_Child_Race_8 0.147055\
Barratt_Barratt_P2_Edu_21 0.145573\
43throw_181thcolumn 0.132470\
36throw_51thcolumn 0.122027\
68throw_110thcolumn 0.120913\
158throw_189thcolumn 0.113612\
30throw_192thcolumn 0.111799

In [54]:
cols_to_include = ['PreInt_Demos_Fam_Child_Race_2', 'PreInt_Demos_Fam_Child_Race_10', 'Barratt_Barratt_P2_Occ_45']

In [55]:
train_PiDFCE_df[cols_to_include].head()

Unnamed: 0,PreInt_Demos_Fam_Child_Race_2,PreInt_Demos_Fam_Child_Race_10,Barratt_Barratt_P2_Occ_45
0,0,0,1
1,1,0,0
2,0,0,0
3,0,0,0
4,0,0,0


In [56]:
X = train_PiDFCE_df[cols_to_include]
y = train_PiDFCE_df['PreInt_Demos_Fam_Child_Ethnicity']

In [57]:
print(X.shape)
print(y.shape)

(1202, 3)
(1202,)


In [58]:
# KNN Model

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

f1_macro = f1_score(y_test, y_pred, average='macro')
f1_weighted = f1_score(y_test, y_pred, average='weighted')
f1_micro = f1_score(y_test, y_pred, average='micro')

print(f"Macro F1 Score: {f1_macro}")
print(f"Weighted F1 Score: {f1_weighted}")
print(f"Micro F1 Score: {f1_micro}")


new_PiDFCE_vals = knn.predict(rows_with_null_PiDFCE[cols_to_include])
new_test_PiDFCE_vals = knn.predict(test_rows_with_null_PiDFCE[cols_to_include])

Accuracy: 0.79
Macro F1 Score: 0.4134834584375868
Weighted F1 Score: 0.7451386466253426
Micro F1 Score: 0.7867036011080333


In [59]:
new_test_PiDFCE_vals

array([0., 0., 0.])

In [62]:
# Fill in null values:
train_df.loc[train_df['PreInt_Demos_Fam_Child_Ethnicity'].isna(), 'PreInt_Demos_Fam_Child_Ethnicity'] = new_PiDFCE_vals
print(train_df.isna().sum())

test_df.loc[test_df['PreInt_Demos_Fam_Child_Ethnicity'].isna(), 'PreInt_Demos_Fam_Child_Ethnicity'] = new_test_PiDFCE_vals
print(test_df.isna().sum())

participant_id                        0
PreInt_Demos_Fam_Child_Ethnicity      0
Basic_Demos_Enroll_Year_2016          0
Basic_Demos_Enroll_Year_2017          0
Basic_Demos_Enroll_Year_2018          0
                                   ... 
SDQ_SDQ_Hyperactivity                 0
SDQ_SDQ_Internalizing                 0
SDQ_SDQ_Peer_Problems                 0
SDQ_SDQ_Prosocial                     0
MRI_Track_Age_at_Scan               360
Length: 19973, dtype: int64
participant_id                       0
PreInt_Demos_Fam_Child_Ethnicity     0
Basic_Demos_Enroll_Year_2016         0
Basic_Demos_Enroll_Year_2017         0
Basic_Demos_Enroll_Year_2018         0
                                    ..
SDQ_SDQ_Hyperactivity               30
SDQ_SDQ_Internalizing               30
SDQ_SDQ_Peer_Problems               30
SDQ_SDQ_Prosocial                   30
MRI_Track_Age_at_Scan                0
Length: 19973, dtype: int64


**Now we will do it for the MRI_Track_Age_at_Scan column**

In [64]:
# Accessing all rows where MRI_Track_Age_at_Scan is null, will predict with this data
rows_with_null_MRITAS = train_df[train_df['MRI_Track_Age_at_Scan'].isna()]

# Filling in null values of other cols with mean from full training data
rows_with_null_MRITAS.fillna({'PreInt_Demos_Fam_Child_Ethnicity':train_df['PreInt_Demos_Fam_Child_Ethnicity'].mean()}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rows_with_null_MRITAS.fillna({'PreInt_Demos_Fam_Child_Ethnicity':train_df['PreInt_Demos_Fam_Child_Ethnicity'].mean()}, inplace = True)


In [65]:
# Creating training dataset to predict null values

# Dataframe without null values in PiDFCE column
train_MRITAS_df = train_df[~train_df.index.isin(rows_with_null_MRITAS.index)]

# Fill in other null column with mean
train_MRITAS_df.fillna({'PreInt_Demos_Fam_Child_Ethnicity':train_df['PreInt_Demos_Fam_Child_Ethnicity'].mean()}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_MRITAS_df.fillna({'PreInt_Demos_Fam_Child_Ethnicity':train_df['PreInt_Demos_Fam_Child_Ethnicity'].mean()}, inplace = True)


In [66]:
train_MRITAS_df.head()

Unnamed: 0,participant_id,PreInt_Demos_Fam_Child_Ethnicity,Basic_Demos_Enroll_Year_2016,Basic_Demos_Enroll_Year_2017,Basic_Demos_Enroll_Year_2018,Basic_Demos_Enroll_Year_2019,Basic_Demos_Enroll_Year_2020,Basic_Demos_Study_Site_2,Basic_Demos_Study_Site_3,Basic_Demos_Study_Site_4,...,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,SDQ_SDQ_Externalizing,SDQ_SDQ_Generating_Impact,SDQ_SDQ_Hyperactivity,SDQ_SDQ_Internalizing,SDQ_SDQ_Peer_Problems,SDQ_SDQ_Prosocial,MRI_Track_Age_at_Scan
2,Nb4EetVPm3gs,1.0,1,0,0,0,0,0,0,0,...,1,14,2,8,5,7,6,4,9,8.239904
4,M09PXs7arQ5E,0.0,0,0,0,1,0,0,1,0,...,1,18,4,11,4,10,7,3,9,8.940679
6,DgRP31gu21O9,0.0,0,0,1,0,0,0,1,0,...,0,9,1,5,1,5,4,3,10,16.768195
7,ClMA0FwvFgLY,0.0,0,0,0,1,0,0,0,0,...,3,16,3,11,6,8,5,2,6,11.221309
8,NVUkahaJ6fhf,0.0,0,0,0,0,0,0,0,0,...,5,23,6,14,2,9,9,3,10,8.570841


In [67]:
#Takes 9-10 minutes to run this cell
# Calculate the correlation matrix
# correlation_matrix = train_MRITAS_df.drop('participant_id', axis=1).corr()

# # Extract the correlation values for column 'a'
# correlations = correlation_matrix['MRI_Track_Age_at_Scan']

# # Step 3: Sort the correlations and pick the top 10, excluding column 'a' itself
# top_10_columns = correlations.drop('MRI_Track_Age_at_Scan').abs().sort_values(ascending=False).head(10)

# print(top_10_columns)

APQ_P_APQ_P_PM 0.575243\
161throw_190thcolumn 0.264117\
7throw_9thcolumn 0.250352\
106throw_108thcolumn 0.249552\
69throw_80thcolumn 0.234965\
156throw_158thcolumn 0.201897\
69throw_159thcolumn 0.201224\
51throw_158thcolumn 0.198524\
80throw_177thcolumn 0.198048\
92throw_93thcolumn 0.197352

In [69]:
cols_to_include_2 = ['APQ_P_APQ_P_PM', '161throw_190thcolumn', '7throw_9thcolumn', '106throw_108thcolumn', '69throw_80thcolumn','51throw_158thcolumn', '80throw_177thcolumn', '92throw_93thcolumn']

In [70]:
train_MRITAS_df.shape

(853, 19973)

In [71]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

X = train_MRITAS_df[cols_to_include_2]
y = train_MRITAS_df['MRI_Track_Age_at_Scan']


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

model = LinearRegression()

model.fit(X_train, y_train)

# Make predictions on test data
y_pred = model.predict(X_test)

# Evaluate the model
r2 = r2_score(y_test, y_pred)
print(f"R²: {r2}")

mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae}")

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")

rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")


new_MRITAS_vals = model.predict(rows_with_null_MRITAS[cols_to_include_2])
# Note: There are no null values in the testing dataset, that is why we are only predicting once.

R²: 0.46451444945364107
Mean Absolute Error (MAE): 1.8404247211783078
Mean Squared Error (MSE): 5.330261933620401
Root Mean Squared Error (RMSE): 2.308736003448727




In [72]:
# Fill Null Values
train_df.loc[train_df['MRI_Track_Age_at_Scan'].isna(), 'MRI_Track_Age_at_Scan'] = new_MRITAS_vals
train_df.isna().sum()

participant_id                      0
PreInt_Demos_Fam_Child_Ethnicity    0
Basic_Demos_Enroll_Year_2016        0
Basic_Demos_Enroll_Year_2017        0
Basic_Demos_Enroll_Year_2018        0
                                   ..
SDQ_SDQ_Hyperactivity               0
SDQ_SDQ_Internalizing               0
SDQ_SDQ_Peer_Problems               0
SDQ_SDQ_Prosocial                   0
MRI_Track_Age_at_Scan               0
Length: 19973, dtype: int64

In [73]:
test_df.isna().sum()

participant_id                       0
PreInt_Demos_Fam_Child_Ethnicity     0
Basic_Demos_Enroll_Year_2016         0
Basic_Demos_Enroll_Year_2017         0
Basic_Demos_Enroll_Year_2018         0
                                    ..
SDQ_SDQ_Hyperactivity               30
SDQ_SDQ_Internalizing               30
SDQ_SDQ_Peer_Problems               30
SDQ_SDQ_Prosocial                   30
MRI_Track_Age_at_Scan                0
Length: 19973, dtype: int64

In [74]:
# fill NA values using the mean
train_df.fillna({'MRI_Track_Age_at_Scan':train_df['MRI_Track_Age_at_Scan'].mean()}, inplace = True)
train_df.fillna({'PreInt_Demos_Fam_Child_Ethnicity':train_df['PreInt_Demos_Fam_Child_Ethnicity'].mean()}, inplace = True)

print(train_df.isna().sum().sum())

0


In [75]:
# Fill NAs of test data

for col in test_df.columns:
    if test_df[col].isna().sum() > 0:  # Check if the column has NaN values
        if test_df[col].dtype in ['float64', 'int64']:  # Ensure it's numeric
            test_df[col] = test_df[col].fillna(test_df[col].mean())  # Avoid inplace
        else:
            print(f"Skipping non-numeric column: {col}")

In [76]:
test_df.isna().sum().sum()

0

In [77]:
train_df.to_csv('/Users/mayapatel/UCLA_WiDs_Team-21/train-edited.csv', index=False)
test_df.to_csv('/Users/mayapatel/UCLA_WiDs_Team-21/test-edited.csv', index=False)