### Final Code for Sex differences project 

### 1. Prep data set 

In [None]:
%matplotlib inline
import glob
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split 

import pandas as pd
import seaborn as sns; sns.set()
import os
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline 
from sklearn.svm import LinearSVC


from sklearn.metrics import confusion_matrix


In [None]:
# Load the HCP participant spreadsheet into a DataFrame.
hcp_xlsx = r"C:\Users\Utilisateur\Documents\Functional connectivity\Sex_diff_project\HCP_database.xlsx"
df = pd.read_excel(hcp_xlsx)

In [None]:
# Keep only the rows whose "Handedness" score is at least 50 (per the original
# note, this is how left-handers are excluded). reset_index() renumbers the
# remaining rows and keeps the previous index as a new "index" column.
handedness_ok = df["Handedness"] >= 50
df = df.loc[handedness_ok].reset_index()
df

In [None]:
# Rename the participant-id column to "subject_label" and recode the
# "Gender" column as integers: "M" -> 0, "F" -> 1.
sex_codes = {"M": 0, "F": 1}
df = df.rename(columns={"Subject": "subject_label"})
df["Gender"] = df["Gender"].replace(sex_codes)
df

In [None]:
# Print how many rows are coded 0, then how many are coded 1, in "Gender".
gender_counts = df['Gender'].value_counts()
for code in (0, 1):
    print(gender_counts[code])

In [None]:
# Save the filtered, recoded table as CSV. to_csv is called with its defaults,
# so the DataFrame index is written as an extra first column.
final_csv = "C:/Users/Utilisateur/Documents/Functional connectivity/Sex_diff_project/Final_HCP_database.csv"
df.to_csv(final_csv)

In [None]:
# Read the table that holds each participant's id and sex.
participants_csv = "/home/xlajoie/Desktop/Final_HCP_database.csv"
df = pd.read_csv(participants_csv)

In [None]:
# Directory holding the seed-to-voxel matrices. The loading cell further down
# lists this folder and reads the .npy files it contains.

path_to_files = "/home/xlajoie/Desktop/Seeds2voxels_n100"

In [None]:
# Collect the seed-to-voxel NIfTI image paths in sorted order.
# NOTE(review): the pattern is relative to the current working directory
# rather than built from path_to_files — confirm this is intended.
path_list = sorted(glob.glob("Seeds2voxels_n100/*_Seeds2Voxels.nii.gz"))

In [None]:
# Build the feature matrix (x_correl) and label list (y_sex) from the .npy
# files in path_to_files; the folder also holds .nii.gz files, which are
# skipped. Each file name must start with the numeric subject id followed
# by "_" (int() raises ValueError otherwise).

# os.listdir returns entries in arbitrary order. Sorting makes the row order,
# and therefore the seeded train/test split further down, reproducible.
ls_sub = sorted(file for file in os.listdir(path_to_files) if file.endswith(".npy"))

x_correl = []
y_sex = []
for filename in ls_sub:
    sub_id = filename.split("_")[0]
    # Query the table once per file; subjects missing from df are skipped.
    sex_rows = df.loc[df['subject_label'] == int(sub_id), "Gender"]
    if sex_rows.empty:
        continue
    # Each matrix is flattened to one feature vector per subject.
    x_correl.append(np.load(os.path.join(path_to_files, filename)).flatten())
    y_sex.append(sex_rows.values[0])

x_correl = np.asarray(x_correl)

In [None]:
# Print the sum of the labels (i.e. how many samples are coded 1) next to the
# total number of samples.
n_coded_one, n_samples = sum(y_sex), len(y_sex)
print(n_coded_one, n_samples)

### 2. Classifier

In [None]:
# Hold out 20 % of the samples as a test set. The data are shuffled with a
# fixed seed and stratified on sex, so both subsets keep the same class
# proportions.
split_options = dict(
    test_size=0.2,      # 80 % train / 20 % test
    shuffle=True,       # shuffle before splitting
    stratify=y_sex,     # preserve the sex distribution in both subsets
    random_state=123,   # fixed seed for the shuffle
)
X_train, X_test, y_train, y_test = train_test_split(x_correl, y_sex, **split_options)

print('train:', len(X_train),'test:', len(X_test))

In [None]:
# Sweep over the number of PCA components and record, for each candidate, the
# mean 10-fold cross-validation score of a LinearSVC on the training set.
# A candidate of 0 means "no PCA": the classifier sees the raw features.


def _candidate_model(n_components):
    """Return a LinearSVC, preceded by PCA(n_components) unless it is 0."""
    classifier = LinearSVC(max_iter=2000)
    if n_components == 0:
        return classifier
    return Pipeline([
        ('feature_selection', PCA(n_components)),
        ('prediction', classifier),
    ])


# Candidates go from 0 up to (but excluding) 90 % of the training-set size,
# in steps of 10.
max_feature = int(0.9*len(X_train))
n_components_range = range(0, max_feature, 10)

score = []
for n_comp in tqdm(n_components_range):
    model = _candidate_model(n_comp)
    fold_scores = cross_val_score(model, X_train, y_train, cv=10, n_jobs = 3)
    score.append(fold_scores.mean())

In [None]:
# Materialise the candidate component counts as a list (x-axis of the plot)
# and display the mean cross-validation score obtained for each of them.
x = [*n_components_range]
print(score)

In [None]:
# Plot the mean cross-validation score against the number of PCA components.
line_style = {"color": "b", "marker": "*"}
plt.plot(x, score, **line_style)
plt.xlabel("Features")
plt.ylabel("Cross_val Score")

In [None]:
# Select the candidate with the highest mean cross-validation score
# (argmax returns the first one in case of ties).
best_idx = int(np.argmax(score))
best_ncomp = n_components_range[best_idx]
best_ncomp

In [None]:
# Refit the selected model on the whole training set and evaluate it on the
# held-out test set. best_ncomp == 0 means "no PCA" (plain LinearSVC).
if best_ncomp == 0:
    model = LinearSVC(max_iter=2000)
else:
    model = Pipeline([
            ('feature_selection', PCA(best_ncomp)),
            ('prediction', LinearSVC(max_iter=2000))
                      ])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# score() of a classifier is the mean accuracy on the given data (not R^2).
acc = model.score(X_test, y_test)

# confusion_matrix puts the true classes on the rows and the predicted classes
# on the columns. labels=[0, 1] pins the 2x2 layout even if one class is
# absent from y_test / y_pred, so the 4-way unpacking below cannot fail.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = conf_matrix.ravel()

# Kept under its own name so the participant table `df` is not overwritten.
# Row/column captions follow the M -> 0, F -> 1 coding used for "Gender".
conf_df = pd.DataFrame(
    data=[[tn, fp], [fn, tp]],
    index=["Actual M (0)", "Actual F (1)"],
    columns=["Predicted M (0)", "Predicted F (1)"],
)
print(conf_df)

# print results
print('accuracy =', acc)

In [None]:
# Extract the classifier weights in the original feature space.
# A Pipeline does not expose coef_, so the LinearSVC step has to be queried
# directly. Its weights are expressed in PCA-component space; since PCA is a
# linear projection (no whitening is requested here), multiplying by
# components_ maps them back to one weight per input feature.
if isinstance(model, Pipeline):
    svc_weights = model.named_steps['prediction'].coef_[0]
    pca_step = model.named_steps['feature_selection']
    coef = svc_weights @ pca_step.components_
else:
    coef = model.coef_[0]

# NOTE(review): presumably 8 seeds x 193369 voxels per seed — confirm against
# the shape of the saved .npy matrices.
coef_reshape = np.reshape(coef, (8, 193369))
coef_reshape