<a href="https://colab.research.google.com/github/SunJ2432/neurohackathon24/blob/main/neurohackathon_alzheimers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'mri-and-alzheimers:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1980%2F3398%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20241001%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20241001T025941Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Da7cd3213cbd3f47ef930e1782a5c9ed19ff155ac6d72f1854860b4f48c06552771e05da3de3d66d6160730e9232583c69952a4a92d50aecf107dd22405c7453d3b70dbbe74de08561402b2c536d8db1eff66641b73658bc9b2ff1fb6bbab981900556b659899906353eee2a7de620067e7b1e3d3921d142bb95c4733879ea172c64c586d0888fa376571c015c3e3becd0c001ea801a69ebd7cf45adbf93febea7ecc36e33e799d5e6c2607ea80969ed51e4de88f6f042afbf756682d3c82919e4a801f636c3340212d579ec1a6f5cbd284ddfee370e28ee918445b58480b9749cfc9faa75a16c15ce35a094d4abefa06f6b43ac72a548bc53f0058f955109b3a'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
# Start from an empty /kaggle/input and make sure both Kaggle directories exist.
shutil.rmtree('/kaggle/input', ignore_errors=True)
for kaggle_dir in (KAGGLE_INPUT_PATH, KAGGLE_WORKING_PATH):
  os.makedirs(kaggle_dir, 0o777, exist_ok=True)

# Expose the directories as ../input and ../working as well; a link that
# already exists is left untouched.
for link_target, link_name in ((KAGGLE_INPUT_PATH, 'input'), (KAGGLE_WORKING_PATH, 'working')):
  try:
    os.symlink(link_target, os.path.join("..", link_name), target_is_directory=True)
  except FileExistsError:
    pass

# Download every "<directory>:<percent-encoded URL>" entry of DATA_SOURCE_MAPPING
# and unpack it under KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first colon only, so a literal ':' in the URL part cannot
    # break the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The Content-Length header may be absent; the progress bar is only
            # drawn when the total size is known.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_length:
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` would
                # replace the module and break the next tar download.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must be caught before OSError, which HTTPError derives from.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pylab as plt
import seaborn as sns
plt.style.use('ggplot')
pd.set_option('display.max_columns', 200)
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Cleanup

In [None]:
# Load the OASIS cross-sectional CSV from the Kaggle input directory and preview its first rows.
cross_sectional = pd.read_csv('/kaggle/input/mri-and-alzheimers/oasis_cross-sectional.csv')
cross_sectional.head()


In [None]:
# Drop the ID, Delay and Hand columns, then keep only rows that have a CDR value.
cross_sectional = (
    cross_sectional
    .drop(columns=['ID', 'Delay', 'Hand'])
    .dropna(subset=['CDR'])
)
cross_sectional.head()

In [None]:
# convert gender to binary (0 = f, 1 = m)
# One-hot encode M/F, keep only the 'M' indicator and store it as isMale.
dummies = pd.get_dummies(cross_sectional['M/F']).astype(int).drop(columns='F')
cross_sectional = (
    pd.concat([cross_sectional, dummies], axis='columns')
    .drop(columns='M/F')
    .rename(columns={"M": "isMale"})
)
cross_sectional.head()

In [None]:
# fill nulls in SES
# Missing SES values are replaced by the column mean. ses_mean stays a
# notebook-level name because the SES pie chart uses it to filter imputed rows.
ses_mean = cross_sectional['SES'].mean()
cross_sectional['SES'] = cross_sectional['SES'].fillna(ses_mean)

# Linear Regression

In [None]:
# Every column except the target 'CDR' is a candidate predictor.
column = [name for name in cross_sectional.columns.array if name != 'CDR']

col = pd.Series(column).to_numpy()
# Row-wise predictor values as a list of lists. Each column is converted to a
# NumPy array once and the arrays are zipped into rows, instead of converting
# the whole column again for every single cell.
feature_arrays = [cross_sectional[name].to_numpy() for name in col]
x = [list(row) for row in zip(*feature_arrays)]

In [None]:
def simpleLR(X, Y, X2, Y2):
  """Fit a LinearRegression on (X, Y) and evaluate it on (X2, Y2).

  Returns ``(b0, b1, yhat, score)``: the fitted intercept, the fitted
  coefficients, the predictions for ``X2``, and the R^2 of those
  predictions against ``Y2``.
  """
  fitted = LinearRegression().fit(X, Y)
  predictions = fitted.predict(X2)
  return fitted.intercept_, fitted.coef_, predictions, r2_score(Y2, predictions)

In [None]:
import matplotlib.pyplot as plt

# Fit one single-predictor regression of CDR per candidate variable on an
# 80% training sample, score it on the remaining rows and plot each fit.
results = {heading: [] for heading in ('Independent Variable', 'b0', 'b1', 'R2 Score')}
train = cross_sectional.sample(frac=0.8, random_state=31)
test = cross_sectional.drop(train.index)

for predictor in col:
  plt.subplots()  # fresh figure for this predictor
  intercept, slope, fitted, r2 = simpleLR(
      train[[predictor]], train[['CDR']], test[[predictor]], test[['CDR']])
  # The target is passed as a one-column frame, so the intercept and the
  # coefficient come back wrapped in arrays and are unwrapped by indexing.
  results['Independent Variable'].append(predictor)
  results['b0'].append(intercept[0])
  results['b1'].append(slope[0][0])
  results['R2 Score'].append(r2)
  plt.scatter(test[predictor], test['CDR'])
  plt.plot(test[predictor], fitted, 'b')
  plt.xlabel(predictor)
  plt.ylabel("CDR")
  plt.title(f"{predictor} VS CDR")
  plt.grid()
print(results['R2 Score'])

In [None]:
def MLR(X,Y,X2, Y2):
  """Fit a LinearRegression on (X, Y) and score it on (X2, Y2).

  Returns ``(b0, b1, score)``: the fitted intercept, the fitted coefficients
  and the R^2 of the model's predictions for ``X2`` against ``Y2``.
  """
  fitted = LinearRegression().fit(X, Y)
  held_out_r2 = r2_score(Y2, fitted.predict(X2))
  return fitted.intercept_, fitted.coef_, held_out_r2



In [None]:
# Result table scaffold: one column per summary field and per predictor, each
# holding 7 zero-initialised slots (one per subset size tried by the
# range(2, 9) loop in this cell).
key = ['Number of Independent Variables', 'Best R2 Score', *col]
zeros = [0] * 7
value = [list(zeros) for _ in key]
results = dict(zip(key, value))
def gen_subset(indep, n):
  """Return every combination of ``n`` items from ``indep`` as a list of lists.

  Items keep their relative order from ``indep``, and combinations are
  produced in positional order (everything starting with the first item,
  then the second, and so on).

  ``n == 0`` gives ``[[]]``; an ``n`` larger than ``len(indep)`` or a
  negative ``n`` gives ``[]``.
  """
  from itertools import combinations

  if n < 0:
    # combinations() rejects a negative size; report "no subsets" instead.
    return []
  return [list(combo) for combo in combinations(indep, n)]

# For every subset size from 2 to 8, fit a regression on each predictor subset
# and record the subset with the highest R^2.
# NOTE(review): subsets are ranked by their score on the held-out rows, so the
# reported "best" scores are optimistic estimates.
train = cross_sectional.sample(frac = 0.8, random_state = 31)
test = cross_sectional.drop(train.index)

# The target vectors do not depend on the chosen predictors; build them once.
y = np.asanyarray(train['CDR'])
y2 = np.asanyarray(test['CDR'])

for i in range(2, 9):
  # R^2 can be negative, so start below any attainable score; starting at 0
  # would report "score 0, no variables" when every subset scores below 0.
  best_b0, best_b1, best_score, best_indep = 0, 0, float('-inf'), []
  for j in gen_subset(col, i):
    x = np.asanyarray(train[j])
    x2 = np.asanyarray(test[j])
    b0, b1, score = MLR(x,y,x2,y2)
    if score > best_score:
      best_score, best_b0, best_b1, best_indep = score, b0, b1, j
  # Flag the predictors that belong to the winning subset of this size.
  for name in best_indep:
    results[name][i-2] = 1
  results['Number of Independent Variables'][i-2] = i
  if best_indep:
    # When no subset was evaluated, the slot keeps its initial 0.
    results['Best R2 Score'][i-2] = best_score

results = pd.DataFrame(results)
display(results)

# Random Forest Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
# CDR is the target; every other column is a feature.
y = cross_sectional['CDR']
X = cross_sectional.drop(columns=['CDR'])

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 28)

In [None]:
# Treat each distinct CDR value as a class label by casting the target to strings.
# NOTE(review): no random_state is passed here, unlike the seeded split and the
# seeded regressor in this notebook, so results may differ between runs —
# confirm whether that is intended.
fc = RandomForestClassifier()
fc.fit(x_train, y_train.astype("string"))

In [None]:
# Predict CDR classes for the test rows and report per-class metrics.
# The true labels are cast to strings to match how the classifier was trained.
actual_labels = y_test.astype("string")
yfc_pred = fc.predict(x_test)
dfc = pd.DataFrame({"prediction" : yfc_pred, "actual" : actual_labels.tolist()})
print(classification_report(actual_labels, yfc_pred))

In [None]:
# Random Forest regressor with 10 decision trees; random_state is fixed so
# repeated runs build the same forest.
forest = RandomForestRegressor(n_estimators = 10, random_state = 0)

# Fit on the training split, with the numeric CDR values as the regression target.
forest.fit(x_train, y_train)

In [None]:
# Predict CDR for the held-out test rows.
y_pred = forest.predict(x_test)
# Side-by-side table of predicted and actual values; kept as the cell's last
# expression so the notebook displays it.
df = pd.DataFrame({"prediction" : y_pred, "actual" : y_test.tolist()})
df

In [None]:
# RMSE (Root Mean Square Error), rounded to four decimal places
mse = mean_squared_error(y_test, y_pred)
rmse = float(f"{np.sqrt(mse):.4f}")
print("\nRMSE: ", rmse)

In [None]:
# Normalized RMSE: the RMSE divided by the observed range of CDR
cdr_values = cross_sectional['CDR']
min_val, max_val = cdr_values.min(), cdr_values.max()
n_rmse = rmse / (max_val - min_val)
print("Normalized RMSE: ", n_rmse)

# Data Visualization

In [None]:
# heatmap showing correlations
# Rows with any missing value are dropped before the pairwise correlations are computed.
cross_sectional_corr = cross_sectional.dropna().corr()

sns.heatmap(data=cross_sectional_corr, annot=True, fmt=".1f")

In [None]:
# Distributions of the individual variables, one subplot per variable:
# 50-bin histograms for Age, MMSE, eTIV, nWBV and ASF, and count plots split
# by isMale for Educ and SES.
fig, ax = plt.subplots(7,1, figsize=(10, 10))
plt.subplots_adjust(top = 2)

sns.histplot(cross_sectional['Age'], palette=sns.color_palette('pastel')[1:3], bins=50, ax=ax[0]);
sns.countplot(cross_sectional, x='Educ', hue='isMale', palette=sns.color_palette('pastel')[1:3],ax=ax[1]);
sns.countplot(cross_sectional, x='SES', hue='isMale', palette=sns.color_palette('pastel')[1:3],ax=ax[2]);
sns.histplot(cross_sectional['MMSE'], palette=sns.color_palette('pastel')[1:3], bins=50, ax=ax[3]);
sns.histplot(cross_sectional['eTIV'], palette=sns.color_palette('pastel')[1:3], bins=50, ax=ax[4]);
sns.histplot(cross_sectional['nWBV'], palette=sns.color_palette('pastel')[1:3], bins=50, ax=ax[5]);
sns.histplot(cross_sectional['ASF'], palette=sns.color_palette('pastel')[1:3], bins=50, ax=ax[6]);

In [None]:
# Pairwise plots of the listed variables, coloured by isMale.
sns.pairplot(cross_sectional,
             vars=['Age', 'Educ', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF'],
            hue='isMale', palette=sns.color_palette('pastel')[1:3])
plt.show()

In [None]:
# Gender data in a pie
# value_counts() orders categories by frequency, so each label is derived from
# the isMale value it belongs to (1 = male, 0 = female) rather than from an
# assumed order.
gender_counts = cross_sectional["isMale"].value_counts()
plt.pie(gender_counts,
        labels=['Male' if is_male else 'Female' for is_male in gender_counts.index],
        autopct="%1.1f%%",
        colors=sns.color_palette('pastel')[0:2])
plt.title("Gender Distribution")

In [None]:
# Education data in a pie
# NOTE(review): code-to-label mapping assumed from the OASIS data dictionary
# (1 = lowest level ... 5 = highest) — confirm against the dataset documentation.
educ_labels = {
    1: 'Less than High School Grad',
    2: 'High School Grad',
    3: 'Some College',
    4: 'College Grad',
    5: 'Beyond College',
}
# value_counts() orders categories by frequency, so look each label up from its
# code instead of relying on a fixed label order; unknown codes show as-is.
educ_counts = cross_sectional["Educ"].value_counts()
plt.pie(educ_counts,
        labels=[educ_labels.get(level, str(level)) for level in educ_counts.index],
        autopct="%1.1f%%",
        colors=sns.color_palette('pastel')[0:5])
plt.title("Education Distribution")

In [None]:
# SES data in a pie
# Rows whose SES equals the imputed mean are excluded before counting.
mask = cross_sectional["SES"] == ses_mean
ses_counts = cross_sectional["SES"][~mask].value_counts()
plt.pie(ses_counts,
        labels=ses_counts.keys(),
        autopct="%1.1f%%",
        colors=sns.color_palette('pastel')[0:6])
plt.title("Socioeconomic Status Distribution")

In [None]:
# Age histogram
sns.histplot(x=cross_sectional["Age"])
plt.title("Age Distribution");

In [None]:
# MMSE histogram
sns.histplot(x=cross_sectional["MMSE"])
plt.title("Mini Mental State Examination Distribution");

In [None]:
# eTIV histogram
sns.histplot(x=cross_sectional["eTIV"])
plt.title("Estimated Total Intracranial Volume Distribution");

In [None]:
# nWBV histogram
sns.histplot(x=cross_sectional["nWBV"])
# Title corrected from "Normalize" to "Normalized".
plt.title("Normalized Whole Brain Volume Distribution");

In [None]:
# ASF histogram
sns.histplot(x=cross_sectional["ASF"])
plt.title("ASF Distribution");