<a href="https://colab.research.google.com/github/Only-Mike/ADHD/blob/main/ADHD_Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Import necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
!git clone 'https://github.com/Only-Mike/ADHD.git'

Cloning into 'ADHD'...
remote: Enumerating objects: 853, done.[K
remote: Counting objects: 100% (370/370), done.[K
remote: Compressing objects: 100% (181/181), done.[K
remote: Total 853 (delta 223), reused 303 (delta 189), pack-reused 483[K
Receiving objects: 100% (853/853), 17.08 MiB | 16.27 MiB/s, done.
Resolving deltas: 100% (469/469), done.


In [3]:
df = pd.read_csv('/content/ADHD/datasets/KKI_phenotypic.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83 entries, 0 to 82
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ScanDir ID       83 non-null     int64  
 1   Site             83 non-null     int64  
 2   Gender           83 non-null     int64  
 3   Age              83 non-null     float64
 4   Handedness       83 non-null     int64  
 5   DX               83 non-null     int64  
 6   Secondary Dx     15 non-null     object 
 7   ADHD Measure     83 non-null     int64  
 8   ADHD Index       83 non-null     int64  
 9   Inattentive      83 non-null     int64  
 10  Hyper/Impulsive  83 non-null     int64  
 11  IQ Measure       83 non-null     int64  
 12  Verbal IQ        83 non-null     int64  
 13  Performance IQ   83 non-null     int64  
 14  Full2 IQ         0 non-null      float64
 15  Full4 IQ         83 non-null     int64  
 16  Med Status       83 non-null     int64  
 17  QC_Rest_1        8

In [5]:
#dropping unrelevant columns
df = df.drop(columns = ['Site', 'ADHD Measure', 'IQ Measure', 'Full2 IQ', 'QC_Rest_1', 'QC_Rest_2', 'QC_Rest_3', 'QC_Rest_4', 'QC_Anatomical_1', 'QC_Anatomical_2', 'DX'])

In [6]:
df.describe() #We have some false values in the following columns: ADHD Index, Inattentive and Hyper/Impulsive

Unnamed: 0,ScanDir ID,Gender,Age,Handedness,ADHD Index,Inattentive,Hyper/Impulsive,Verbal IQ,Performance IQ,Full4 IQ,Med Status
count,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0
mean,3449295.0,0.554217,10.243253,0.927711,14.963855,15.26506,15.73494,113.060241,108.686747,110.012048,1.180723
std,2098556.0,0.500073,1.346601,0.303762,198.036178,198.098958,198.141738,14.693909,11.995861,11.935287,0.387128
min,1018959.0,0.0,8.02,0.0,-999.0,-999.0,-999.0,81.0,79.0,85.0,1.0
25%,2005148.0,0.0,9.1,1.0,42.0,42.0,43.0,104.0,102.0,101.5,1.0
50%,2768273.0,1.0,10.12,1.0,46.0,48.0,48.0,112.0,108.0,111.0,1.0
75%,3915209.0,1.0,11.095,1.0,61.0,60.0,59.5,121.5,119.0,119.0,1.0
max,9922944.0,1.0,12.99,2.0,90.0,90.0,90.0,146.0,137.0,134.0,2.0


In [7]:
#Round age for fewer unique values and making into integer
df['Age'] = df['Age'].round(decimals = 0)
df['Age'] = df['Age'].astype(int)

In [8]:
#Removes rows with -999 in the following columns
df = df[df['Inattentive'] != -999]
df = df[df['Hyper/Impulsive'] != -999]
df = df[df['ADHD Index'] != -999]

In [9]:
#But as we can see with the min function, it removed the false values of -999
df.min()

  df.min()


ScanDir ID         1018959
Gender                   0
Age                      8
Handedness               0
ADHD Index              40
Inattentive             40
Hyper/Impulsive         41
Verbal IQ               81
Performance IQ          79
Full4 IQ                85
Med Status               1
dtype: int64

In [10]:
df['Secondary Dx '].unique()

array([nan, 'Simple phobia', 'Simple Phobia', 'simple phobias', 'ODD',
       'Simple Phobia ', 'ODD; Phobia', 'Specific phobia', 'Phobia',
       'social and simple phobia '], dtype=object)

In [11]:
#Making none secondary dx into 0 and any secondary dx into 1
df['Secondary Dx '].replace(('Simple phobia', 'Simple Phobia', 'simple phobias', 'ODD', 'Simple Phobia ', 'ODD; Phobia', 'Specific phobia', 'Phobia', 'social and simple phobia '), (1, 1, 1, 1, 1, 1, 1, 1, 1), inplace=True)
df['Secondary Dx '] = df['Secondary Dx '].fillna(0).astype(int)

In [12]:
#Checking that it worked as intended
df['Secondary Dx '].unique()

array([0, 1])

In [13]:
pip install sdv -q

[K     |████████████████████████████████| 102 kB 3.7 MB/s 
[K     |████████████████████████████████| 61 kB 337 kB/s 
[K     |████████████████████████████████| 47 kB 3.8 MB/s 
[K     |████████████████████████████████| 53 kB 1.7 MB/s 
[K     |████████████████████████████████| 139 kB 38.8 MB/s 
[K     |████████████████████████████████| 1.6 MB 60.9 MB/s 
[K     |████████████████████████████████| 9.4 MB 54.9 MB/s 
[K     |████████████████████████████████| 965 kB 81.1 MB/s 
[K     |████████████████████████████████| 295 kB 77.2 MB/s 
[K     |████████████████████████████████| 280 kB 76.5 MB/s 
[K     |████████████████████████████████| 662 kB 56.6 MB/s 
[K     |████████████████████████████████| 15.3 MB 279 kB/s 
[?25h

In [14]:
from sdv.tabular import GaussianCopula
model = GaussianCopula()
model.fit(df)

In [15]:
#Creating the synthetic data
synthetic_data = model.sample(2000)
synthetic_data.head()

Unnamed: 0,ScanDir ID,Gender,Age,Handedness,Secondary Dx,ADHD Index,Inattentive,Hyper/Impulsive,Verbal IQ,Performance IQ,Full4 IQ,Med Status
0,2949669,1,10,1,0,47,52,50,97,95,105,1
1,4709643,1,10,1,0,40,40,41,141,95,119,1
2,1760951,0,10,1,0,58,50,64,116,107,101,1
3,2573841,1,12,0,1,84,80,66,113,104,99,2
4,4979445,1,10,1,0,52,49,58,119,109,116,1


In [16]:
#Checking the number of unique values in each column to see if something seems odd
synthetic_data.nunique()

ScanDir ID         2000
Gender                2
Age                   6
Handedness            3
Secondary Dx          2
ADHD Index           51
Inattentive          51
Hyper/Impulsive      49
Verbal IQ            66
Performance IQ       59
Full4 IQ             50
Med Status            2
dtype: int64

In [17]:
#Appending the synthetic data to the original df
df = df.append([synthetic_data])

In [18]:
#There is now 2080 rows each with a unique id
df.nunique()

ScanDir ID         2080
Gender                2
Age                   6
Handedness            3
Secondary Dx          2
ADHD Index           51
Inattentive          51
Hyper/Impulsive      50
Verbal IQ            66
Performance IQ       59
Full4 IQ             50
Med Status            2
dtype: int64

# Scaling the data

In [19]:
scaler = MinMaxScaler()

In [20]:
scaler.fit(df)

MinMaxScaler()

In [21]:
scaler.transform(df)

array([[0.15185032, 0.        , 0.6       , ..., 0.68965517, 0.75510204,
        0.        ],
       [0.11311272, 0.        , 1.        , ..., 0.5       , 0.42857143,
        0.        ],
       [0.27129639, 0.        , 0.        , ..., 0.32758621, 0.08163265,
        0.        ],
       ...,
       [0.10178049, 1.        , 0.6       , ..., 0.29310345, 0.20408163,
        0.        ],
       [0.35970456, 0.        , 1.        , ..., 0.34482759, 0.40816327,
        1.        ],
       [0.09999276, 1.        , 1.        , ..., 0.        , 0.04081633,
        1.        ]])

# ADHD Index vs Inattentive vs Hyper/Impulsive

In [22]:
corr_matrix = df.corr()
corr_matrix['ADHD Index'].sort_values(ascending = False)

ADHD Index         1.000000
Inattentive        0.849335
Hyper/Impulsive    0.690611
Secondary Dx       0.252371
Age                0.082720
Med Status         0.071583
Handedness         0.002951
ScanDir ID         0.001408
Verbal IQ         -0.032743
Performance IQ    -0.065583
Gender            -0.128004
Full4 IQ          -0.154536
Name: ADHD Index, dtype: float64

In [23]:
df.corr()

Unnamed: 0,ScanDir ID,Gender,Age,Handedness,Secondary Dx,ADHD Index,Inattentive,Hyper/Impulsive,Verbal IQ,Performance IQ,Full4 IQ,Med Status
ScanDir ID,1.0,0.0993,-0.111897,0.08673,0.142877,0.001408,0.009952,-0.086721,-0.0055,-0.049536,0.016822,-0.040831
Gender,0.0993,1.0,0.054733,-0.10066,0.01058,-0.128004,-0.175326,-0.068936,0.031347,-0.001761,-0.124517,-0.072541
Age,-0.111897,0.054733,1.0,0.058983,0.039673,0.08272,0.021634,0.13338,-0.013767,-0.186854,-0.18519,-0.119846
Handedness,0.08673,-0.10066,0.058983,1.0,-0.039906,0.002951,0.014145,0.05935,0.036019,-0.031897,0.031721,0.029112
Secondary Dx,0.142877,0.01058,0.039673,-0.039906,1.0,0.252371,0.256931,0.227278,0.105358,-0.010726,0.080367,0.019177
ADHD Index,0.001408,-0.128004,0.08272,0.002951,0.252371,1.0,0.849335,0.690611,-0.032743,-0.065583,-0.154536,0.071583
Inattentive,0.009952,-0.175326,0.021634,0.014145,0.256931,0.849335,1.0,0.683117,-0.047032,-0.049647,-0.047683,0.081133
Hyper/Impulsive,-0.086721,-0.068936,0.13338,0.05935,0.227278,0.690611,0.683117,1.0,-0.067423,-0.108861,-0.121214,0.087429
Verbal IQ,-0.0055,0.031347,-0.013767,0.036019,0.105358,-0.032743,-0.047032,-0.067423,1.0,0.115345,0.524772,-0.167994
Performance IQ,-0.049536,-0.001761,-0.186854,-0.031897,-0.010726,-0.065583,-0.049647,-0.108861,0.115345,1.0,0.513779,0.024435


In [24]:
from pandas.plotting import scatter_matrix

attributes = ["Hyper/Impulsive", "ADHD Index", "Inattentive"]
scatter_matrix(df[attributes], figsize = (16,12))
plt.show()

ImportError: ignored

<Figure size 1152x864 with 9 Axes>

#Linear Regression

In [25]:
# Assign X and y for training
X = df[["Inattentive", "Hyper/Impulsive", "Secondary Dx ", "Gender", "Age", "Handedness"]].values
y = df["ADHD Index"]

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 43)

In [27]:
model = LinearRegression()

In [28]:
model.fit(X_train, y_train)

LinearRegression()

In [29]:
X_new = [[75, 90, 1, 1, 20, 2]]
print(model.predict(X_new))


[80.0985952]


In [30]:
print(model.score(X_test, y_test))

0.7082322466471451


# HARD coding features in the app

In [31]:
if model.predict(X_new) >= 60:
    print('You most likely have ADHD or ADD')
else: 
    print('You dont have ADHD')


You most likely have ADHD or ADD


#K-Nearest Neighbors


In [32]:
from sklearn.neighbors import KNeighborsRegressor

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 21)

In [33]:
knn_model = KNeighborsRegressor(n_neighbors=3)

In [34]:
knn_model.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=3)

In [35]:
knn_X_new = [[75, 90, 1, 1, 20, 2]]
print(knn_model.predict(knn_X_new))


[73.]


In [36]:
print(knn_model.score(X_test, y_test))

0.6484111184557031


In [37]:
if model.predict(X_new) >= 60:
    print('You most likely have ADHD or ADD')
else: 
    print('You dont have ADHD')


You most likely have ADHD or ADD


# Random Forest Regressor

In [38]:
from sklearn.ensemble import RandomForestRegressor

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 21)

In [39]:
rfg_model = RandomForestRegressor()

In [40]:
rfg_model.fit(X_train, y_train)

RandomForestRegressor()

In [41]:
rfg_X_new = [[75, 90, 1, 1, 20, 2]]
print(rfg_model.predict(rfg_X_new))

[73.03]


In [42]:
print(rfg_model.score(X_test, y_test))

0.658506999959853


In [43]:
if model.predict(X_new) >= 60:
    print('You most likely have ADHD or ADD')
else: 
    print('You dont have ADHD')

You most likely have ADHD or ADD
