<a href="https://colab.research.google.com/github/Only-Mike/ADHD/blob/main/ADHD_Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Core libraries used throughout the notebook.
import pandas as pd
import numpy as np
# The stray "-U" (a pip flag) after this import was a SyntaxError that
# prevented the whole cell from running.
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
!git clone 'https://github.com/Only-Mike/ADHD.git'

Cloning into 'ADHD'...
remote: Enumerating objects: 822, done.[K
remote: Counting objects: 100% (339/339), done.[K
remote: Compressing objects: 100% (150/150), done.[K
remote: Total 822 (delta 208), reused 302 (delta 189), pack-reused 483[K
Receiving objects: 100% (822/822), 16.03 MiB | 9.22 MiB/s, done.
Resolving deltas: 100% (454/454), done.


In [None]:
# Load the KKI phenotypic table from the cloned repository.
# The path is relative to the working directory, i.e. the directory the
# repository was cloned into, so it resolves on Colab (where that directory
# is /content) as well as in any other environment.
df = pd.read_csv('ADHD/datasets/KKI_phenotypic.csv')

In [None]:
# Overview of the raw table: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83 entries, 0 to 82
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ScanDir ID       83 non-null     int64  
 1   Site             83 non-null     int64  
 2   Gender           83 non-null     int64  
 3   Age              83 non-null     float64
 4   Handedness       83 non-null     int64  
 5   DX               83 non-null     int64  
 6   Secondary Dx     15 non-null     object 
 7   ADHD Measure     83 non-null     int64  
 8   ADHD Index       83 non-null     int64  
 9   Inattentive      83 non-null     int64  
 10  Hyper/Impulsive  83 non-null     int64  
 11  IQ Measure       83 non-null     int64  
 12  Verbal IQ        83 non-null     int64  
 13  Performance IQ   83 non-null     int64  
 14  Full2 IQ         0 non-null      float64
 15  Full4 IQ         83 non-null     int64  
 16  Med Status       83 non-null     int64  
 17  QC_Rest_1        8

In [None]:
# Remove the columns that are irrelevant for the analysis.
irrelevant_columns = [
    'Site', 'ADHD Measure', 'IQ Measure', 'Full2 IQ',
    'QC_Rest_1', 'QC_Rest_2', 'QC_Rest_3', 'QC_Rest_4',
    'QC_Anatomical_1', 'QC_Anatomical_2', 'DX',
]
df = df.drop(columns=irrelevant_columns)

In [None]:
# Summary statistics. The -999 minimum in ADHD Index, Inattentive and
# Hyper/Impulsive shows that these columns contain invalid placeholder values.
df.describe()

Unnamed: 0,ScanDir ID,Gender,Age,Handedness,ADHD Index,Inattentive,Hyper/Impulsive,Verbal IQ,Performance IQ,Full4 IQ,Med Status
count,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0
mean,3449295.0,0.554217,10.243253,0.927711,14.963855,15.26506,15.73494,113.060241,108.686747,110.012048,1.180723
std,2098556.0,0.500073,1.346601,0.303762,198.036178,198.098958,198.141738,14.693909,11.995861,11.935287,0.387128
min,1018959.0,0.0,8.02,0.0,-999.0,-999.0,-999.0,81.0,79.0,85.0,1.0
25%,2005148.0,0.0,9.1,1.0,42.0,42.0,43.0,104.0,102.0,101.5,1.0
50%,2768273.0,1.0,10.12,1.0,46.0,48.0,48.0,112.0,108.0,111.0,1.0
75%,3915209.0,1.0,11.095,1.0,61.0,60.0,59.5,121.5,119.0,119.0,1.0
max,9922944.0,1.0,12.99,2.0,90.0,90.0,90.0,146.0,137.0,134.0,2.0


In [None]:
# Round Age to the nearest whole year and store it as an integer, which
# leaves far fewer distinct values in the column.
age_in_years = df['Age'].round()
df['Age'] = age_in_years.astype(int)

In [None]:
# Keep only the rows in which none of the three score columns holds -999
# (NOTE(review): -999 looks like a missing-value sentinel - confirm).
score_columns = ['Inattentive', 'Hyper/Impulsive', 'ADHD Index']
has_valid_scores = (df[score_columns] != -999).all(axis=1)
# .copy() makes the result an independent frame, so the column assignments
# in later cells do not trigger SettingWithCopyWarning.
df = df[has_valid_scores].copy()

In [None]:
# The column-wise minimum confirms that the invalid -999 values were removed.
df.min()

  df.min()


ScanDir ID         1018959
Gender                   0
Age                      8
Handedness               0
ADHD Index              40
Inattentive             40
Hyper/Impulsive         41
Verbal IQ               81
Performance IQ          79
Full4 IQ                85
Med Status               1
dtype: int64

In [None]:
# Distinct secondary diagnoses. Note that the column name as used here ends
# with a trailing space.
df['Secondary Dx '].unique()

array([nan, 'Simple phobia', 'Simple Phobia', 'simple phobias', 'ODD',
       'Simple Phobia ', 'ODD; Phobia', 'Specific phobia', 'Phobia',
       'social and simple phobia '], dtype=object)

In [None]:
# Binary-encode the secondary diagnosis: 0 = none recorded (NaN), 1 = any.
secondary_dx_labels = [
    'Simple phobia', 'Simple Phobia', 'simple phobias', 'ODD',
    'Simple Phobia ', 'ODD; Phobia', 'Specific phobia', 'Phobia',
    'social and simple phobia ',
]
# Build the new column and assign it back instead of calling
# replace(..., inplace=True) on the selected column: the in-place form is
# chained assignment, which warns and does not reach df under Copy-on-Write.
df['Secondary Dx '] = (
    df['Secondary Dx ']
    .replace(secondary_dx_labels, 1)
    .fillna(0)
    .astype(int)
)

In [None]:
# Check that the encoding worked as intended: only 0 and 1 should remain.
df['Secondary Dx '].unique()

array([0, 1])

In [None]:
pip install sdv -q

[K     |████████████████████████████████| 102 kB 4.4 MB/s 
[K     |████████████████████████████████| 61 kB 325 kB/s 
[K     |████████████████████████████████| 53 kB 1.5 MB/s 
[K     |████████████████████████████████| 1.6 MB 45.7 MB/s 
[K     |████████████████████████████████| 47 kB 3.8 MB/s 
[K     |████████████████████████████████| 139 kB 52.6 MB/s 
[K     |████████████████████████████████| 9.4 MB 48.9 MB/s 
[K     |████████████████████████████████| 965 kB 64.0 MB/s 
[K     |████████████████████████████████| 295 kB 61.7 MB/s 
[K     |████████████████████████████████| 280 kB 28.8 MB/s 
[K     |████████████████████████████████| 662 kB 46.3 MB/s 
[K     |████████████████████████████████| 15.3 MB 591 kB/s 


In [None]:
from sdv.tabular import GaussianCopula
# Fit a Gaussian copula model on the cleaned table. Every column of df is
# passed to fit, including the ScanDir ID identifier.
# NOTE(review): the name `model` is reused for the regressors further down.
model = GaussianCopula()
model.fit(df)

In [None]:
# Draw the synthetic rows from the fitted copula model and preview them.
n_synthetic_rows = 2000
synthetic_data = model.sample(n_synthetic_rows)
synthetic_data.head()

Unnamed: 0,ScanDir ID,Gender,Age,Handedness,Secondary Dx,ADHD Index,Inattentive,Hyper/Impulsive,Verbal IQ,Performance IQ,Full4 IQ,Med Status
0,7931949,1,9,1,0,46,46,49,104,115,122,2
1,4423641,0,10,1,0,46,42,44,119,111,128,1
2,4551278,0,9,1,0,52,59,51,127,126,113,1
3,2852519,1,12,1,0,45,44,51,105,82,90,1
4,4538788,1,11,1,0,55,52,58,104,100,103,1


In [None]:
# Sanity check: count the distinct values per column of the synthetic data
# to spot anything that looks odd.
synthetic_data.nunique()

ScanDir ID         2000
Gender                2
Age                   6
Handedness            3
Secondary Dx          2
ADHD Index           51
Inattentive          51
Hyper/Impulsive      50
Verbal IQ            66
Performance IQ       59
Full4 IQ             50
Med Status            2
dtype: int64

In [None]:
# Append the synthetic rows to the original data.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. ignore_index=True gives the combined frame a fresh 0..n-1
# index instead of repeating the row labels of both inputs.
# NOTE: re-running this cell appends the synthetic rows again.
df = pd.concat([df, synthetic_data], ignore_index=True)

In [None]:
# There are now 2080 rows, each with a unique ScanDir ID.
df.nunique()

ScanDir ID         2080
Gender                2
Age                   6
Handedness            3
Secondary Dx          2
ADHD Index           51
Inattentive          51
Hyper/Impulsive      50
Verbal IQ            66
Performance IQ       59
Full4 IQ             50
Med Status            2
dtype: int64

# ADHD Index vs Inattentive vs Hyper/Impulsive

In [None]:
# Pairwise correlations of all columns, then the ADHD Index column ranked
# from the strongest positive to the strongest negative correlation.
corr_matrix = df.corr()
adhd_index_corr = corr_matrix.loc[:, 'ADHD Index']
adhd_index_corr.sort_values(ascending=False)

ADHD Index         1.000000
Inattentive        0.851851
Hyper/Impulsive    0.688143
Secondary Dx       0.248834
Med Status         0.090753
Age                0.054162
Handedness         0.024632
Verbal IQ         -0.015234
ScanDir ID        -0.018506
Performance IQ    -0.078079
Full4 IQ          -0.116550
Gender            -0.193227
Name: ADHD Index, dtype: float64

In [None]:
# Full pairwise correlation matrix of all columns.
df.corr()

Unnamed: 0,ScanDir ID,Gender,Age,Handedness,Secondary Dx,ADHD Index,Inattentive,Hyper/Impulsive,Verbal IQ,Performance IQ,Full4 IQ,Med Status
ScanDir ID,1.0,0.130752,-0.143995,0.08621,0.07394,-0.018506,-0.048784,-0.116195,-0.027286,-0.060121,-0.032663,-0.039782
Gender,0.130752,1.0,0.050132,-0.065301,-0.016505,-0.193227,-0.247429,-0.100289,0.033451,0.045079,-0.101275,-0.045388
Age,-0.143995,0.050132,1.0,0.058308,0.010459,0.054162,-0.012527,0.137022,-0.065309,-0.14331,-0.196153,-0.122454
Handedness,0.08621,-0.065301,0.058308,1.0,-0.047603,0.024632,0.005299,0.064098,-0.022993,-0.049655,-0.016153,0.063587
Secondary Dx,0.07394,-0.016505,0.010459,-0.047603,1.0,0.248834,0.248468,0.264932,0.076315,-0.019492,0.083187,0.047509
ADHD Index,-0.018506,-0.193227,0.054162,0.024632,0.248834,1.0,0.851851,0.688143,-0.015234,-0.078079,-0.11655,0.090753
Inattentive,-0.048784,-0.247429,-0.012527,0.005299,0.248468,0.851851,1.0,0.680242,-0.029562,-0.03494,0.005903,0.084776
Hyper/Impulsive,-0.116195,-0.100289,0.137022,0.064098,0.264932,0.688143,0.680242,1.0,-0.046008,-0.077491,-0.074894,0.168096
Verbal IQ,-0.027286,0.033451,-0.065309,-0.022993,0.076315,-0.015234,-0.029562,-0.046008,1.0,0.1602,0.531084,-0.160624
Performance IQ,-0.060121,0.045079,-0.14331,-0.049655,-0.019492,-0.078079,-0.03494,-0.077491,0.1602,1.0,0.528536,-0.002563


In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the three score columns.
attributes = ["Hyper/Impulsive", "ADHD Index", "Inattentive"]
score_frame = df.loc[:, attributes]
scatter_matrix(score_frame, figsize=(16, 12))
plt.show()

ImportError: ignored

<Figure size 1152x864 with 9 Axes>

# Linear Regression

In [None]:
# Open question from the author: "ask Roman why [[ ]]". Indexing a frame with
# a list of column names (hence the double brackets) selects several columns
# at once and yields a 2-D result, one column per feature.
feature_columns = ["Inattentive", "Hyper/Impulsive", "Secondary Dx ", "Gender", "Age", "Handedness"]
X = df[feature_columns].values
# Target: the ADHD Index score.
y = df["ADHD Index"]

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 21)

In [None]:
from sklearn.ensemble import RandomForestRegressor  # TODO: change back to linear regression

# Seed the forest (same seed as the train/test split) so the fitted model,
# its score and its predictions are reproducible between runs.
model = RandomForestRegressor(random_state=21)

In [None]:
# Train the regressor on the training split.
model.fit(X_train, y_train)

RandomForestRegressor()

In [None]:
# One hypothetical subject. The values follow the column order used to build X:
# Inattentive, Hyper/Impulsive, Secondary Dx, Gender, Age, Handedness.
X_new = [[0, 5, 1, 1, 20, 2]]
print(model.predict(X_new))


[43.36466667]


In [None]:
# Score of the fitted regressor on the held-out test split.
print(model.score(X_test, y_test))

0.6585494955427826


In [None]:
# Report whether the predicted ADHD Index for X_new reaches the cut-off of 60.
predicted_index = model.predict(X_new)
verdict = 'You most likely have ADHD or ADD' if predicted_index >= 60 else 'you dont have ADHD'
print(verdict)


you dont have ADHD


In [None]:
# Disabled draft: mean absolute error and root mean squared error on the test split.
# NOTE(review): y_pred is not defined anywhere in the visible notebook; it
# would have to be computed (e.g. from model.predict(X_test)) before this
# cell can be re-enabled.
# from sklearn.metrics import mean_squared_error
# from sklearn.metrics import mean_absolute_error


# model_mae = mean_absolute_error(y_test, y_pred)
# model_rmse = mean_squared_error(y_test, y_pred, squared = False)

# print(model_mae)
# print(model_rmse)

In [None]:
# y = ADHD Index
# a = value of y when both Hyper/Impulsive and Inattentive are 0
# b = Hyper/Impulsive
# c = Inattentive
# x = 
# z = 

In [None]:
#y = a + 0.75 * x + 0.85 * z

In [None]:
# Scratch calculation.
# NOTE(review): presumably a score of 85 weighted by the 0.85 coefficient
# from the formula sketched above - confirm.
85 * 0.85

# K-Nearest Neighbors


In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Same split parameters (25% test, random_state 21) as for the previous model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 21)

In [None]:
# K-nearest-neighbours regressor using the 3 nearest training samples.
# This rebinds `model`, replacing the previously fitted regressor.
model = KNeighborsRegressor(n_neighbors=3)

In [None]:
# Train the K-nearest-neighbours regressor on the training split.
model.fit(X_train, y_train)

In [None]:
# One hypothetical subject. The model is fitted on the six feature columns of
# X, so a sample must supply all six values in the same order; the original
# two-value sample [[90, 75]] makes predict() raise a ValueError.
# Order: Inattentive, Hyper/Impulsive, Secondary Dx, Gender, Age, Handedness.
# The last four values are illustrative placeholders chosen from the ranges
# seen in the data (no secondary diagnosis, gender 1, age 10, handedness 1).
X_new = [[90, 75, 0, 1, 10, 1]]
print(model.predict(X_new))


In [None]:
# Score of the K-nearest-neighbours regressor on the held-out test split.
print(model.score(X_test, y_test))

In [None]:
# Same cut-off check as for the previous model: a predicted ADHD Index of 60
# or more is reported as likely ADHD/ADD.
reaches_cutoff = bool(model.predict(X_new) >= 60)
if not reaches_cutoff:
    print('you dont have ADHD')
else:
    print('You most likely have ADHD or ADD')


# Classifier