In [1]:
path = r'alzheimer.csv'

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestClassifier

In [3]:
data = pd.read_csv(path)

In [4]:
data

Unnamed: 0,Group,M/F,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,Nondemented,M,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,Nondemented,M,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,Demented,M,75,12,,23.0,0.5,1678,0.736,1.046
3,Demented,M,76,12,,28.0,0.5,1738,0.713,1.010
4,Demented,M,80,12,,22.0,0.5,1698,0.701,1.034
...,...,...,...,...,...,...,...,...,...,...
368,Demented,M,82,16,1.0,28.0,0.5,1693,0.694,1.037
369,Demented,M,86,16,1.0,26.0,0.5,1688,0.675,1.040
370,Nondemented,F,61,13,2.0,30.0,0.0,1319,0.801,1.331
371,Nondemented,F,63,13,2.0,30.0,0.0,1327,0.796,1.323


In [5]:
data.shape

(373, 10)

In [6]:
data.describe()

Unnamed: 0,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
count,373.0,373.0,354.0,371.0,373.0,373.0,373.0,373.0
mean,77.013405,14.597855,2.460452,27.342318,0.290885,1488.128686,0.729568,1.195461
std,7.640957,2.876339,1.134005,3.683244,0.374557,176.139286,0.037135,0.138092
min,60.0,6.0,1.0,4.0,0.0,1106.0,0.644,0.876
25%,71.0,12.0,2.0,27.0,0.0,1357.0,0.7,1.099
50%,77.0,15.0,2.0,29.0,0.0,1470.0,0.729,1.194
75%,82.0,16.0,3.0,30.0,0.5,1597.0,0.756,1.293
max,98.0,23.0,5.0,30.0,2.0,2004.0,0.837,1.587


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Group   373 non-null    object 
 1   M/F     373 non-null    object 
 2   Age     373 non-null    int64  
 3   EDUC    373 non-null    int64  
 4   SES     354 non-null    float64
 5   MMSE    371 non-null    float64
 6   CDR     373 non-null    float64
 7   eTIV    373 non-null    int64  
 8   nWBV    373 non-null    float64
 9   ASF     373 non-null    float64
dtypes: float64(5), int64(3), object(2)
memory usage: 29.3+ KB


In [8]:
data = data.apply(LabelEncoder().fit_transform)

In [9]:
data.head()

Unnamed: 0,Group,M/F,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,2,1,27,5,1,14,0,284,35,1
1,2,1,28,5,1,17,0,285,20,0
2,1,1,15,3,5,10,1,231,73,52
3,1,1,16,3,5,15,1,254,51,32
4,1,1,20,3,5,9,1,238,40,46


In [10]:
X = data.drop(['Group'], axis=1)
Y = data.Group

In [11]:
X

Unnamed: 0,M/F,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,1,27,5,1,14,0,284,35,1
1,1,28,5,1,17,0,285,20,0
2,1,15,3,5,10,1,231,73,52
3,1,16,3,5,15,1,254,51,32
4,1,20,3,5,9,1,238,40,46
...,...,...,...,...,...,...,...,...,...
368,1,22,7,0,15,1,236,33,48
369,1,26,7,0,13,1,233,14,50
370,0,1,4,1,17,0,51,126,215
371,0,3,4,1,17,0,56,124,211


In [12]:
Y

0      2
1      2
2      1
3      1
4      1
      ..
368    1
369    1
370    2
371    2
372    2
Name: Group, Length: 373, dtype: int32

In [13]:
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.2,random_state=2)

In [14]:
x_train

Unnamed: 0,M/F,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
225,0,10,2,3,9,2,125,60,144
100,1,8,7,0,1,2,247,21,39
270,0,22,9,0,16,0,123,0,147
108,0,34,11,0,16,0,146,35,128
344,1,12,7,3,15,1,262,69,23
...,...,...,...,...,...,...,...,...,...
299,0,20,6,1,7,2,253,24,33
22,0,21,3,3,17,0,18,53,246
72,1,22,3,3,14,1,149,67,126
15,1,8,3,1,14,1,135,128,137


In [15]:
x_test

Unnamed: 0,M/F,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
173,0,13,3,3,3,2,150,35,126
205,1,6,3,3,17,1,126,111,144
70,0,13,4,3,17,0,81,106,191
118,0,11,5,1,15,0,138,91,135
32,1,26,3,2,14,0,265,96,20
...,...,...,...,...,...,...,...,...,...
303,0,20,4,1,15,0,64,97,205
3,1,16,3,5,15,1,254,51,32
272,0,7,3,1,12,0,58,96,209
163,0,6,9,1,17,0,171,63,109


In [16]:
y_train

225    1
100    1
270    2
108    2
344    1
      ..
299    1
22     2
72     1
15     1
168    2
Name: Group, Length: 298, dtype: int32

In [17]:
y_test

173    1
205    1
70     2
118    2
32     2
      ..
303    2
3      1
272    0
163    2
292    2
Name: Group, Length: 75, dtype: int32

In [18]:
print('size x_train = ', x_train.shape)
print('size x_test = ', x_test.shape)
print('size y_train = ', y_train.shape)
print('size y_test = ', y_test.shape)

size x_train =  (298, 9)
size x_test =  (75, 9)
size y_train =  (298,)
size y_test =  (75,)


In [19]:
model = RandomForestClassifier()

In [20]:
model.fit(x_train, y_train)

In [21]:
x_test.size

675

In [22]:
y_test.size

75

In [23]:
result = model.predict(x_test)

In [24]:
score = accuracy_score(result, y_test)

In [25]:
percentage_score = round(score*100,2)

In [26]:
print(f'Model accuracy is : {percentage_score}%')

Model accuracy is : 94.67%


In [27]:
import pickle
filename = "alzheimer_model.sav"
pickle.dump(model,open(filename,"wb"))