This AI project develops a model to predict schizophrenia using health records obtained from the University of Lagos Teaching Hospital, Nigeria. The cases in the dataset were recorded between 2013 and 2018. The dataset contains features used in the diagnosis of schizophrenia as defined in DSM-5: occupation, marital status, episode duration, past psychiatric history, past medical history, family psychiatric history, past social history, etc. DSM-5 is a manual of diagnostic information for psychiatric diseases.

In [1]:
#import needed tools
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import pandas as pd
import matplotlib.pyplot as pl
import numpy as np
import seaborn as sn

In [2]:
# Load the patient records from the CSV file and preview the first five rows
data = pd.read_csv('used_PROJECT_DATANEW.csv')
data.head()

Unnamed: 0,Y_O_REP,AGE,SEX,OCCUP,MAR_STA,DUR_EPIS,P_PSY_HX,P_MED_HX,FAM_P_HX,P_SOC_HX,...,INT_GFK,INT_S_A_D,INT_CAL,INT_PROV,JUDGMT,INSIGHT,PSE,EEG,DIAGN,CLASS
0,2015,56,F,NURSE,WIDOW,0.5,RAPE,HYPTENSIVE,NO,YES,...,,,,,,PARTIAL,GOOD,NORMAL,PARANOID SCHIZ,SCHIZ
1,2016,28,F,UNEMPLOYED,MARRIED,48.0,,,,,...,,,,,POOR,POOR,GOOD,NORMAL,PARANOID SCHIZ,SCHIZ
2,2016,26,M,UNEMPLOYED,SINGLE,1.0,NO,SHORT-SIGHT,NO,NO,...,,,,,POOR,PARTIAL,GOOD,NORMAL,PARANOID SCHIZ,SCHIZ
3,2016,20,M,STUDENT,SINGLE,2.0,,,,YES,...,,,,,,,GOOD,NORMAL,PARANOID SCHIZ,SCHIZ
4,2016,34,F,UNEMPLOYED,SINGLE,6.0,,,,,...,,,,,,POOR,GOOD,NORMAL,PARANOID SCHIZ,SCHIZ


In [3]:
#Description of numerical data: duration of episode of the disease in patient in months.
#People don't report for medical attention on time, as shown by the average duration below.
# Remove the AGE, Y_O_REP and DIAGN columns before any further processing.
# errors='ignore' keeps this cell re-runnable: `data` is reassigned here, so a
# second run would otherwise raise KeyError because the columns are already gone.
data=data.drop(columns=['AGE','Y_O_REP','DIAGN'],errors='ignore')
data.describe()

Unnamed: 0,DUR_EPIS
count,148.0
mean,60.454392
std,94.876831
min,0.5
25%,2.0
50%,24.0
75%,60.0
max,384.0


In [4]:
# Count the missing entries in every column
data.isna().sum()

SEX            0
OCCUP          6
MAR_STA        2
DUR_EPIS       3
P_PSY_HX      17
P_MED_HX      30
FAM_P_HX      21
P_SOC_HX      13
P_SEX_HX      28
FOR_HX        20
PREMOBD_HX    28
MSE            0
SPEECH         7
MOOD           8
AFFECT        10
TH_FORM       43
TH_STRM       39
TH_CONTENT    27
TH_POSS       58
PERCEP        39
ORIENT        31
ATTEN         25
CONC          25
MEM_IR        35
MEM_ST        35
MEM_LT        35
INT_GFK       62
INT_S_A_D     64
INT_CAL       67
INT_PROV      67
JUDGMT        31
INSIGHT       20
PSE            4
EEG            0
CLASS          0
dtype: int64

In [5]:
# Fill every missing entry with the most frequent value (mode) of its column,
# on the assumption that the majority value represents reality.
# NOTE(review): the imputer is fitted on the full dataset, before the
# train/test split is made further down - confirm this is intended.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
newdata = imputer.fit(data).transform(data)
pd.DataFrame(newdata).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,F,NURSE,WIDOW,0.5,RAPE,HYPTENSIVE,NO,YES,NORMAL,NO,...,GOOD,GOOD,GOOD,GOOD,GOOD,POOR,PARTIAL,GOOD,NORMAL,SCHIZ
1,F,UNEMPLOYED,MARRIED,48.0,MENTAL ILLNESS,NO,NO,NO,NORMAL,NO,...,FAIR,GOOD,GOOD,GOOD,GOOD,POOR,POOR,GOOD,NORMAL,SCHIZ
2,M,UNEMPLOYED,SINGLE,1.0,NO,SHORT-SIGHT,NO,NO,NORMAL,YES,...,GOOD,GOOD,GOOD,GOOD,GOOD,POOR,PARTIAL,GOOD,NORMAL,SCHIZ
3,M,STUDENT,SINGLE,2.0,MENTAL ILLNESS,NO,NO,YES,NORMAL,NO,...,GOOD,GOOD,GOOD,GOOD,GOOD,POOR,POOR,GOOD,NORMAL,SCHIZ
4,F,UNEMPLOYED,SINGLE,6.0,MENTAL ILLNESS,NO,NO,NO,NORMAL,NO,...,GOOD,GOOD,GOOD,GOOD,GOOD,POOR,POOR,GOOD,NORMAL,SCHIZ


In [6]:
#Numerical Encoding of dataset using defined function
def encoder(X):
    """Encode an imputed data array into numeric features and targets.

    Parameters
    ----------
    X : 2-D array
        The last column holds the class label; all other columns hold the
        feature values.

    Returns
    -------
    (features, targets)
        `features` is the dense one-hot encoding of every column except the
        last; `targets` is the last column label-encoded to integers.
    """
    target_encoder=LabelEncoder()
    Y=target_encoder.fit_transform(X[:,-1])
    # Build the encoder with its default (sparse) output and densify explicitly.
    # The `sparse=` keyword was renamed to `sparse_output` and later removed, so
    # either spelling fails on some scikit-learn releases; `.toarray()` yields
    # the same dense array on all of them.
    # NOTE(review): every feature column is one-hot encoded, including the
    # numeric episode-duration column - confirm this is intended.
    feature_encoder=OneHotEncoder()
    X=feature_encoder.fit_transform(X[:,:-1]).toarray()
    return X,Y
# apply function to the dataset
Feature_data,Target_data=encoder(newdata)

#split data function implementing stratified sampling
def datasplit(Feature,Target):
    """Split features and targets into train and test portions.

    20% of the samples go to the test portion. The split is stratified on
    `Target` and seeded with random_state=1.

    Returns the tuple (Xtrain, Xtest, Ytrain, Ytest).
    """
    split_options=dict(test_size=0.2,random_state=1,stratify=Target)
    return tuple(train_test_split(Feature,Target,**split_options))

In [7]:
# Apply the stratified split to the encoded dataset,
# then display the shape of each resulting portion
Train_features,Test_features,Train_targets,Test_targets=datasplit(Feature_data,Target_data)
tuple(part.shape for part in (Train_features,Test_features,Train_targets,Test_targets))

((120, 185), (31, 185), (120,), (31,))

In [8]:
#dimensional reduction of features
def red_ndim(data,n_components=2,reducer=None):
    """Project `data` onto its principal components.

    Parameters
    ----------
    data : 2-D numeric array
        One row per sample, one column per feature.
    n_components : int, default 2
        Number of components to keep when a new PCA is fitted. Ignored when
        `reducer` is given.
    reducer : fitted PCA, optional
        An already-fitted PCA to apply. When omitted, a new PCA is fitted on
        `data` itself.

    Returns
    -------
    The reduced array, one row per input sample.
    """
    if reducer is None:
        reducer=PCA(n_components=n_components).fit(data)
    # Return the projected array, not the untouched input.
    return reducer.transform(data)

# Fit the projection on the training portion only and reuse it for the test
# portion, so both portions end up in the same component space.
train_reducer=PCA(n_components=2).fit(Train_features)
red_train_data=red_ndim(Train_features,reducer=train_reducer)
red_test_data=red_ndim(Test_features,reducer=train_reducer)

In [9]:
# Train a Random Forest classifier (mode imputation followed by a 20-tree
# forest) and report its accuracy on the training portion
pipeline=make_pipeline(
    SimpleImputer(strategy='most_frequent',missing_values=np.nan),
    RandomForestClassifier(random_state=1,n_estimators=20),
)
Model_rdf=pipeline.fit(Train_features,Train_targets)
print('Training Accuracy = %.2f'%Model_rdf.score(Train_features,Train_targets))

Training Accuracy = 1.00


In [10]:
# Validate with 10-fold cross-validation on the training portion, reporting
# accuracy plus weighted precision, recall and F1 as percentages
num_folds = 10
cv_scores = {}
for label, scorer in (
    ("Validation Accuracy: ", 'accuracy'),
    ("Precision: ", 'precision_weighted'),
    ("Recall: ", 'recall_weighted'),
    ("F1: ", 'f1_weighted'),
):
    cv_scores[scorer] = cross_val_score(Model_rdf, Train_features, Train_targets, scoring=scorer, cv=num_folds)
    print(label + str(round(100*cv_scores[scorer].mean(), 2)) + "%")
# Keep the per-metric score arrays available under their individual names
accuracy_values, precision_values, recall_values, f1_values = cv_scores.values()

Validation Accuracy: 93.09%
Precision: 95.01%
Recall: 93.09%
F1: 93.12%


In [11]:
# Evaluate the model on the held-out test portion: percentage of test
# samples whose predicted label equals the true label
y_pred=Model_rdf.predict(Test_features)
score=round(100*(y_pred==Test_targets).sum()/len(Test_features),2)
print('Test Accuracy:' + str(score) + '%')

# Show the first predictions next to their true labels
V={"Predicts":y_pred,'Targets':Test_targets}
pd.DataFrame(data=V).head()


Test Accuracy:100.0%


Unnamed: 0,Predicts,Targets
0,1,1
1,0,0
2,0,0
3,0,0
4,1,1


In [12]:
# Predict a single case: take the third test sample and reshape it into a
# one-row array, since predict is called on a 2-D input here
case=Test_features[2,:]
print('Predicated status: %s, Real status: %s'%(Model_rdf.predict(case.reshape(1,-1)),Test_targets[2]))


Predicated status: [0], Real status: 0


In [13]:
#save the trained model for the prediction of Schizophrenia (about 93% cross-validated accuracy and recall)

# joblib is imported directly: the `sklearn.externals.joblib` shim is no longer
# shipped with scikit-learn, so importing it from there fails on current releases.
import joblib
joblib.dump(Model_rdf,"SCHIZ_model.pkl" )

['SCHIZ_model.pkl']

Thanks!!!