In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
import pickle
from preprocess import clean,split

In [2]:
def _encode_labels(series, codes, column):
    """Translate the string labels of ``series`` into integers via ``codes``.

    Missing values are kept as missing so the caller can drop them later.
    Any other value that has no entry in ``codes`` raises ``ValueError``
    rather than being left behind as a string in a numeric column.
    """
    encoded = series.map(codes)
    unexpected = series[series.notna() & encoded.isna()].unique()
    if len(unexpected):
        raise ValueError(
            f"Unexpected values in column {column!r}: {list(unexpected)}"
        )
    return encoded


def clean(filename):
    """Load the dataset CSV and return a cleaned, numerically encoded frame.

    Steps, in order:
      1. read ``filename`` with ``pd.read_csv``;
      2. keep only the rows whose ``Visit`` equals 1;
      3. encode ``M/F`` as F -> 0, M -> 1;
      4. encode ``Group`` as Nondemented -> 0, Demented -> 1, treating
         ``Converted`` patients as Demented (1);
      5. drop every row that still has a missing value in *any* column;
      6. drop the ``MRI ID``, ``Visit`` and ``Hand`` columns;
      7. reset the index to 0..n-1.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame; ``M/F`` and ``Group`` have an integer dtype.

    Raises
    ------
    ValueError
        If ``M/F`` or ``Group`` contains a non-missing label that is not
        one of the values listed above.
    """
    df = pd.read_csv(filename)

    # Independent copy, so the column assignments below never write into
    # a slice of the frame that was just read.
    df = df.loc[df['Visit'] == 1].copy()

    # Explicit mappings instead of ``replace``: the result dtype no longer
    # depends on pandas silently downcasting an object column to integers.
    df['M/F'] = _encode_labels(df['M/F'], {'F': 0, 'M': 1}, 'M/F')
    df['Group'] = _encode_labels(
        df['Group'],
        {'Nondemented': 0, 'Demented': 1, 'Converted': 1},
        'Group',
    )

    # Removes rows with a missing value in any column, not only SES.
    df = df.dropna(axis=0, how='any')

    # A column that held missing values is float after ``map``; once those
    # rows are gone the codes can safely be stored as integers.
    df = df.astype({'M/F': int, 'Group': int})

    df = df.drop(['MRI ID', 'Visit', 'Hand'], axis=1)
    df = df.reset_index(drop=True)
    return df
    


In [3]:
def split(df):
    """Split the cleaned frame into scaled train/test features and targets.

    The ``Group`` column is the classification target; the eight columns
    listed in ``feature_columns`` are the predictors. The rows are divided
    with ``train_test_split`` using ``random_state=0``, so the same input
    frame always yields the same partition.

    A ``MinMaxScaler`` is fitted on the training features only and then
    applied to both partitions, so no information from the test rows is
    used when fitting the scaler.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, Y_train, Y_test)``.
    """
    feature_columns = ['M/F', 'Age', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF']
    features = df[feature_columns]
    target = df['Group'].values

    X_train, X_test, Y_train, Y_test = train_test_split(
        features, target, random_state=0
    )

    # Fit on the training partition, then reuse that scaler for both sets.
    scaler = MinMaxScaler()
    scaler.fit(X_train)

    return scaler.transform(X_train), scaler.transform(X_test), Y_train, Y_test
    

if __name__ == '__main__':
    # Smoke check: print the first rows of the cleaned dataset.
    print(clean(filename='oasis_longitudinal.csv').head())


  Subject ID  Group  MR Delay  M/F  Age  EDUC  SES  MMSE  CDR  eTIV   nWBV  \
0  OAS2_0001      0         0    1   87    14  2.0  27.0  0.0  1987  0.696   
1  OAS2_0004      0         0    0   88    18  3.0  28.0  0.0  1215  0.710   
2  OAS2_0005      0         0    1   80    12  4.0  28.0  0.0  1689  0.712   
3  OAS2_0008      0         0    0   93    14  2.0  30.0  0.0  1272  0.698   
4  OAS2_0009      1         0    1   68    12  2.0  27.0  0.5  1457  0.806   

     ASF  
0  0.883  
1  1.444  
2  1.039  
3  1.380  
4  1.205  


In [4]:
def _save_model(model, path):
    """Pickle ``model`` to ``path``.

    The ``with`` block closes the file as soon as the dump finishes (or
    fails), so the bytes are flushed to disk and no handle is left open.
    """
    with open(path, 'wb') as model_file:
        pickle.dump(model, model_file)


# Build the train/test partitions once; every classifier below is fitted on
# the same scaled training data.
cleaned_df = clean('oasis_longitudinal.csv')
X_train, X_test, Y_train, Y_test = split(cleaned_df)

logistic_model = LogisticRegression(C=10).fit(X_train, Y_train)
forest_model = RandomForestClassifier(
    n_estimators=3, max_features=4, n_jobs=4, max_depth=5, random_state=0
).fit(X_train, Y_train)
tree_model = DecisionTreeClassifier(
    random_state=0, max_depth=1, criterion='gini'
).fit(X_train, Y_train)
adaboost_model = AdaBoostClassifier(
    n_estimators=3, learning_rate=0.0001, random_state=0
).fit(X_train, Y_train)

# Persist each fitted model. The 'model_files' directory must already exist.
_save_model(logistic_model, 'model_files/logistic.sav')
_save_model(forest_model, 'model_files/forest.sav')
_save_model(tree_model, 'model_files/tree.sav')
_save_model(adaboost_model, 'model_files/adaboost.sav')


In [5]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, roc_curve, auc, f1_score, precision_score
import pickle
from preprocess import clean,split

def score(filename, disp=True, data_file='oasis_longitudinal.csv'):
    """Load a pickled classifier and optionally print its test-set metrics.

    Parameters
    ----------
    filename : str
        Path of the pickled model to load.
    disp : bool, default True
        When true, the test partition is rebuilt with
        ``split(clean(data_file))``, the model predicts on it, and the
        model together with its accuracy, recall, F1 score and precision
        is printed. When false, the model is only loaded and returned;
        the dataset is not read.
    data_file : str, default 'oasis_longitudinal.csv'
        CSV file handed to ``clean`` when the metrics are requested.

    Returns
    -------
    object
        The unpickled model.
    """
    # SECURITY: unpickling can execute arbitrary code. Only pass files that
    # were produced by a trusted training run.
    with open(filename, 'rb') as model_file:
        model = pickle.load(model_file)

    if not disp:
        # Nothing will be printed, so skip the evaluation entirely.
        return model

    cleaned_df = clean(data_file)
    _, X_test, _, Y_test = split(cleaned_df)
    Y_pred = model.predict(X_test)

    accuracy = accuracy_score(Y_test, Y_pred)
    recall = recall_score(Y_test, Y_pred)
    f1score = f1_score(Y_test, Y_pred)
    pre = precision_score(Y_test, Y_pred)

    print(model)
    print(f"Accuracy = {accuracy}")
    print(f"Recall = {recall}")
    print(f"f1_score = {f1score}")
    print(f"precesion = {pre}")
    return model
    
if __name__ == "__main__":
    # Load the persisted logistic-regression model and print its metrics;
    # call score(..., disp=False) to load it without printing the scores.
    model = score(filename='model_files/logistic.sav')

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
Accuracy = 0.8055555555555556
Recall = 0.9411764705882353
f1_score = 0.8205128205128205
precesion = 0.7272727272727273
