In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive (Colab-only module).
# force_remount=True re-mounts even if a mount already exists.
# NOTE(review): later cells read 'drive/My Drive/...' as a relative path, so they presumably
# rely on the working directory being /content -- confirm.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline

# Load the training split from the mounted Drive folder.
# NOTE(review): read_csv uses the first row as a header here; if the CSV has no header row,
# the first record is consumed as column names -- confirm against the file.
data = pd.read_csv('drive/My Drive/adult_train_SMALLER.csv')

# Missing values are encoded as ' ?' (with the leading space); turn them into real NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data = data.replace(' ?', np.nan)

# Readable column names, assigned positionally (the file must have exactly 15 columns).
data.columns = [
    "Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
    "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
    "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"]

In [None]:
# Education and EducationNum carry the same information (EducationNum is the integer-coded
# form), so the two charts below should show the same income split per level. This is the
# justification for dropping the categorical Education column later on.

# Row-normalised income share per EducationNum level, rounded to 2 decimals.
educationnum_counts = pd.crosstab(data.EducationNum, data.Income)
plot_educationnum = educationnum_counts.div(educationnum_counts.sum(axis=1), axis=0).round(2)
plot_educationnum = plot_educationnum.sort_values(by=' >50K')
ax = plot_educationnum.plot(
    kind='bar',
    title='Distribution of income across various education levels',
    figsize=(10, 8),
)

# Same chart for the categorical Education column.
education_counts = pd.crosstab(data.Education, data.Income)
plot_education = education_counts.div(education_counts.sum(axis=1), axis=0).round(2)
plot_education = plot_education.sort_values(by=' >50K')
ax = plot_education.plot(
    kind='bar',
    title='Distribution of income across various education levels',
    figsize=(10, 8),
)


In [None]:
# Build the training features/labels from the raw frame without touching `data` itself.
# Rows with any missing value are dropped; per the original author's note only a small
# number of rows are affected, so the reduced training set is considered acceptable.
complete_rows = data.dropna(axis=0, how='any')

# Binary target: 0 for ' <=50K', 1 for ' >50K' (labels keep their leading space).
training_label = complete_rows["Income"].map({' <=50K': 0, ' >50K': 1})

# Remove the target and the redundant categorical Education column (EducationNum is kept).
# The trailing copy() gives an independent frame that later cells can assign into.
training_data = complete_rows.drop(columns=["Income", "Education"]).copy()


In [None]:
# Visualise income share per native country. The same recipe can be repeated for any other
# feature to judge how much it helps separate the two income classes.
# training_data has fewer rows than data; crosstab pairs the two series on their shared index.
nativecountry_counts = pd.crosstab(training_data.NativeCountry, data.Income)
plot_nativecountry = nativecountry_counts.div(nativecountry_counts.sum(axis=1), axis=0).round(2)
plot_nativecountry = plot_nativecountry.sort_values(by=' >50K')
ax = plot_nativecountry.plot(
    kind='bar',
    title='Distribution of income across Native Country',
    figsize=(10, 8),
)


In [None]:
# Percentage of rows per native country. The original analysis found the US category to
# dominate (about 91%), so the column is collapsed into two bins: US and non-US.
# The table is printed explicitly because a bare expression in the middle of a cell is not displayed.
country_share_pct = training_data.NativeCountry.value_counts(normalize=True) * 100
print(country_share_pct)

# Accept both the raw label (' United-States') and the already-binned label (' United States')
# so that re-running this cell does not relabel every row as ' Out of United States'.
is_united_states = training_data['NativeCountry'].isin([' United-States', ' United States'])
training_data['NativeCountry'] = is_united_states.map(
    {True: ' United States', False: ' Out of United States'})

In [None]:
# Standardise the numerical (int64) columns.
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training data only; the same fitted scaler is reused later
# to normalise the test data with the training mean and variance.
numeric_training_columns = training_data.select_dtypes("int64")
scaler = StandardScaler()
scaled_training_values = scaler.fit_transform(numeric_training_columns)

# Wrap the scaled array in a DataFrame with readable column names (default RangeIndex here).
train_data = pd.DataFrame(
    scaled_training_values,
    columns=[
        "Age", "fnlwgt", "EducationNum",
        "CapitalGain", "CapitalLoss", "HoursPerWeek"],
)

In [None]:
# Give the scaled frame the same row index as training_data so that the later
# index-based concat with the one-hot encoded columns lines rows up correctly.
train_data.index = training_data.index

In [None]:
# One-hot encode the categorical (object) columns and join them to the scaled numeric columns.
categorical_training_columns = training_data.select_dtypes('object')
training_data_categorical = pd.get_dummies(categorical_training_columns).astype('category')

# The numeric part is the standardised frame, not the raw int64 columns.
training_data_non_categorical = train_data

# Column-wise concat; join='inner' keeps only rows whose index is present in both frames.
training_data_onehotencoded = pd.concat(
    [training_data_non_categorical, training_data_categorical],
    axis=1,
    join='inner',
)


In [None]:
# Feature scoring: run SelectKBest with k = number of columns so every feature gets a score.
# No score_func is passed, so the selector's default is used (chi2 is imported but unused).

from sklearn.feature_selection import SelectKBest,chi2

# Silence NumPy divide/invalid floating-point warnings for the rest of the session.
np.seterr(divide='ignore', invalid='ignore')

total_feature_count = training_data_onehotencoded.shape[1]
kbest_selector = SelectKBest(k=total_feature_count)
training_selected_features = pd.DataFrame(
    kbest_selector.fit_transform(training_data_onehotencoded, training_label))

# Integer positions of the kept columns; used to look up both names and scores.
selected_cols = kbest_selector.get_support(indices=True)
selected_feature_names = training_data_onehotencoded.columns.values[selected_cols]
scores = kbest_selector.scores_[selected_cols]
selected_feature_names_scores = list(zip(selected_feature_names, scores))

# Table of (feature, score), highest score first; ties are broken alphabetically by name.
Feat_F1score_combined = pd.DataFrame(
    data=selected_feature_names_scores,
    columns=['Feature_names', 'F_Scores'],
).sort_values(by=['F_Scores', 'Feature_names'], ascending=[False, True])

In [None]:
# Bar chart of every feature's score, already sorted from most to least important.
Feat_F1score_combined.plot.bar(
    x='Feature_names',
    y='F_Scores',
    title='Fscores of features arranged in accordance with their importance using SelectKBest method',
    figsize=(18, 8),
)

# With an F-score threshold of 30, the author counted 30 features above it, hence k=30.
# The selector is refitted and training_selected_features becomes a plain NumPy array.
kbest_selector = SelectKBest(k=30)
kbest_selector.fit(training_data_onehotencoded, training_label)
training_selected_features = kbest_selector.transform(training_data_onehotencoded)

In [None]:
# NumPy views of the selected features and labels; the cross-validation loops index these
# arrays positionally with the fold indices.
train_data_array, training_label_array = (
    np.asarray(source) for source in (training_selected_features, training_label)
)

In [None]:
# This portion of the code can be used when we want to do dimensionality reduction using PCA
# Both snippets below are wrapped in triple-quoted strings, so they are NOT executed: the
# cell only evaluates two string literals. To enable one, remove its surrounding quotes.

# Snippet 1: fit a full PCA and plot cumulative explained variance vs. number of components.
'''
# Performing PCA

from sklearn.decomposition import PCA
pca = PCA()
pca.fit(training_selected_features)

training_features_transformed = pca.transform(training_selected_features)

from matplotlib import pyplot as plt

plt.plot(np.cumsum(pca.explained_variance_ratio_))   # As can be seen we take 30 components which captures almost all the variance
plt.xlabel('Number of components')
plt.ylabel('Cumulative variance')
plt.grid('True')
'''

# Snippet 2: keep enough components for 95% of the variance and rebuild the training arrays.
# NOTE(review): as the last expression of the cell, this string is echoed as the cell output.
'''
from sklearn.decomposition import PCA
pca = PCA(0.95)   #Select those many components that capture 95% of the variance 
pca.fit(training_selected_features)

training_features_transformed = pca.transform(training_selected_features)

train_data_array=np.asarray(training_features_transformed)
training_label_array=np.asarray(training_label)
'''

In [None]:
# Load the test split and push it through the same preprocessing as the training data:
# NaN handling, label mapping, scaling with the fitted scaler, country binning,
# one-hot encoding, column alignment and finally feature selection.
test_data = pd.read_csv('drive/My Drive/adult_test_SMALLER.csv')
test_data = test_data.replace(' ?', np.nan)   # np.NaN alias was removed in NumPy 2.0

test_data.columns = [
    "Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
    "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
    "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"]   #for ease of human interpretation

# Independent copy of the complete rows (the original aliased test_data and mutated it in place).
testing_data = test_data.dropna(axis=0, how='any').copy()

# Test labels carry a trailing '.' (' <=50K.' / ' >50K.'), unlike the training labels.
testing_label = testing_data["Income"].map({' <=50K.': 0, ' >50K.': 1})
if testing_label.isna().any():
    # An unmapped label would otherwise only surface later as a NaN error inside the metrics.
    raise ValueError("Unexpected Income label(s) in test data: "
                     + repr(testing_data.loc[testing_label.isna(), "Income"].unique().tolist()))

# Drop the target and the redundant Education column, as for the training data.
testing_data = testing_data.drop(columns=["Income", "Education"])

# Normalise the numeric columns with the scaler fitted on the training data.
# From here on test_data holds the scaled numeric frame, indexed like testing_data.
test_data = pd.DataFrame(
    scaler.transform(testing_data.select_dtypes("int64")),
    columns=[
        "Age", "fnlwgt", "EducationNum",
        "CapitalGain", "CapitalLoss", "HoursPerWeek"],
    index=testing_data.index,
)

# Same US / non-US binning as applied to the training data.
testing_data['NativeCountry'] = testing_data['NativeCountry'].eq(' United-States').map(
    {True: ' United States', False: ' Out of United States'})

# One-hot encode the categorical columns and join them to the scaled numeric columns.
testing_data_categorical = pd.get_dummies(testing_data.select_dtypes('object')).astype('category')
testing_data_non_categorical = test_data
testing_data_onehotencoded = pd.concat(
    [testing_data_non_categorical, testing_data_categorical], axis=1, join='inner')

# Columns seen in training but absent from the test split (kept for inspection).
missing_test_col_set = set(training_data_onehotencoded.columns).difference(
    testing_data_onehotencoded.columns)

# Align the test matrix to the training layout: same columns, same order. Missing dummies
# are filled with 0 and test-only dummies are dropped. The selector picks columns by
# position, so appending missing columns at the end (as before) would feed it a
# differently ordered matrix.
testing_data_onehotencoded = testing_data_onehotencoded.reindex(
    columns=training_data_onehotencoded.columns, fill_value=0)

testing_selected_features = kbest_selector.transform(testing_data_onehotencoded)
# Below line of the code can be used when we want to do dimensionality reduction using PCA
#testing_features_transformed = pca.transform(testing_selected_features)


In [None]:
#Finding the optimal hyperparameters for SVM with linear kernel using cross validation
import sklearn
from sklearn import svm
from sklearn import model_selection,metrics
from statistics import mean

# 5-fold stratified CV. shuffle=True without random_state means the folds differ on every
# split() call, so results are not reproducible between runs.
fold = model_selection.StratifiedKFold(n_splits = 5,shuffle = True)
C_val = np.logspace(-2,1,num=10)
gamma_val = np.logspace(-2,1,num=10)

# gamma is only used by the 'rbf', 'poly' and 'sigmoid' kernels and is ignored by the linear
# kernel. Searching over it repeated every C ten times and picked a "best" gamma from fold
# noise. Only C is searched now; optimal_gamma stays defined (with a valid value) because
# the following cells pass gamma=optimal_gamma to SVC.
optimal_gamma = gamma_val[0]
optimal_C = -10
acc_max = -100

for current_C in C_val:
    print(current_C)
    temp = []
    for tr_idx, val_idx in fold.split(train_data_array, training_label_array):
        X_train, X_val = train_data_array[tr_idx], train_data_array[val_idx]
        y_train, y_val = training_label_array[tr_idx], training_label_array[val_idx]
        svm_clf = svm.SVC(C=current_C, kernel='linear')
        svm_clf.fit(X_train, y_train)
        y_pred = svm_clf.predict(X_val)
        acc_val = metrics.accuracy_score(y_val, y_pred)
        print(acc_val)
        temp.append(acc_val)

    # Compare the mean over all five folds against the best mean seen so far.
    mean_acc = mean(temp)
    if mean_acc > acc_max:
        acc_max = mean_acc
        optimal_C = current_C

# Note: the value printed as "Training Accuracy" is the best mean cross-validation accuracy.
print("Training Accuracy : ",acc_max)
print("Optimal Hyper parameters : gamma : ",optimal_gamma," C:",optimal_C)


In [None]:
#Testing using penalized SVM classifier with linear kernel
# class_weight='balanced' reweights the classes inversely to their frequency.
# gamma is passed for symmetry with the other kernels; the linear kernel ignores it.
svm_clf = svm.SVC(kernel='linear', C=optimal_C, gamma=optimal_gamma, class_weight='balanced')
svm_clf.fit(training_selected_features, training_label)

predicted_label = svm_clf.predict(testing_selected_features)
testing_accuracy = metrics.accuracy_score(testing_label, predicted_label)
print(testing_accuracy)
report = metrics.classification_report(testing_label, predicted_label)
print(report)

# ROC AUC must be computed from a continuous score. Feeding it the hard 0/1 predictions
# collapses the ROC curve to a single operating point.
decision_scores = svm_clf.decision_function(testing_selected_features)
auc_score = metrics.roc_auc_score(testing_label, decision_scores, average='macro')
print(auc_score)

In [None]:
#Testing using SVM classifier with linear kernel
# Unweighted counterpart of the penalized linear SVM; gamma is ignored by the linear kernel.
svm_clf = svm.SVC(kernel='linear', C=optimal_C, gamma=optimal_gamma)
svm_clf.fit(training_selected_features, training_label)

predicted_label = svm_clf.predict(testing_selected_features)
testing_accuracy = metrics.accuracy_score(testing_label, predicted_label)
print(testing_accuracy)
report = metrics.classification_report(testing_label, predicted_label)
print(report)

# ROC AUC must be computed from a continuous score. Feeding it the hard 0/1 predictions
# collapses the ROC curve to a single operating point.
decision_scores = svm_clf.decision_function(testing_selected_features)
auc_score = metrics.roc_auc_score(testing_label, decision_scores, average='macro')
print(auc_score)

In [None]:
# To visualize the performance difference between normal and weighted(penalized) SVM with linear kernel
# Adapted from the scikit-learn documentation example on unbalanced classes.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from sklearn import svm
from sklearn.datasets import make_blobs

# Two synthetic 2-D clusters whose sizes mirror the class counts of the training labels
# (value_counts is sorted by frequency, so the first entry is the majority class).
class_counts = training_label.value_counts()
n_samples_1 = int(class_counts.iloc[0])
n_samples_2 = int(class_counts.iloc[1])
centers = [[0.0, 0.0], [2.0, 2.0]]
clusters_std = [1.5, 0.5]
X, y = make_blobs(n_samples=[n_samples_1, n_samples_2],
                  centers=centers,
                  cluster_std=clusters_std,
                  random_state=0, shuffle=False)

# fit the model and get the separating hyperplane
clf = svm.SVC(kernel='linear', C=optimal_C,gamma=optimal_gamma)
clf.fit(X, y)

# fit the model and get the separating hyperplane using weighted classes
wclf = svm.SVC(kernel='linear', class_weight='balanced',C=optimal_C,gamma=optimal_gamma)
wclf.fit(X, y)

# plot the samples
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired, edgecolors='k')

# plot the decision functions for both classifiers
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()

# create a 30x30 grid over the current axes limits to evaluate both models on
xx = np.linspace(xlim[0], xlim[1], 30)
yy = np.linspace(ylim[0], ylim[1], 30)
YY, XX = np.meshgrid(yy, xx)
xy = np.vstack([XX.ravel(), YY.ravel()]).T

# decision boundary (level 0 of the decision function) of the unweighted model
Z = clf.decision_function(xy).reshape(XX.shape)
a = ax.contour(XX, YY, Z, colors='k', levels=[0], alpha=0.5, linestyles=['-'])

# decision boundary of the class-weighted model
Z = wclf.decision_function(xy).reshape(XX.shape)
b = ax.contour(XX, YY, Z, colors='r', levels=[0], alpha=0.5, linestyles=['-'])

# Proxy Line2D handles are used for the legend: ContourSet.collections (used before) is
# deprecated in recent Matplotlib releases.
plt.legend([Line2D([], [], color='k', alpha=0.5, linestyle='-'),
            Line2D([], [], color='r', alpha=0.5, linestyle='-')],
           ["non weighted", "weighted"],
           loc="upper right")
plt.show()

In [None]:
#Finding the optimal hyperparameters for SVM with rbf kernel using cross validation
import sklearn
from sklearn import svm
from sklearn import model_selection,metrics
from statistics import mean

# 5-fold stratified, shuffled CV over a 10x10 log-spaced grid of (gamma, C) in [1e-2, 1e1].
fold = model_selection.StratifiedKFold(n_splits=5, shuffle=True)
C_val = np.logspace(-2, 1, num=10)
gamma_val = np.logspace(-2, 1, num=10)
size_gamma = np.size(gamma_val)
size_c = np.size(C_val)

# Sentinels, overwritten by the first grid point evaluated.
optimal_gamma, optimal_C, acc_max = -10, -10, -100

for i, current_gamma in enumerate(gamma_val):
    for j, current_C in enumerate(C_val):
        print(current_gamma)
        print(current_C)

        # Validation accuracy of each of the five folds for this (gamma, C) pair.
        temp = []
        for tr_idx, val_idx in fold.split(train_data_array, training_label_array):
            X_train = train_data_array[tr_idx]
            X_val = train_data_array[val_idx]
            y_train = training_label_array[tr_idx]
            y_val = training_label_array[val_idx]

            svm_clf = svm.SVC(gamma=current_gamma, C=current_C, kernel='rbf')
            svm_clf.fit(X_train, y_train)
            y_pred = svm_clf.predict(X_val)

            acc_val = metrics.accuracy_score(y_val, y_pred)
            print(acc_val)
            temp.append(acc_val)

        # Keep the pair with the highest mean accuracy across all folds.
        fold_mean = mean(temp)
        if fold_mean > acc_max:
            acc_max, optimal_gamma, optimal_C = fold_mean, current_gamma, current_C

print("Training Accuracy : ",acc_max)
print("Optimal Hyper parameters : gamma : ",optimal_gamma," C:",optimal_C)


In [None]:
#Testing using SVM classifier with gaussian kernel
# Uses the (C, gamma) pair selected by the most recently run cross-validation cell.
svm_clf = svm.SVC(kernel='rbf', C=optimal_C, gamma=optimal_gamma)
svm_clf.fit(training_selected_features, training_label)

predicted_label = svm_clf.predict(testing_selected_features)

testing_accuracy = metrics.accuracy_score(testing_label, predicted_label)
print(testing_accuracy)
report = metrics.classification_report(testing_label, predicted_label)
print(report)

# ROC AUC must be computed from a continuous score. Feeding it the hard 0/1 predictions
# collapses the ROC curve to a single operating point.
decision_scores = svm_clf.decision_function(testing_selected_features)
auc_score = metrics.roc_auc_score(testing_label, decision_scores, average='macro')
print(auc_score)

In [None]:
#Testing using weighted(penalized) SVM classifier with gaussian kernel
# class_weight='balanced' reweights the classes inversely to their frequency.
svm_clf = svm.SVC(kernel='rbf', C=optimal_C, gamma=optimal_gamma, class_weight='balanced')
svm_clf.fit(training_selected_features, training_label)

predicted_label = svm_clf.predict(testing_selected_features)
testing_accuracy = metrics.accuracy_score(testing_label, predicted_label)
print(testing_accuracy)
report = metrics.classification_report(testing_label, predicted_label)
print(report)

# ROC AUC must be computed from a continuous score. Feeding it the hard 0/1 predictions
# collapses the ROC curve to a single operating point.
decision_scores = svm_clf.decision_function(testing_selected_features)
auc_score = metrics.roc_auc_score(testing_label, decision_scores, average='macro')
print(auc_score)

In [None]:
# To visualize the performance difference between normal and weighted(penalized) SVM with gaussian (rbf) kernel
# Adapted from the scikit-learn documentation example on unbalanced classes.

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from sklearn import svm
from sklearn.datasets import make_blobs

# Two synthetic 2-D clusters whose sizes mirror the class counts of the training labels
# (value_counts is sorted by frequency, so the first entry is the majority class).
class_counts = training_label.value_counts()
n_samples_1 = int(class_counts.iloc[0])
n_samples_2 = int(class_counts.iloc[1])
centers = [[0.0, 0.0], [2.0, 2.0]]
clusters_std = [1.5, 0.5]
X, y = make_blobs(n_samples=[n_samples_1, n_samples_2],
                  centers=centers,
                  cluster_std=clusters_std,
                  random_state=0, shuffle=False)

# fit the unweighted model
clf = svm.SVC(kernel='rbf', C=optimal_C,gamma=optimal_gamma)
clf.fit(X, y)

# fit the class-weighted model
wclf = svm.SVC(kernel='rbf', class_weight='balanced',C=optimal_C,gamma=optimal_gamma)
wclf.fit(X, y)

# plot the samples
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired, edgecolors='k')

# plot the decision functions for both classifiers
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()

# create a 30x30 grid over the current axes limits to evaluate both models on
xx = np.linspace(xlim[0], xlim[1], 30)
yy = np.linspace(ylim[0], ylim[1], 30)
YY, XX = np.meshgrid(yy, xx)
xy = np.vstack([XX.ravel(), YY.ravel()]).T

# decision boundary (level 0 of the decision function) of the unweighted model
Z = clf.decision_function(xy).reshape(XX.shape)
a = ax.contour(XX, YY, Z, colors='k', levels=[0], alpha=0.5, linestyles=['-'])

# decision boundary of the class-weighted model
Z = wclf.decision_function(xy).reshape(XX.shape)
b = ax.contour(XX, YY, Z, colors='y', levels=[0], alpha=0.5, linestyles=['-'])

# Proxy Line2D handles are used for the legend: ContourSet.collections (used before) is
# deprecated in recent Matplotlib releases.
plt.legend([Line2D([], [], color='k', alpha=0.5, linestyle='-'),
            Line2D([], [], color='y', alpha=0.5, linestyle='-')],
           ["non weighted", "weighted"],
           loc="upper right")
plt.show()

In [None]:
#Find best parameters for SVM with 'polynomial' kernel using cross validation
import sklearn
from sklearn import svm
from sklearn import model_selection,metrics
from statistics import mean

# 5-fold stratified, shuffled CV over a 10x10 log-spaced grid of (gamma, C) in [1e-2, 1e1].
fold = model_selection.StratifiedKFold(n_splits=5, shuffle=True)
C_val = np.logspace(-2, 1, num=10)
gamma_val = np.logspace(-2, 1, num=10)
size_gamma = np.size(gamma_val)
size_c = np.size(C_val)

# Sentinels, overwritten by the first grid point evaluated.
optimal_gamma, optimal_C, acc_max = -10, -10, -100

for i, current_gamma in enumerate(gamma_val):
    for j, current_C in enumerate(C_val):
        print(current_gamma)
        print(current_C)

        # Validation accuracy of each of the five folds for this (gamma, C) pair.
        temp = []
        for tr_idx, val_idx in fold.split(train_data_array, training_label_array):
            X_train = train_data_array[tr_idx]
            X_val = train_data_array[val_idx]
            y_train = training_label_array[tr_idx]
            y_val = training_label_array[val_idx]

            svm_clf = svm.SVC(gamma=current_gamma, C=current_C, kernel='poly')
            svm_clf.fit(X_train, y_train)
            y_pred = svm_clf.predict(X_val)

            acc_val = metrics.accuracy_score(y_val, y_pred)
            print(acc_val)
            temp.append(acc_val)

        # Keep the pair with the highest mean accuracy across all folds.
        fold_mean = mean(temp)
        if fold_mean > acc_max:
            acc_max, optimal_gamma, optimal_C = fold_mean, current_gamma, current_C

print("Training Accuracy : ",acc_max)
print("Optimal Hyper parameters : gamma : ",optimal_gamma," C:",optimal_C)


In [None]:
#Testing using SVM classifier with polynomial kernel
# Uses the (C, gamma) pair selected by the most recently run cross-validation cell.
svm_clf = svm.SVC(kernel='poly', C=optimal_C, gamma=optimal_gamma)
svm_clf.fit(training_selected_features, training_label)

predicted_label = svm_clf.predict(testing_selected_features)
testing_accuracy = metrics.accuracy_score(testing_label, predicted_label)
print(testing_accuracy)
report = metrics.classification_report(testing_label, predicted_label)
print(report)

# ROC AUC must be computed from a continuous score. Feeding it the hard 0/1 predictions
# collapses the ROC curve to a single operating point.
decision_scores = svm_clf.decision_function(testing_selected_features)
auc_score = metrics.roc_auc_score(testing_label, decision_scores, average='macro')
print(auc_score)

In [None]:
#The cross validation part on reduced dataset
import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn import model_selection,metrics
from statistics import mean

#Find best parameters for MLP using cross validation
# One hidden layer; its width is searched over 1..99 with 5-fold stratified, shuffled CV.
folds = model_selection.StratifiedKFold(n_splits=5, shuffle=True)
hidden_layers_val = np.arange(1, 100)

# Sentinels, overwritten by the first candidate evaluated.
acc_max, hidden_layer_opt = -1000, -10

for i in hidden_layers_val:
    print(i)

    # Validation accuracy of each fold for this hidden-layer width.
    temp = []
    for tr_ind, v_ind in folds.split(train_data_array, training_label_array):
        X_train = train_data_array[tr_ind]
        X_val = train_data_array[v_ind]
        y_train = training_label_array[tr_ind]
        y_val = training_label_array[v_ind]

        mlp_clf = MLPClassifier(hidden_layer_sizes=i, max_iter=1000)
        mlp_clf.fit(X_train, y_train)
        y_pred = mlp_clf.predict(X_val)

        acc_val = metrics.accuracy_score(y_val, y_pred)
        print(acc_val)
        temp.append(acc_val)

    # Keep the width with the highest mean accuracy across all folds.
    fold_mean = mean(temp)
    if fold_mean > acc_max:
        acc_max, hidden_layer_opt = fold_mean, i

print("Training Accuracy : ",acc_max)
print("Optimal Value : Hidden layer size : ",hidden_layer_opt)
   


In [None]:
#Using the optimal parameters to train and test the MLP Classifier

import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn import model_selection,metrics
from statistics import mean

# NOTE(review): this hard-coded value overrides whatever the cross-validation cell selected;
# presumably it is the width found in an earlier run -- confirm, or remove the override.
hidden_layer_opt=18
#Testing using MLP classifier
# NOTE(review): the CV search trained with max_iter=1000, this final model uses the default.
mlp_clf = MLPClassifier(hidden_layer_sizes=hidden_layer_opt)
mlp_clf.fit(training_selected_features, training_label)

predicted_label = mlp_clf.predict(testing_selected_features)
testing_accuracy = metrics.accuracy_score(testing_label, predicted_label)
print(testing_accuracy)
report = metrics.classification_report(testing_label, predicted_label)
print(report)

# ROC AUC must be computed from a continuous score (probability of the positive class, 1).
# Feeding it the hard 0/1 predictions collapses the ROC curve to a single operating point.
positive_class_proba = mlp_clf.predict_proba(testing_selected_features)[:, 1]
auc_score = metrics.roc_auc_score(testing_label, positive_class_proba, average='macro')
print(auc_score)

In [None]:
#Using Naive Bayes (Gaussian) classifier

import sklearn
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection,metrics
from statistics import mean

# No hyperparameters are tuned here; the classifier is fitted with its defaults.
nb_clf = GaussianNB()
nb_clf.fit(training_selected_features, training_label)

predicted_label = nb_clf.predict(testing_selected_features)
testing_accuracy = metrics.accuracy_score(testing_label, predicted_label)
print(testing_accuracy)
report = metrics.classification_report(testing_label, predicted_label)
print(report)

# ROC AUC must be computed from a continuous score (probability of the positive class, 1).
# Feeding it the hard 0/1 predictions collapses the ROC curve to a single operating point.
positive_class_proba = nb_clf.predict_proba(testing_selected_features)[:, 1]
auc_score = metrics.roc_auc_score(testing_label, positive_class_proba, average='macro')
print(auc_score)

In [None]:
#Find best parameters for Decision Tree using cross validation

import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn import model_selection,metrics
from statistics import mean

# max_depth is searched over 1..99 with 5-fold stratified, shuffled CV.
folds = model_selection.StratifiedKFold(n_splits = 5,shuffle = True)
depth_val = np.arange(1,100)
acc_max = -1000
opt_dep = -10
for current_depth in depth_val:
    temp = []
    for tr_ind, v_ind in folds.split(train_data_array, training_label_array):
        X_train, X_val = train_data_array[tr_ind], train_data_array[v_ind]
        y_train, y_val = training_label_array[tr_ind], training_label_array[v_ind]
        dectree_clf = DecisionTreeClassifier(max_depth=current_depth)
        dectree_clf.fit(X_train, y_train)
        y_pred = dectree_clf.predict(X_val)
        acc_val = metrics.accuracy_score(y_val, y_pred)
        temp.append(acc_val)

    # Compare only after ALL folds are in. The check used to sit inside the fold loop, so a
    # single lucky first fold (a partial mean) could win the search and inflate acc_max.
    # The SVM and MLP searches in this notebook already compare the full-fold mean.
    mean_acc = mean(temp)
    if mean_acc > acc_max:
        acc_max = mean_acc
        opt_dep = current_depth

# Note: the value printed as "Training Accuracy" is the best mean cross-validation accuracy.
print("Training Accuracy : ",acc_max)
print("Optimal Value : Max depth : ",opt_dep)
    


In [None]:
#Decision Tree on test data
# Uses the max_depth selected by the decision-tree cross-validation cell.
dectree_clf = DecisionTreeClassifier(max_depth=opt_dep)
dectree_clf.fit(training_selected_features, training_label)

predicted_label = dectree_clf.predict(testing_selected_features)
testing_accuracy = metrics.accuracy_score(testing_label, predicted_label)
print(testing_accuracy)
report = metrics.classification_report(testing_label, predicted_label)
print(report)

# ROC AUC must be computed from a continuous score (probability of the positive class, 1).
# Feeding it the hard 0/1 predictions collapses the ROC curve to a single operating point.
positive_class_proba = dectree_clf.predict_proba(testing_selected_features)[:, 1]
auc_score = metrics.roc_auc_score(testing_label, positive_class_proba, average='macro')
print(auc_score)

In [None]:
#Find best parameters for Random Forest using cross validation

import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection,metrics
from statistics import mean

# n_estimators is searched over 1..99 with 5-fold stratified, shuffled CV.
folds = model_selection.StratifiedKFold(n_splits = 5,shuffle = True)
estimators_num = np.arange(1,100)
acc_max = -1000
opt_est = -10
for current_estimators in estimators_num:
    temp = []
    for tr_ind, v_ind in folds.split(train_data_array, training_label_array):
        X_train, X_val = train_data_array[tr_ind], train_data_array[v_ind]
        y_train, y_val = training_label_array[tr_ind], training_label_array[v_ind]
        rand_forest_clf = RandomForestClassifier(n_estimators=current_estimators)
        rand_forest_clf.fit(X_train, y_train)
        y_pred = rand_forest_clf.predict(X_val)
        acc_val = metrics.accuracy_score(y_val, y_pred)
        temp.append(acc_val)

    # Compare only after ALL folds are in. The check used to sit inside the fold loop, so a
    # single lucky first fold (a partial mean) could win the search and inflate acc_max.
    # The SVM and MLP searches in this notebook already compare the full-fold mean.
    mean_acc = mean(temp)
    if mean_acc > acc_max:
        acc_max = mean_acc
        opt_est = current_estimators

# Note: the value printed as "Training Accuracy" is the best mean cross-validation accuracy.
print("Training Accuracy : ",acc_max)
print("Optimal Value : No.of estimators : ",opt_est)
    

In [None]:
#Random Forest on test data
# Uses the n_estimators selected by the random-forest cross-validation cell.
rand_forest_clf = RandomForestClassifier(n_estimators=opt_est)
rand_forest_clf.fit(training_selected_features, training_label)

predicted_label = rand_forest_clf.predict(testing_selected_features)
testing_accuracy = metrics.accuracy_score(testing_label, predicted_label)
print(testing_accuracy)
report = metrics.classification_report(testing_label, predicted_label)
print(report)

# ROC AUC must be computed from a continuous score (probability of the positive class, 1).
# Feeding it the hard 0/1 predictions collapses the ROC curve to a single operating point.
positive_class_proba = rand_forest_clf.predict_proba(testing_selected_features)[:, 1]
auc_score = metrics.roc_auc_score(testing_label, positive_class_proba, average='macro')
print(auc_score)

In [None]:
#Find best parameters for KNN using cross validation

import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection,metrics
from statistics import mean

# n_neighbors is searched over 1..24 with 5-fold stratified, shuffled CV.
folds = model_selection.StratifiedKFold(n_splits = 5,shuffle = True)
neighbors_num = np.arange(1,25)
acc_max = -1000
neighbors_opt = -10
for current_neighbors in neighbors_num:
    temp = []
    for tr_ind, v_ind in folds.split(train_data_array, training_label_array):
        X_train, X_val = train_data_array[tr_ind], train_data_array[v_ind]
        y_train, y_val = training_label_array[tr_ind], training_label_array[v_ind]
        knn_clf = KNeighborsClassifier(n_neighbors=current_neighbors)
        knn_clf.fit(X_train, y_train)
        y_pred = knn_clf.predict(X_val)
        acc_val = metrics.accuracy_score(y_val, y_pred)
        temp.append(acc_val)

    # Compare only after ALL folds are in. The check used to sit inside the fold loop, so a
    # single lucky first fold (a partial mean) could win the search and inflate acc_max.
    # The SVM and MLP searches in this notebook already compare the full-fold mean.
    mean_acc = mean(temp)
    if mean_acc > acc_max:
        acc_max = mean_acc
        neighbors_opt = current_neighbors

# Note: the value printed as "Training Accuracy" is the best mean cross-validation accuracy.
print("Optimal Value : No.of neighbors : ",neighbors_opt)
print("Training Accuracy : ",acc_max)
    

In [None]:
#KNN on test data
# Uses the n_neighbors selected by the KNN cross-validation cell.
knn_clf = KNeighborsClassifier(n_neighbors=neighbors_opt)
knn_clf.fit(training_selected_features, training_label)

predicted_label = knn_clf.predict(testing_selected_features)
testing_accuracy = metrics.accuracy_score(testing_label, predicted_label)
print(testing_accuracy)
report = metrics.classification_report(testing_label, predicted_label)
print(report)

# ROC AUC must be computed from a continuous score (probability of the positive class, 1).
# Feeding it the hard 0/1 predictions collapses the ROC curve to a single operating point.
positive_class_proba = knn_clf.predict_proba(testing_selected_features)[:, 1]
auc_score = metrics.roc_auc_score(testing_label, positive_class_proba, average='macro')
print(auc_score)