In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from random import randint
%matplotlib inline 
import warnings
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split



In [5]:
warnings.filterwarnings("ignore")


def split(df,label):
    X_tr, X_te, Y_tr, Y_te = train_test_split(df, label, test_size=0.25, random_state=42)
    return X_tr, X_te, Y_tr, Y_te


classifiers = ['LinearSVM', 'RadialSVM', 
               'Logistic',  'RandomForest', 
               'AdaBoost',  'DecisionTree', 
               'KNeighbors','GradientBoosting']

models = [svm.SVC(kernel='linear'),
          svm.SVC(kernel='rbf'),
          LogisticRegression(max_iter = 1000),
          RandomForestClassifier(n_estimators=200, random_state=0),
          AdaBoostClassifier(random_state = 0),
          DecisionTreeClassifier(random_state=0),
          KNeighborsClassifier(),
          GradientBoostingClassifier(random_state=0)]


def acc_score(df,label):
    Score = pd.DataFrame({"Classifier":classifiers})
    j = 0
    acc = []
    X_train,X_test,Y_train,Y_test = split(df,label)
    for i in models:
        model = i
        model.fit(X_train,Y_train)
        predictions = model.predict(X_test)
        acc.append(accuracy_score(Y_test,predictions))
        j = j+1     
    Score["Accuracy"] = acc
    Score.sort_values(by="Accuracy", ascending=False,inplace = True)
    Score.reset_index(drop=True, inplace=True)
    return Score

# def plot(score,x,y,c = "b"):
#     gen = [1,2,3,4,5]
#     plt.figure(figsize=(6,4))
#     ax = sns.pointplot(x=gen, y=score,color = c )
#     ax.set(xlabel="Generation", ylabel="Accuracy")
#     ax.set(ylim=(x,y))
    
def plot(score,x,y,c = "b"):
    gen = [1,2,3,4,5,6,7,8,9,10]
    plt.figure(figsize=(6,4))
    ax = sns.pointplot(x=gen, y=score,color = c )
    ax.set(xlabel="Generation", ylabel="Accuracy")
    ax.set(ylim=(x,y))
    

In [8]:
def initilization_of_population(size,n_feat):
    population = []
    for i in range(size):
        chromosome = np.ones(n_feat,dtype=np.bool)     
        chromosome[:int(0.3*n_feat)]=False             
        np.random.shuffle(chromosome)
        population.append(chromosome)
    return population


def fitness_score(population):
    scores = []
    for chromosome in population:
        logmodel2.fit(X_train.iloc[:,chromosome],Y_train)         
        predictions = logmodel2.predict(X_test.iloc[:,chromosome])
        scores.append(accuracy_score(Y_test,predictions))
    scores, population = np.array(scores), np.array(population) 
    inds = np.argsort(scores)                                    
    return list(scores[inds][::-1]), list(population[inds,:][::-1]) 


def selection(pop_after_fit,n_parents):
    population_nextgen = []
    for i in range(n_parents):
        population_nextgen.append(pop_after_fit[i])
    return population_nextgen


def crossover(pop_after_sel):
    pop_nextgen = pop_after_sel
    for i in range(0,len(pop_after_sel),2):
        new_par = []
        child_1 , child_2 = pop_nextgen[i] , pop_nextgen[i+1]
        new_par = np.concatenate((child_1[:len(child_1)//2],child_2[len(child_1)//2:]))
        pop_nextgen.append(new_par)
    return pop_nextgen


def mutation(pop_after_cross,mutation_rate,n_feat):   
    mutation_range = int(mutation_rate*n_feat)
    pop_next_gen = []
    for n in range(0,len(pop_after_cross)):
        chromo = pop_after_cross[n]
        rand_posi = [] 
        for i in range(0,mutation_range):
            pos = randint(0,n_feat-1)
            rand_posi.append(pos)
        for j in rand_posi:
            chromo[j] = not chromo[j]  
        pop_next_gen.append(chromo)
    return pop_next_gen

def generations(df,label,size,n_feat,n_parents,mutation_rate,n_gen,X_train,
                                   X_test, Y_train, Y_test):
    best_chromo= []
    best_score= []
    population_nextgen=initilization_of_population(size,n_feat)
    for i in range(n_gen):
        scores, pop_after_fit = fitness_score(population_nextgen)
        print('Best score in generation',i+1,':',scores[:1])  #2
        pop_after_sel = selection(pop_after_fit,n_parents)
        pop_after_cross = crossover(pop_after_sel)
        population_nextgen = mutation(pop_after_cross,mutation_rate,n_feat)
        best_chromo.append(pop_after_fit[0])
        best_score.append(scores[0])
    return best_chromo,best_score

In [9]:
data_bc = pd.read_csv("/kaggle/input/csv-files/dataa_500.csv")
label_bc = data_bc["class"]
data_bc.drop(["id", "class"],axis = 1,inplace = True)

In [10]:
score2 = acc_score(data_bc,label_bc)
score2

Unnamed: 0,Classifier,Accuracy
0,RandomForest,0.972028
1,LinearSVM,0.965035
2,Logistic,0.965035
3,KNeighbors,0.958042
4,GradientBoosting,0.958042
5,AdaBoost,0.951049
6,DecisionTree,0.951049
7,RadialSVM,0.944056


In [9]:
score2 = acc_score(data_bc2,label_bc2)
score2

NameError: name 'data_bc2' is not defined

In [None]:
score1 = acc_score(data_bc2,label_bc2)
score1

In [None]:
# logmodel = AdaBoostClassifier(n_estimators=200, random_state=0)
X_train,X_test, Y_train, Y_test = split(data_bc2,label_bc2)
# chromo_df_bc,score_bc=generations(data_bc,label_bc,size=80,n_feat=data_bc.shape[1],n_parents=64,mutation_rate=0.20,n_gen=5,
#                          X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

In [None]:
logmodel2 = LogisticRegression(max_iter = 1000)
X_train,X_test, Y_train, Y_test = split(data_bc,label_bc)
chromo_df_bc,score_bc=generations(data_bc,label_bc,size=80,n_feat=data_bc.shape[1],n_parents=64,mutation_rate=0.20,n_gen=10,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

In [None]:
logmodel2 = AdaBoostClassifier(random_state=0)
X_train,X_test, Y_train, Y_test = split(data_bc,label_bc)
chromo_df_bc,score_bc=generations(data_bc,label_bc,size=80,n_feat=data_bc.shape[1],n_parents=64,mutation_rate=0.20,n_gen=10,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

In [10]:
#600_features

logmodel2 = RandomForestClassifier(n_estimators=200, random_state=0)
X_train,X_test, Y_train, Y_test = split(data_bc,label_bc)
chromo_df_bc,score_bc=generations(data_bc,label_bc,size=80,n_feat=data_bc.shape[1],n_parents=64,mutation_rate=0.20,n_gen=5,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

Best score in generation 1 : [0.993006993006993]
Best score in generation 2 : [0.986013986013986]
Best score in generation 3 : [0.993006993006993]
Best score in generation 4 : [0.993006993006993]
Best score in generation 5 : [0.993006993006993]


In [None]:
logmodel2 = RandomForestClassifier(n_estimators=200, random_state=0)
X_train,X_test, Y_train, Y_test = split(data_bc,label_bc)
chromo_df_bc,score_bc=generations(data_bc,label_bc,size=80,n_feat=data_bc.shape[1],n_parents=64,mutation_rate=0.20,n_gen=5,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

In [None]:
#500_features

logmodel2 = RandomForestClassifier(n_estimators=200, random_state=0)
X_train,X_test, Y_train, Y_test = split(data_bc,label_bc)
chromo_df_bc,score_bc=generations(data_bc,label_bc,size=80,n_feat=data_bc.shape[1],n_parents=64,mutation_rate=0.20,n_gen=10,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

In [None]:
##500_features

logmodel2 = RandomForestClassifier(n_estimators=200, random_state=0)
X_train,X_test, Y_train, Y_test = split(data_bc,label_bc)
chromo_df_bc,score_bc=generations(data_bc,label_bc,size=80,n_feat=data_bc.shape[1],n_parents=64,mutation_rate=0.20,n_gen=5,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

In [None]:
logmodel2 = RandomForestClassifier(n_estimators=200, random_state=0)
X_train,X_test, Y_train, Y_test = split(data_bc,label_bc)
chromo_df_bc,score_bc=generations(data_bc,label_bc,size=80,n_feat=data_bc.shape[1],n_parents=64,mutation_rate=0.20,n_gen=5,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

In [None]:
logmodel2 = RandomForestClassifier(n_estimators=200, random_state=0)
X_train,X_test, Y_train, Y_test = split(data_bc,label_bc)
chromo_df_bc,score_bc=generations(data_bc,label_bc,size=80,n_feat=data_bc.shape[1],n_parents=64,mutation_rate=0.20,n_gen=10,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

In [None]:
plot2(score_bc,0.90,1,2.5,0.91,c = "gold")

In [None]:
logmodel2 = RandomForestClassifier(n_estimators=200, random_state=0)
X_train,X_test, Y_train, Y_test = split(data_bc,label_bc)
chromo_df_bc,score_bc=generations(data_bc,label_bc,size=80,n_feat=data_bc.shape[1],n_parents=64,mutation_rate=0.20,n_gen=10,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

In [None]:
logmodel2 = RandomForestClassifier(n_estimators=200, random_state=0)
X_train,X_test, Y_train, Y_test = split(data_bc,label_bc)
chromo_df_bc,score_bc=generations(data_bc,label_bc,size=80,n_feat=data_bc.shape[1],n_parents=64,mutation_rate=0.20,n_gen=10,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

In [None]:
logmodel2 = RandomForestClassifier(n_estimators=200, random_state=0)
X_train,X_test, Y_train, Y_test = split(data_bc,label_bc)
chromo_df_bc,score_bc=generations(data_bc,label_bc,size=80,n_feat=data_bc.shape[1],n_parents=64,mutation_rate=0.20,n_gen=10,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

In [None]:
logmodel2 = RandomForestClassifier(n_estimators=200, random_state=0)
X_train,X_test, Y_train, Y_test = split(data_bc,label_bc)
chromo_df_bc,score_bc=generations(data_bc,label_bc,size=80,n_feat=data_bc.shape[1],n_parents=64,mutation_rate=0.20,n_gen=10,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

In [None]:
logmodel2 = DecisionTreeClassifier(random_state=0)
X_train,X_test, Y_train, Y_test = split(data_bc,label_bc)
chromo_df_bc,score_bc=generations(data_bc,label_bc,size=80,n_feat=data_bc.shape[1],n_parents=64,mutation_rate=0.20,n_gen=10,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

In [None]:

logmodel2 = KNeighborsClassifier()
X_train,X_test, Y_train, Y_test = split(data_bc,label_bc)
chromo_df_bc,score_bc=generations(data_bc,label_bc,size=80,n_feat=data_bc.shape[1],n_parents=64,mutation_rate=0.20,n_gen=10,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

In [None]:
logmodel2 = svm.SVC(kernel='rbf')
X_train,X_test, Y_train, Y_test = split(data_bc,label_bc)
chromo_df_bc,score_bc=generations(data_bc,label_bc,size=80,n_feat=data_bc.shape[1],n_parents=64,mutation_rate=0.20,n_gen=10,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

In [None]:
logmodel2 = svm.SVC(kernel='linear')
X_train,X_test, Y_train, Y_test = split(data_bc,label_bc)
chromo_df_bc,score_bc=generations(data_bc,label_bc,size=80,n_feat=data_bc.shape[1],n_parents=64,mutation_rate=0.20,n_gen=10,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

In [None]:
logmodel2 = DecisionTreeClassifier(random_state=0)
X_train,X_test, Y_train, Y_test = split(data_bc,label_bc)
chromo_df_bc,score_bc=generations(data_bc,label_bc,size=80,n_feat=data_bc.shape[1],n_parents=64,mutation_rate=0.20,n_gen=10,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

### 3. Visualization

In [None]:
plot(score_bc,0.9,1.0,c = "gold")

In [11]:
print(logmodel2.score(X_test, Y_test))

ValueError: X has 32 features, but RandomForestClassifier is expecting 16 features as input.

In [None]:
from tpot import TPOTClassifier

In [None]:
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
digits = load_digits()
X_train, X_test, Y_train, Y_test = train_test_split(digits.data, digits.target,
#                                                     train_size=0.75, test_size=0.25)

tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
tpot.fit(X_train, Y_train)
print(tpot.score(X_test, Y_test))

In [None]:
y_pred=logmodel2.predict(X_train)

In [None]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [10,11,12,13,14,15,16]
max_depth = [5,10,20,30,40,50,60]
criterion=['entropy', 'gini']
min_samples_leaf=[1, 2, 5, 10]
min_samples_split=[2, 5, 10, 15]
max_features = ['auto', 'sqrt','log2']


param = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(param)


In [None]:
from tpot import TPOTClassifier


tpot_classifier = TPOTClassifier(generations= 5, population_size= 24, offspring_size= 12,
                                 verbosity= 2, early_stop= 12,
                                 config_dict={'sklearn.ensemble.RandomClassifier': param}, 
                                 cv = 4, scoring = 'accuracy')
tpot_classifier.fit(X_train,Y_train)

In [None]:

print("Accuracy on test set: %0.3f%%"%(accuracy_score(Y_test, y_pred)*100))
from sklearn.metrics import f1_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
print("F1-Score on test set: %0.3f"%(f1_score(Y_test, y_pred)))
print("-"*20, "confusion matrix", "-"*20)
plt.figure(figsize=(8,8))
df_cm = pd.DataFrame(confusion_matrix(Y_test, y_pred), range(2),range(2))
sns.set(font_scale=1.4)#for label size
sns.heatmap(df_cm, annot=True,annot_kws={"size": 16}, fmt='g')
plt.xlabel('Predicted Class')
plt.ylabel('Original Class')
plt.show()

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
print("F1-Score on test set: %0.3f"%(f1_score(Y_test, y_pred)))
print("-"*20, "confusion matrix", "-"*20)
plt.figure(figsize=(8,8))
df_cm = pd.DataFrame(confusion_matrix(Y_test, y_pred), range(2),range(2))
sns.set(font_scale=1.4)#for label size
sns.heatmap(df_cm, annot=True,annot_kws={"size": 16}, fmt='g')
plt.xlabel('Predicted Class')
plt.ylabel('Original Class')
plt.show()

In [None]:

cm1 = confusion_matrix(y_pred, y_test)
print(cm1)
print(accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred))

_____
# Parkinson's disease
_____

### 1. Looking at dataset

In [None]:
# data_pd = pd.read_csv("../input/parkinson-disease-detection/Parkinsson disease.csv")
# label_pd = data_pd["status"]
# data_pd.drop(["status","name"],axis = 1,inplace = True)

# print("Parkinson's disease dataset:\n",data_pd.shape[0],"Records\n",data_pd.shape[1],"Features")

In [None]:
data_pd3 = pd.read_csv("/kaggle/input/csv-files/data4.csv")
label_pd3 = data_pd3["diagnosis"]
data_pd3.drop(["id","diagnosis"],axis = 1,inplace = True)

print("disease dataset:\n",data_pd3.shape[0],"Records\n",data_pd3.shape[1],"Features")

In [None]:
display(data_pd3.head())
print("All the features in the dataset")

### 2. Checking Accuracy

In [None]:
score3 = acc_score(data_pd3,label_pd3)
score3

In [None]:

logmodel = AdaBoostClassifier(random_state=0)
X_train,X_test, Y_train, Y_test = split(data_pd3,label_pd3)
chromo_df_pd,score_pd=generations(data_pd3,label_pd3,size=80,n_feat=data_pd3.shape[1],n_parents=64,mutation_rate=0.20,n_gen=10,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

In [None]:

logmodel = RandomForestClassifier(random_state=0)
X_train,X_test, Y_train, Y_test = split(data_pd2,label_pd2)
chromo_df_pd,score_pd=generations(data_pd2,label_pd2,size=80,n_feat=data_pd2.shape[1],n_parents=64,mutation_rate=0.20,n_gen=10,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

In [None]:
logmodel = DecisionTreeClassifier(random_state=0)
X_train,X_test, Y_train, Y_test = split(data_pd2,label_pd2)
chromo_df_pd,score_pd=generations(data_pd2,label_pd2,size=80,n_feat=data_pd2.shape[1],n_parents=64,mutation_rate=0.20,n_gen=5,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

In [None]:

logmodel = RandomForestClassifier(random_state=0)
X_train,X_test, Y_train, Y_test = split(data_pd2,label_pd2)
chromo_df_pd,score_pd=generations(data_pd2,label_pd2,size=80,n_feat=data_pd2.shape[1],n_parents=64,mutation_rate=0.20,n_gen=5,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

In [None]:
y_pred=logmodel2.predict(X_test)

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
print("F1-Score on test set: %0.3f"%(f1_score(Y_test, y_pred)))
print("-"*20, "confusion matrix", "-"*20)
plt.figure(figsize=(8,8))
df_cm = pd.DataFrame(confusion_matrix(Y_test, y_pred), range(2),range(2))
sns.set(font_scale=1.4)#for label size
sns.heatmap(df_cm, annot=True,annot_kws={"size": 16}, fmt='g')
plt.xlabel('Predicted Class')
plt.ylabel('Original Class')
plt.show()

### 3. Visualization

In [None]:
plot(score_pd,0.9,1.0,c = "orange")

### 1. Looking at dataset

In [None]:
data_pcos = pd.read_csv("../input/pcos-dataset/PCOS_data.csv")
label_pcos = data_pcos["PCOS (Y/N)"]
data_pcos.drop(["Sl. No","Patient File No.","PCOS (Y/N)","Unnamed: 44","II    beta-HCG(mIU/mL)","AMH(ng/mL)"],axis = 1,inplace = True)
data_pcos["Marraige Status (Yrs)"].fillna(data_pcos['Marraige Status (Yrs)'].describe().loc[['50%']][0], inplace = True) 
data_pcos["Fast food (Y/N)"].fillna(1, inplace = True) 

print("PCOS dataset:\n",data_pcos.shape[0],"Records\n",data_pcos.shape[1],"Features")

In [None]:
display(data_pcos.head())
print("The features in this dataset have both discrete and continuous values")

### 2. Checking Accuracy

In [None]:
score4 = acc_score(data_pcos,label_pcos)
score4

In [None]:
logmodel = RandomForestClassifier(n_estimators=200, random_state=0)
X_train,X_test, Y_train, Y_test = split(data_pcos,label_pcos)
chromo_df_pcos,score_pcos=generations(data_pcos,label_pcos,size=80,n_feat=data_pcos.shape[1],n_parents=64,mutation_rate=0.20,n_gen=5,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

#### We can see an improvement of 3-4%

### 3. Visualization

In [None]:
plot(score_pcos,0.9,1.0,c = "limegreen")