<a href="https://colab.research.google.com/github/RafsanJany-44/Machine-School/blob/main/K_fold_CV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
# Alternative dataset, kept for reference:
# data = 'https://raw.githubusercontent.com/RafsanJany-44/Machine-School/main/datasets/HMC_WITH_BIOM.csv'

# Read the REM_NREM CSV straight from its raw GitHub URL into a DataFrame.
data = "https://raw.githubusercontent.com/RafsanJany-44/Research-NREM-REM/main/dataset/REM_NREM.csv"
dataset = pd.read_csv(filepath_or_buffer=data)


# Splitting

In [2]:
from sklearn.model_selection import train_test_split

# Column 0 is used as the target; every remaining column is a feature.
y = dataset.iloc[:, 0].values
X = dataset.iloc[:, 1:].values

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)


# Models without KFCV

**Logistic Regression,SVM,Random Forest**

In [3]:
# Baseline: fit each classifier once on the single hold-out split and report
# its score on the test portion.
lr = LogisticRegression(solver='liblinear', multi_class='ovr')
lr.fit(X_train, y_train)
print("Logistic Reg Accuracy: ", lr.score(X_test, y_test))


svm = SVC(gamma='auto')
svm.fit(X_train, y_train)
print("SVM Accuracy: ", svm.score(X_test, y_test))

# The forest is randomised; seeding it makes this cell print the same
# accuracy on every Restart & Run All.
rf = RandomForestClassifier(n_estimators=40, random_state=0)
rf.fit(X_train, y_train)
print("Random Forest: ", rf.score(X_test, y_test))


Logistic Reg Accuracy:  0.843105911162506
SVM Accuracy:  0.9193932385552003
Random Forest:  0.9233045371063193


<h2 style='color:purple'>KFold cross validation</h2>

**Basic example**

In [4]:
from sklearn.model_selection import KFold

# Three folds; shuffle and random_state are written out at their default
# values so the configuration is visible in the code itself.
kf = KFold(n_splits=3, shuffle=False, random_state=None)
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [5]:
# Print the train/test index arrays that each fold produces for a 9-item toy list.
for train_index, test_index in kf.split(list(range(1, 10))):
    print(train_index, test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


**Manual Function**

In [6]:
def get_score(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and return its score on the test data.

    Args:
        model: Object exposing ``fit(X, y)`` and ``score(X, y)``.
        X_train: Training features passed to ``model.fit``.
        X_test: Test features passed to ``model.score``.
        y_train: Training targets passed to ``model.fit``.
        y_test: Test targets passed to ``model.score``.

    Returns:
        Whatever ``model.score(X_test, y_test)`` returns.
    """
    model.fit(X_train, y_train)
    result = model.score(X_test, y_test)
    return result


from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=3)

scores_logistic = []
scores_svm = []
scores_rf = []

# Fold-local names are used on purpose: rebinding X_train / X_test / y_train /
# y_test inside this loop would overwrite the hold-out split and silently change
# what any cell that relies on that split evaluates when re-run afterwards.
for train_index, test_index in folds.split(X, y):
    X_fold_train, X_fold_test = X[train_index], X[test_index]
    y_fold_train, y_fold_test = y[train_index], y[test_index]

    scores_logistic.append(get_score(
        LogisticRegression(solver='liblinear', multi_class='ovr'),
        X_fold_train, X_fold_test, y_fold_train, y_fold_test))
    scores_svm.append(get_score(
        SVC(gamma='auto'),
        X_fold_train, X_fold_test, y_fold_train, y_fold_test))
    # Seeded so the per-fold forest scores are repeatable between runs.
    scores_rf.append(get_score(
        RandomForestClassifier(n_estimators=40, random_state=0),
        X_fold_train, X_fold_test, y_fold_train, y_fold_test))

print("Logistic Reg Accuracy: ", scores_logistic)
print("SVM Accuracy: ", scores_svm)
print("RF Accuracy: ", scores_rf)

Logistic Reg Accuracy:  [0.8151789622546214, 0.8417455133169467, 0.8295844838036232]
SVN Accuracy:  [0.8182767096535237, 0.8043368463584633, 0.8088086739847801]
RF Accuracy:  [0.8384120677463888, 0.8626889794269167, 0.8474644757222709]


<h2 style='color:purple'>cross_val_score function</h2>

In [7]:
from sklearn.model_selection import cross_val_score

**Logistic regression, SVM, Random forest using cross val score**

In [None]:
# The fold count is defined once and interpolated into the labels so the printed
# text always matches the value actually passed as `cv`.
cv_folds = 5
print(f"Logistic regression accuracy for {cv_folds} folds:",
      cross_val_score(LogisticRegression(solver='liblinear', multi_class='ovr'), X, y, cv=cv_folds))
# Seeded so the forest's per-fold scores are repeatable between runs.
print(f"Random Forest accuracy for {cv_folds} folds:",
      cross_val_score(RandomForestClassifier(n_estimators=40, random_state=0), X, y, cv=cv_folds))
print(f"SVM accuracy for {cv_folds} folds:",
      cross_val_score(SVC(gamma='auto'), X, y, cv=cv_folds))


Logistic regression Accurecy for 3 fold: [0.82435466 0.82675795 0.83455862 0.84022672 0.83495146]
Random Forest Performance Accurecy for 3 fold: [0.85763187 0.84090016 0.84696111 0.85403221 0.84005836]


**AVG SCORE**

In [None]:

Kfolds = 5
l = list(cross_val_score(LogisticRegression(solver='liblinear', multi_class='ovr'), X, y, cv=Kfolds))
# cross_val_score yields one score per fold, so dividing the sum by Kfolds gives the mean.
avg = sum(l) / Kfolds

# Report the fold count that was actually used instead of a hard-coded number.
print(f"Logistic regression average accuracy for {Kfolds} folds:", avg)

<h2 style='color:purple'>CV Parameter tuning</h2>

In [None]:
from sklearn.linear_model import LogisticRegression

# Sweep the number of CV folds from 2 up to Kfolds (inclusive) and record the
# mean logistic-regression score for each fold count.
Kfolds = 50
Kfolds_range = range(2, Kfolds + 1)
scores = {}        # fold count -> mean score
scores_list = []   # mean scores in the same order as Kfolds_range
for k in Kfolds_range:
    l = list(
        cross_val_score(
            LogisticRegression(solver='liblinear', multi_class='ovr'),
            X,
            y,
            cv=k,
        )
    )
    avg = sum(l) / k
    scores[k] = avg
    scores_list.append(avg)
    print(str(k) + "/" + str(Kfolds) + " round completed.........................AVG Accurecy: " + str(avg))

In [None]:

print("The best number of Folds:")
# `scores` maps fold count -> mean score, so the key with the largest value
# (first one on ties) is the best fold count.
print(max(scores, key=scores.get))

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

plt.figure(figsize = (25,10))
plt.plot(Kfolds_range,scores_list)
plt.xlabel('Value of K Folds')
plt.ylabel ('AVG Accuracy')

In [None]:
import pandas as pd
import plotly.express as px

# Same fold-count vs. mean-score curve, drawn as a Plotly line chart.
df = pd.DataFrame({"x": Kfolds_range, "y": scores_list})

fig = px.line(df, x="x", y="y", title="K Folds AVG Accuracy")
fig.show()

17 jun 2022------------------------------------------------

In [None]:
from sklearn.model_selection import KFold

# Three folds; shuffle and random_state are written out at their default values.
kf = KFold(n_splits=3, shuffle=False, random_state=None)
kf

In [None]:
# Print the train/test index arrays that each fold produces for a 9-item toy list.
for train_index, test_index in kf.split(list(range(1, 10))):
    print(train_index, test_index)

In [None]:
def get_score(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and return its score on the test data.

    Args:
        model: Object exposing ``fit(X, y)`` and ``score(X, y)``.
        X_train: Training features passed to ``model.fit``.
        X_test: Test features passed to ``model.score``.
        y_train: Training targets passed to ``model.fit``.
        y_test: Test targets passed to ``model.score``.

    Returns:
        Whatever ``model.score(X_test, y_test)`` returns.
    """
    model.fit(X_train, y_train)
    result = model.score(X_test, y_test)
    return result

In [None]:
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=3)

scores_logistic = []
scores_svm = []
scores_rf = []

# Runs on the notebook's own X / y arrays: no `digits` dataset is loaded anywhere
# in this notebook, so referencing it would raise NameError on a fresh kernel.
# Fold-local names keep the hold-out X_train / X_test / y_train / y_test intact.
for train_index, test_index in folds.split(X, y):
    X_fold_train, X_fold_test = X[train_index], X[test_index]
    y_fold_train, y_fold_test = y[train_index], y[test_index]

    scores_logistic.append(get_score(
        LogisticRegression(solver='liblinear', multi_class='ovr'),
        X_fold_train, X_fold_test, y_fold_train, y_fold_test))
    scores_svm.append(get_score(
        SVC(gamma='auto'),
        X_fold_train, X_fold_test, y_fold_train, y_fold_test))
    # Seeded so the per-fold forest scores are repeatable between runs.
    scores_rf.append(get_score(
        RandomForestClassifier(n_estimators=40, random_state=0),
        X_fold_train, X_fold_test, y_fold_train, y_fold_test))