# Evaluation using K-Fold Cross Validation

In [1]:
import pandas as pd
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [2]:
filename=("/content/pima-indians-diabetes.data.csv")
names=['preg','plas','pres','skin','test','mass','pedi','age','class']
dataframe=read_csv(filename,names=names)
df=dataframe.drop(0)
array=df.values

In [3]:
df

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [4]:
y=df.iloc[:,8]
y

1      0
2      1
3      0
4      1
5      0
      ..
763    0
764    0
765    0
766    1
767    0
Name: class, Length: 767, dtype: int64

In [5]:
X=array[:,0:8]
Y=array[:,8]

In [6]:
kfold=KFold(n_splits=5)
model=LogisticRegression(max_iter=400)
results=cross_val_score(model,X,Y,cv=kfold)  # cv= no. of split as same as kfold=KFold(n_splits=5) 

In [7]:
results

array([0.77272727, 0.70779221, 0.77124183, 0.82352941, 0.77124183])

In [8]:
results.mean()*100

76.93065104829809

In [9]:
results.std()*100

3.672417459125146

# Evaluation using Leave One Out Cross Validation

In [10]:
import pandas as pd
from pandas import read_csv
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [11]:
filename=("/content/pima-indians-diabetes.data.csv")
names=['preg','plas','pres','skin','test','mass','pedi','age','class']
dataframe=read_csv(filename,names=names)
df=dataframe.drop(0)
array=df.values

In [12]:
df.shape

(767, 9)

In [13]:
X=array[:,0:8]
Y=array[:,8]

In [14]:
loocv=LeaveOneOut()  # it will consider 766 rows (as df.shape is [767, 9])leaving one out
model=LogisticRegression(max_iter=300)
result=cross_val_score(model,X,Y,cv=loocv)

In [15]:
results.mean()*100

76.93065104829809

In [16]:
result.std()*100

41.53519581934703

# Measuring Model accuracy 

In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
from pandas import read_csv
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA

filename=("/content/Airline passengers.csv")
data=read_csv(filename)

In [18]:
data

Unnamed: 0,Month,Passengers
0,1949-01,112
1,1949-02,118
2,1949-03,132
3,1949-04,129
4,1949-05,121
...,...,...
139,1960-08,606
140,1960-09,508
141,1960-10,461
142,1960-11,390


In [19]:
data['year']=pd.DatetimeIndex(data['Month']).year
data['month']=pd.DatetimeIndex(data['Month']).month

In [20]:
data

Unnamed: 0,Month,Passengers,year,month
0,1949-01,112,1949,1
1,1949-02,118,1949,2
2,1949-03,132,1949,3
3,1949-04,129,1949,4
4,1949-05,121,1949,5
...,...,...,...,...
139,1960-08,606,1960,8
140,1960-09,508,1960,9
141,1960-10,461,1960,10
142,1960-11,390,1960,11


In [21]:
x=data[[data.columns[-1]]]
y=data[['Passengers']]

In [22]:
x
y

Unnamed: 0,Passengers
0,112
1,118
2,132
3,129
4,121
...,...
139,606
140,508
141,461
142,390


In [23]:
df=data.drop(0)
array=df.values
X=array[x]
Y=array[data['month']]
df

Unnamed: 0,Month,Passengers,year,month
1,1949-02,118,1949,2
2,1949-03,132,1949,3
3,1949-04,129,1949,4
4,1949-05,121,1949,5
5,1949-06,135,1949,6
...,...,...,...,...
139,1960-08,606,1960,8
140,1960-09,508,1960,9
141,1960-10,461,1960,10
142,1960-11,390,1960,11


In [24]:
from sklearn.model_selection import train_test_split

In [25]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=20,random_state=2)

In [26]:
x_train.shape

(124, 1)

In [27]:
from sklearn.tree import DecisionTreeClassifier,plot_tree
dtc=DecisionTreeClassifier(criterion="entropy",random_state=40)
dtc.fit(x_train,y_train)


DecisionTreeClassifier(criterion='entropy', random_state=40)

In [28]:
dtc.feature_importances_

array([1.])

In [29]:
pred=dtc.predict(x_test)
pred

array([149, 148, 149, 149, 112, 348, 118, 141, 348, 141, 112, 149, 112,
       118, 348, 112, 141, 141, 118, 404])

In [30]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 143 entries, 1 to 143
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Month       143 non-null    object
 1   Passengers  143 non-null    int64 
 2   year        143 non-null    int64 
 3   month       143 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 5.6+ KB
None


In [31]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 143 entries, 1 to 143
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Month       143 non-null    object
 1   Passengers  143 non-null    int64 
 2   year        143 non-null    int64 
 3   month       143 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 5.6+ KB
None


In [32]:
from sklearn.tree import DecisionTreeRegressor

In [33]:
dtr=DecisionTreeRegressor(min_samples_split=6)

In [34]:
dtr.fit((x_train),y_train)

DecisionTreeRegressor(min_samples_split=6)

In [35]:
pred=dtr.predict(x_test)

In [36]:
from sklearn.metrics import mean_squared_error

In [37]:
mean_squared_error(y_test,pred)

19184.40926499847

In [38]:
dtr.score(x_test,y_test)

-0.052987557376400485

In [41]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,ConfusionMatrixDisplay

In [None]:
ConfusionMatrixDisplay.from_predictions(y,x)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7fb12609e280>