

# Decision Tree and Cross Validation

In [None]:
import numpy as np

In [None]:
# Draw ten toy gender labels from a seeded generator so that the sample (and
# everything derived from it further down) is identical on every
# Restart Kernel -> Run All.
rng = np.random.default_rng(0)
y = rng.choice(('Male', 'Female'), size=10)

In [None]:
# Display the sampled label array.
y

array(['Female', 'Male', 'Female', 'Female', 'Female', 'Female', 'Female',
       'Female', 'Female', 'Male'], dtype='<U6')

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encoder that maps the string labels to integers; in the printed result
# further down 'Female' is encoded as 0 and 'Male' as 1.
le = LabelEncoder()

In [None]:
# Fit the encoder on the labels and encode them in a single step.
yt = le.fit_transform(y)

In [None]:
# Show the integer-encoded labels.
print(yt)

[0 1 0 0 0 0 0 0 0 1]


In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Toy 3x3 matrix with two missing entries: row 0 / column 1 and row 1 / column 2.
data = np.array([
    [1, np.nan, 2],
    [2, 3, np.nan],
    [-1, 4, 2],
])

In [None]:
# Imputer that fills each missing value with the mean of its own column.
imp = SimpleImputer(strategy = 'mean')

In [None]:
# Learn the column means and return the matrix with the gaps filled in.
imp.fit_transform(data)

array([[ 1. ,  3.5,  2. ],
       [ 2. ,  3. ,  2. ],
       [-1. ,  4. ,  2. ]])

In [None]:
# Imputer that fills each missing value with the median of its own column.
imp = SimpleImputer(strategy = 'median')

In [None]:
# Learn the column medians and return the matrix with the gaps filled in.
# (For this toy matrix the result happens to equal the mean-imputed one.)
imp.fit_transform(data)

array([[ 1. ,  3.5,  2. ],
       [ 2. ,  3. ,  2. ],
       [-1. ,  4. ,  2. ]])

In [None]:
# Imputer that fills each missing value with the most frequent value of its column.
imp = SimpleImputer(strategy = 'most_frequent')

In [None]:
# Learn the per-column modes and return the matrix with the gaps filled in.
imp.fit_transform(data)

array([[ 1.,  3.,  2.],
       [ 2.,  3.,  2.],
       [-1.,  4.,  2.]])

In [None]:
# Four toy house listings; 'neighbourhood' is the only non-numeric field.
data = [
    dict(price=850000, rooms=4, neighbourhood='Queen Anne'),
    dict(price=700000, rooms=3, neighbourhood='Fremont'),
    dict(price=650000, rooms=3, neighbourhood='Wallingford'),
    dict(price=600000, rooms=2, neighbourhood='Fremont'),
]

In [None]:
from sklearn.feature_extraction import DictVectorizer

In [None]:
# Dense, integer-typed vectorizer for the list of listing dicts.
vec = DictVectorizer(sparse = False, dtype = int)
# One-hot encodes 'neighbourhood': one column per distinct value, holding 1
# when the listing has that value and 0 when it does not. The numeric fields
# ('price', 'rooms') are carried over unchanged as their own columns.
vec.fit_transform(data)

array([[     0,      1,      0, 850000,      4],
       [     1,      0,      0, 700000,      3],
       [     0,      0,      1, 650000,      3],
       [     1,      0,      0, 600000,      2]])

In [None]:
# sparse=True keeps the result as a sparse matrix instead of a dense array:
# printing it lists only the non-zero entries as (row, column) value triples.
vec = DictVectorizer(sparse = True, dtype = int)
d = vec.fit_transform(data)
print(d)

  (0, 1)	1
  (0, 3)	850000
  (0, 4)	4
  (1, 0)	1
  (1, 3)	700000
  (1, 4)	3
  (2, 2)	1
  (2, 3)	650000
  (2, 4)	3
  (3, 0)	1
  (3, 3)	600000
  (3, 4)	2


In [None]:
# Three tiny documents used to demonstrate bag-of-words and TF-IDF features.
sample = [
    'problem of evil evil',
    'evil queen',
    'horizon problem',
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words counts: learn the vocabulary from the documents and encode
# them in one step. `vec` stays fitted so its vocabulary can be read later.
vec = CountVectorizer()
X = vec.fit_transform(sample)
X

<3x5 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [None]:
# Print the sparse count matrix as (document, vocabulary index) count entries.
print(X)

  (0, 0)	2
  (0, 2)	1
  (0, 3)	1
  (1, 0)	1
  (1, 4)	1
  (2, 1)	1
  (2, 3)	1


In [None]:
import pandas as pd

# Dense view of the counts: one row per document, one column per vocabulary word.
pd.DataFrame(
    data=X.toarray(),
    columns=vec.get_feature_names_out(),
)

Unnamed: 0,evil,horizon,of,problem,queen
0,2,0,1,1,0
1,1,0,0,0,1
2,0,1,0,1,0


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same documents, weighted with TF-IDF instead of raw counts.
vec = TfidfVectorizer()
x = vec.fit_transform(sample)

# Dense table: one row per document, one column per vocabulary word.
pd.DataFrame(
    data=x.toarray(),
    columns=vec.get_feature_names_out(),
)

Unnamed: 0,evil,horizon,of,problem,queen
0,0.771006,0.0,0.50689,0.385503,0.0
1,0.605349,0.0,0.0,0.0,0.795961
2,0.0,0.795961,0.0,0.605349,0.0


In [None]:
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import matplotlib.pyplot as plt

# One feature with the values 1..5, reshaped into a single-column matrix
# because the transformer expects 2-D input.
x = np.array([1, 2, 3, 4, 5])
X = x.reshape(-1, 1)

# Expand the feature into [x, x^2, x^3]; include_bias=False leaves out the
# constant column.
poly = PolynomialFeatures(degree=3, include_bias=False)
x2 = poly.fit_transform(X)

print(x)
print(x2)

[1 2 3 4 5]
[[  1.   1.   1.]
 [  2.   4.   8.]
 [  3.   9.  27.]
 [  4.  16.  64.]
 [  5.  25. 125.]]


In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from numpy import nan

# Five samples with three features each; two entries are missing.
x = np.array([
    [nan, 0, 3],
    [3, 7, 9],
    [3, 5, 2],
    [4, nan, 6],
    [8, 8, 1],
])
y = np.array([14, 16, -1, 8, -5])

# Chain all preprocessing in front of the estimator so a single fit/predict
# call runs: mean imputation -> degree-2 polynomial expansion -> linear regression.
pipeline_steps = (
    SimpleImputer(strategy='mean'),
    PolynomialFeatures(degree=2),
    LinearRegression(),
)
model = make_pipeline(*pipeline_steps)

In [None]:
# Fit every pipeline step (imputer, polynomial expansion, regression) on the toy data.
model.fit(x,y)

In [None]:
# Sanity check: (number of samples, number of features).
x.shape

(5, 3)

In [None]:
# Predict for the second training row; the slice x[1:2] keeps the input 2-D.
print(model.predict(x[1:2]))

[16.]


In [None]:
!pip install mlxtend



In [None]:
# Estimate the bias and variance of a linear regression model.
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from mlxtend.evaluate import bias_variance_decomp

# Housing data is read straight from the URL; the file has no header row.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv"
dataframe = read_csv(url, header=None)

# Inputs are every column except the last one; the last column is the output.
data = dataframe.values
x = data[:, :-1]
y = data[:, -1]

# Hold out 33% of the rows for testing, with a fixed seed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=1,
)

# Model under evaluation.
model = LinearRegression()

# Decompose the squared-error loss over 200 rounds with a fixed seed.
mse, bias, var = bias_variance_decomp(
    model,
    x_train,
    y_train,
    x_test,
    y_test,
    loss="mse",
    num_rounds=200,
    random_seed=1,
)

In [None]:
# Summarize the three values returned by bias_variance_decomp, to 3 decimals.
# In the recorded output, Bias + Variance equals MSE (20.744 + 1.674 = 22.418).
print("MSE: %.3F" % mse)
print("Bias: %.3F" % bias)
print("Variance: %.3F" % var)
# NOTE(review): with loss="mse" the "Bias" figure is presumably the squared-bias
# term of the mean squared error - confirm against the mlxtend documentation.

MSE: 22.418
Bias: 20.744
Variance: 1.674


In [None]:
!pip install imblearn



In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier

# Load the lung-capacity spreadsheet.
# NOTE(review): hard-coded absolute local path - it only resolves on the
# original author's machine; point it at your own copy of the file.
df=pd.read_excel("C://Users//computer//Downloads//L.xlsx")

# Column groups of the raw sheet (kept for reference; not used below).
numerical_features=['LungCap(cc)', 'Age(years)', 'Height(inches)']
categorical_features=['Smoke','Gender']

# The decision tree doesn't work with categorical (string) data, so one-hot
# encode every non-numeric column; drop_first avoids a redundant dummy column.
dummy = pd.get_dummies(df, drop_first=True)

# Target: the 'Risk' column, kept as a one-column DataFrame.
Y= dummy[['Risk']]

# Features: every encoded column except the target. Previously `X` was never
# assigned in this cell, so the split silently depended on whatever `X` an
# earlier cell had left in the kernel (a 5x1 toy array), which cannot be
# split together with `Y`.
# NOTE(review): assumes all remaining columns are valid predictors - confirm.
X = dummy.drop(columns=['Risk'])
assert len(X) == len(Y), "features and target must have the same number of rows"

# Hold out 25% of the rows for testing, with a fixed seed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.25,random_state=0)
assert len(x_train) == len(y_train) and len(x_test) == len(y_test)

# Oversampler used in a later cell to balance the training classes.
sm = SMOTE(random_state=42)

# Note: despite its name, `regressor` is a classifier; the name is kept
# because later cells refer to it.
regressor = DecisionTreeClassifier()
regressor.fit(x_train, y_train)  #training of classifier

In [None]:
# Display the target column.
Y

Unnamed: 0,Risk
0,0
1,0
2,1
3,0
4,0
...,...
720,0
721,1
722,0
723,0


In [None]:
# Oversample the training split only, so the test split stays untouched.
x_train_new, y_train_new = sm.fit_resample(x_train, y_train)
# Class counts of the ORIGINAL training labels (before resampling), for comparison.
y_train.value_counts()

Risk
0       416
1       127
dtype: int64

In [None]:
# Class counts after resampling: both classes now have the same number of rows.
y_train_new.value_counts()

Risk
0       416
1       416
dtype: int64

In [None]:
from sklearn.model_selection import KFold
# Splitter that partitions the sample indices into 3 folds.
kf = KFold(n_splits=3)

In [None]:
# Nine dummy samples: print the (train indices, test indices) pair of every
# fold to show how the splitter partitions them.
data = list(range(1, 10))
for train, test in kf.split(data):
    print(train, test)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


In [None]:
from  sklearn.model_selection import cross_val_score

In [None]:
# 5-fold cross-validation of the decision tree on the original (not
# SMOTE-resampled) training split; yields one score per fold.
# NOTE(review): the original author reports that raising cv increases the
# score up to about 50, after which it plateaus and then declines again. That
# is an empirical observation on this dataset, not a general rule - verify.
accuracies = cross_val_score(estimator = regressor, X=x_train, y=y_train, cv=5)

In [None]:
# Display the per-fold scores.
accuracies

array([0.57798165, 0.65137615, 0.68807339, 0.69444444, 0.67592593])