In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as

## End 2 end Scikit-Learn Workflow

In [2]:
# 1. Get the data
heart_disease = pd.read_csv("https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/heart-disease.csv")
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [3]:
# Create Feature matrix (X)
X = heart_disease.drop("target", axis=1)

# create y target
y = heart_disease["target"]

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train,y_test = train_test_split(X,y, test_size=0.2)

In [5]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((242, 13), (242,), (61, 13), (61,))

In [6]:
len(heart_disease)

303

In [7]:
model = RandomForestClassifier()

In [8]:
model.fit(X_train,y_train)

In [9]:
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [10]:
model_prediction = model.predict(X_test)

In [11]:
model_prediction

array([1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1], dtype=int64)

In [12]:
model.score(X_train, y_train)

1.0

In [13]:
model.score(X_test,y_test)

0.819672131147541

### Experiment to improve (hyperparameter tuning)

In [14]:
np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    model = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f"Model accruacy on test set: {model.score(X_test, y_test)}")
    print("")

Trying model with 10 estimators...
Model accruacy on test set: 0.7540983606557377

Trying model with 20 estimators...
Model accruacy on test set: 0.7868852459016393

Trying model with 30 estimators...
Model accruacy on test set: 0.8032786885245902

Trying model with 40 estimators...
Model accruacy on test set: 0.8032786885245902

Trying model with 50 estimators...
Model accruacy on test set: 0.7704918032786885

Trying model with 60 estimators...
Model accruacy on test set: 0.8032786885245902

Trying model with 70 estimators...
Model accruacy on test set: 0.8360655737704918

Trying model with 80 estimators...
Model accruacy on test set: 0.8032786885245902

Trying model with 90 estimators...
Model accruacy on test set: 0.7868852459016393



In [15]:
# using cross-validation

from sklearn.model_selection import cross_val_score

np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    model = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f"Model accruacy on test set: {model.score(X_test, y_test)}")
    print(f"Cross-validation score: {np.mean(cross_val_score(model, X, y, cv=5)) * 100}%")
    print("")

Trying model with 10 estimators...
Model accruacy on test set: 0.7540983606557377
Cross-validation score: 78.53551912568305%

Trying model with 20 estimators...
Model accruacy on test set: 0.7868852459016393
Cross-validation score: 79.84699453551912%

Trying model with 30 estimators...
Model accruacy on test set: 0.8032786885245902
Cross-validation score: 80.50819672131148%

Trying model with 40 estimators...
Model accruacy on test set: 0.7868852459016393
Cross-validation score: 82.15300546448088%

Trying model with 50 estimators...
Model accruacy on test set: 0.819672131147541
Cross-validation score: 81.1639344262295%

Trying model with 60 estimators...
Model accruacy on test set: 0.7868852459016393
Cross-validation score: 83.47540983606557%

Trying model with 70 estimators...
Model accruacy on test set: 0.7868852459016393
Cross-validation score: 81.83060109289617%

Trying model with 80 estimators...
Model accruacy on test set: 0.8032786885245902
Cross-validation score: 82.81420765027

## Saving model for later use

In [16]:
import pickle

# Save trained model to file
pickle.dump(model, open("random_forest_model_1.pkl", "wb"))

In [17]:
# Load a saved model and make a prediction on a single example
loaded_model = pickle.load(open("random_forest_model_1.pkl", "rb"))

In [18]:
df = pd.read_csv("https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/car-sales-extended-missing-data.csv")
df.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [19]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(df, test_size=0.2)

In [20]:
test_data.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
90,Nissan,White,197844.0,4.0,5776.0
376,Nissan,White,64362.0,4.0,28993.0
788,Toyota,White,32748.0,4.0,30323.0
165,Nissan,Blue,211249.0,4.0,15767.0
336,Toyota,Green,165225.0,4.0,10841.0


## Data cleaning
### check for missing value

In [21]:
df.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [22]:
train_data["Make"].fillna("missing", inplace=True)
train_data["Colour"].fillna("missing", inplace=True)
train_data["Odometer (KM)"].fillna(df["Odometer (KM)"].median(), inplace=True)
train_data["Colour"].fillna("missing", inplace=True)

In [23]:
train_data_label = train_data["Price"]
train_data_feature = train_data.drop("Price", axis=1)

In [24]:
train_data.dropna(inplace=True)

In [25]:
train_data.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           951 non-null    object 
 1   Colour         950 non-null    object 
 2   Odometer (KM)  950 non-null    float64
 3   Doors          950 non-null    float64
 4   Price          950 non-null    float64
dtypes: float64(3), object(2)
memory usage: 39.2+ KB


In [27]:
cat_data = ["Make", "Colour","Doors"]

In [28]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


test_transformer = ColumnTransformer(
    [
     ("cat", OneHotEncoder(), cat_data)   
    ]
)
transformed_data=test_transformer.fit_transform(train_data_feature).toarray()

In [29]:
pd.DataFrame(transformed_data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
796,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
797,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
798,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [30]:
train_data_label.isna().sum()

39

In [31]:
# from sklearn.tree import DecisionTreeRegressor

# model = DecisionTreeRegressor()
# model.fit(transformed_data, train_data_label)

In [32]:
test_data_ft = test_data.drop("Price", axis=1)
test_data_label = test_data["Price"]

In [33]:
car_sales_missing_data = pd.read_csv("https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/car-sales-extended-missing-data.csv")
car_sales_missing_data.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [34]:
car_sales_missing_data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

### Handling Missing value with Scikit-learn

In [35]:
# drop row with missing price value
car_sales_missing_data.dropna(subset=["Price"], inplace=True)

In [36]:
car_sales_missing_data.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [37]:
car_sales_missing_data["Doors"].mean()

4.007751937984496

In [38]:
car_sales_missing_data.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [39]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

car_sales_missing_data.dropna(subset=["Price"], inplace=True)
train_set, test_set = train_test_split(car_sales_missing_data, random_state=42)

In [40]:
car_sales_missing_data.isna().sum()


Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [41]:
test_set

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
203,Toyota,Blue,99761.0,4.0,10547.0
979,Toyota,Black,17975.0,4.0,17940.0
729,Honda,Blue,197664.0,4.0,12950.0
838,Nissan,Green,235589.0,4.0,5905.0
919,Honda,Black,231659.0,4.0,9826.0
...,...,...,...,...,...
334,Honda,Blue,247869.0,4.0,11982.0
567,Toyota,White,122266.0,4.0,17664.0
29,Toyota,White,112004.0,4.0,13586.0
55,BMW,White,79937.0,5.0,48686.0


In [42]:
X = train_set.drop("Price", axis=1)
y = train_set["Price"].copy()
X_test = test_set.drop("Price", axis=1)
y_test = test_set["Price"].copy()

In [43]:
y_test

203    10547.0
979    17940.0
729    12950.0
838     5905.0
919     9826.0
        ...   
334    11982.0
567    17664.0
29     13586.0
55     48686.0
663    10241.0
Name: Price, Length: 238, dtype: float64

In [44]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="median")
num_imputer = SimpleImputer(strategy="median")


transf = ColumnTransformer([
    ("cat_imputer", cat_imputer, ["Make","Colour"]),
     ("odometer_imputer", num_imputer, ["Odometer (KM)"]),
    ("door_imputer", door_imputer,["Doors"] ), 
])


train_filled_X = transf.fit_transform(X)
test_filled_x = transf.transform(X_test)

In [45]:
test_filled_x

array([['Toyota', 'Blue', 99761.0, 4.0],
       ['Toyota', 'Black', 17975.0, 4.0],
       ['Honda', 'Blue', 197664.0, 4.0],
       ['Nissan', 'Green', 235589.0, 4.0],
       ['Honda', 'Black', 231659.0, 4.0],
       ['Toyota', 'Blue', 247601.0, 4.0],
       ['Toyota', 'Green', 110078.0, 4.0],
       ['missing', 'White', 155383.0, 4.0],
       ['Nissan', 'White', 26634.0, 4.0],
       ['Honda', 'White', 132055.0, 4.0],
       ['Honda', 'Green', 238825.0, 4.0],
       ['Honda', 'Green', 37606.0, 4.0],
       ['Toyota', 'Blue', 230908.0, 4.0],
       ['Toyota', 'Red', 159925.0, 4.0],
       ['Toyota', 'Blue', 181466.0, 4.0],
       ['Toyota', 'Blue', 140465.0, 4.0],
       ['Toyota', 'White', 146307.0, 4.0],
       ['Toyota', 'Green', 214179.0, 4.0],
       ['Honda', 'White', 184869.0, 4.0],
       ['Toyota', 'Black', 224986.0, 4.0],
       ['Nissan', 'White', 176135.0, 3.0],
       ['Nissan', 'Red', 132055.0, 4.0],
       ['Toyota', 'Blue', 112223.0, 4.0],
       ['BMW', 'missing', 20644

In [46]:
X_test

Unnamed: 0,Make,Colour,Odometer (KM),Doors
203,Toyota,Blue,99761.0,4.0
979,Toyota,Black,17975.0,4.0
729,Honda,Blue,197664.0,4.0
838,Nissan,Green,235589.0,4.0
919,Honda,Black,231659.0,4.0
...,...,...,...,...
334,Honda,Blue,247869.0,4.0
567,Toyota,White,122266.0,4.0
29,Toyota,White,112004.0,4.0
55,BMW,White,79937.0,5.0


In [47]:
car_sales_filled_train = pd.DataFrame(train_filled_X, 
                                      columns=X.columns)

car_sales_filled_test = pd.DataFrame(test_filled_x, 
                                     columns=X_test.columns)

# Check missing data in training set
car_sales_filled_train.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
dtype: int64

In [48]:
# convert the catogical data to numrical features

from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
# split data into X,Y

cat_transf = ColumnTransformer([
    ("cat", OneHotEncoder(), ["Make","Colour","Doors"])
], remainder="passthrough")

In [49]:
transformed_X_train = cat_transf.fit_transform(car_sales_filled_train)
transformed_X_test = cat_transf.transform(car_sales_filled_test)

# Check transformed and filled X_train
transformed_X_train.toarray()

array([[0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.65225e+05],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 1.43204e+05],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.78899e+05],
       ...,
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.96225e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.33117e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.50582e+05]])

In [50]:
y_train

280    0
104    1
177    0
263    0
176    0
      ..
295    0
32     1
103    1
290    0
225    0
Name: target, Length: 242, dtype: int64

In [51]:
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model2 = DecisionTreeRegressor()

# Make sure to use transformed (filled and one-hot encoded X data)
model.fit(transformed_X_train, y)

model.score(transformed_X_test, y_test)

0.21393465108824272

In [52]:
model2.fit(transformed_X_train,y )

In [53]:
model2.score(transformed_X_test,y_test)

-0.2133613265026375

### choosing the right Estimator/ ML Algorithm 

In [54]:
# using boston data
from sklearn.datasets import load_boston

ImportError: 
`load_boston` has been removed from scikit-learn since version 1.2.

The Boston housing prices dataset has an ethical problem: as
investigated in [1], the authors of this dataset engineered a
non-invertible variable "B" assuming that racial self-segregation had a
positive impact on house prices [2]. Furthermore the goal of the
research that led to the creation of this dataset was to study the
impact of air quality but it did not give adequate demonstration of the
validity of this assumption.

The scikit-learn maintainers therefore strongly discourage the use of
this dataset unless the purpose of the code is to study and educate
about ethical issues in data science and machine learning.

In this special case, you can fetch the dataset from the original
source::

    import pandas as pd
    import numpy as np

    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]

Alternative datasets include the California housing dataset and the
Ames housing dataset. You can load the datasets as follows::

    from sklearn.datasets import fetch_california_housing
    housing = fetch_california_housing()

for the California housing dataset and::

    from sklearn.datasets import fetch_openml
    housing = fetch_openml(name="house_prices", as_frame=True)

for the Ames housing dataset.

[1] M Carlisle.
"Racist data destruction?"
<https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>

[2] Harrison Jr, David, and Daniel L. Rubinfeld.
"Hedonic housing prices and the demand for clean air."
Journal of environmental economics and management 5.1 (1978): 81-102.
<https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>


In [55]:
from sklearn.datasets import fetch_california_housing
import pandas as  pd
import numpy as np

housing = fetch_california_housing()


In [56]:
type(housing)

sklearn.utils._bunch.Bunch

In [57]:
print(housing.data.shape, housing.target.shape)

(20640, 8) (20640,)


In [58]:
x = housing.data

In [59]:
X = pd.DataFrame(x)
X.columns = housing.feature_names
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [60]:
y_without_label = housing.target
# y_without_label.shape
y = pd.DataFrame(y_without_label)
y.columns = housing.target_names
y

Unnamed: 0,MedHouseVal
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422
...,...
20635,0.781
20636,0.771
20637,0.923
20638,0.847


In [61]:
housing_df = fetch_california_housing(as_frame=True)
housing_df

{'data':        MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
 0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
 1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
 2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
 3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
 4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
 ...       ...       ...       ...        ...         ...       ...       ...   
 20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48   
 20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49   
 20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43   
 20638  1.8672      18.0  5.329513   1.171920       741.0  2.123209     39.43   
 20639  2.3886      16.0  5.254717   1.162264      1387.0  2.616981     39.37   
 
        Longitude 

In [62]:
housing_X = pd.DataFrame(housing_df["data"])
housing_X

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [63]:
housing_y = pd.DataFrame(housing_df["target"])
housing_y

Unnamed: 0,MedHouseVal
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422
...,...
20635,0.781
20636,0.771
20637,0.923
20638,0.847


In [64]:
housing = housing_X.copy()

In [65]:
housing["target"] = housing_y

In [66]:
housing.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [67]:
housing.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [68]:
from sklearn.model_selection import train_test_split

train_data,test_data = train_test_split(housing,test_size=0.2,random_state=42)

In [69]:
from sklearn.linear_model import SGDRegressor, Ridge

train_features = train_data.drop("target", axis=1)
train_label = train_data["target"]


test_X = test_data.drop("target",axis=1)
test_y = test_data["target"]

sgd_model = SGDRegressor(random_state=42)
sgd_model.fit(train_features, train_label)

In [70]:
sgd_model.score(test_X,test_y)

-6.40655841595438e+28

In [71]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error



In [72]:
cross_score = cross_val_score(sgd_model,housing_X,housing_y.squeeze(), cv=10)

In [73]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [74]:
display_scores(cross_score)

Scores: [-7.34069305e+29 -1.38231335e+29 -5.48610677e+30 -1.41687036e+30
 -8.24658785e+29 -3.66825793e+28 -2.21504787e+30 -8.80971620e+30
 -3.39643802e+29 -2.03968283e+29]
Mean: -2.0204995290788495e+30
Standard deviation: 2.7421629528636194e+30


In [75]:
ridge_model = Ridge()
ridge_model.fit(train_features, train_label)

In [76]:
ridge_model.score(test_X, test_y)

0.5758549611440125

In [77]:
# using ensemble model
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor()
rf_model.fit(train_features, train_label)

In [78]:
rf_score = cross_val_score(rf_model,train_features, train_label, cv=10)
display_scores(rf_score)

Scores: [0.82678498 0.80314985 0.80901145 0.7815363  0.79582143 0.8293117
 0.81716799 0.79739619 0.81497474 0.80322936]
Mean: 0.8078383976113995
Standard deviation: 0.013956695901808681


In [81]:
rf_model.score(test_X, test_y)

0.8047552475125127

### Classification Problem

In [82]:
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [83]:
# Using linear SVC
ht_train_data,ht_test_data = train_test_split(heart_disease, random_state=42, test_size=0.2)

In [84]:
ht_train_data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0
mean,54.46281,0.657025,0.991736,130.359504,246.842975,0.128099,0.553719,150.115702,0.31405,1.013223,1.421488,0.681818,2.301653,0.549587
std,9.204492,0.475687,1.022533,16.828858,52.795465,0.334893,0.53041,22.352398,0.465098,1.102577,0.607724,0.99062,0.593811,0.498566
min,29.0,0.0,0.0,94.0,131.0,0.0,0.0,88.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,212.0,0.0,0.0,136.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.5,1.0,1.0,130.0,239.5,0.0,1.0,154.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.75,0.0,1.0,165.75,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,192.0,564.0,1.0,2.0,202.0,1.0,5.6,2.0,4.0,3.0,1.0


In [88]:
corrleation =heart_disease.corr()
corrleation["target"].sort_values(ascending=False)

target      1.000000
cp          0.433798
thalach     0.421741
slope       0.345877
restecg     0.137230
fbs        -0.028046
chol       -0.085239
trestbps   -0.144931
age        -0.225439
sex        -0.280937
thal       -0.344029
ca         -0.391724
oldpeak    -0.430696
exang      -0.436757
Name: target, dtype: float64

In [105]:
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
ht_X = ht_train_data.drop("target", axis=1)
ht_y = ht_train_data['target']


pipe = Pipeline([
    ("scaler", StandardScaler()),("lin_svc",LinearSVC(random_state=42,max_iter=5000) )
])

pipe.fit(ht_X, ht_y)

# lin_svc_model = LinearSVC(random_state=42,)
# lin_svc_model.fit(ht_X, ht_y)

In [106]:
ht_test_X = ht_test_data.drop("target", axis=1)
ht_test_y = ht_test_data["target"]
pipe.score(ht_test_X, ht_test_y)

0.8688524590163934

In [107]:
pipe_parameters = pipe.get_params()
pipe_parameters

{'memory': None,
 'steps': [('scaler', StandardScaler()),
  ('lin_svc', LinearSVC(max_iter=5000, random_state=42))],
 'verbose': False,
 'scaler': StandardScaler(),
 'lin_svc': LinearSVC(max_iter=5000, random_state=42),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'lin_svc__C': 1.0,
 'lin_svc__class_weight': None,
 'lin_svc__dual': True,
 'lin_svc__fit_intercept': True,
 'lin_svc__intercept_scaling': 1,
 'lin_svc__loss': 'squared_hinge',
 'lin_svc__max_iter': 5000,
 'lin_svc__multi_class': 'ovr',
 'lin_svc__penalty': 'l2',
 'lin_svc__random_state': 42,
 'lin_svc__tol': 0.0001,
 'lin_svc__verbose': 0}

In [108]:
lin_svc_model = pipe_parameters["lin_svc"]

In [110]:
# using cross-validation for classifier 
svc_score = cross_val_score(pipe, ht_X, ht_y, cv=10)
display_scores(svc_score)

Scores: [0.72       0.76       0.875      0.875      0.75       0.75
 0.79166667 0.875      0.70833333 0.875     ]
Mean: 0.7979999999999999
Standard deviation: 0.06628138669507617


In [111]:
# using Random classifier
from sklearn.ensemble import RandomForestClassifier


rand_class_model = RandomForestClassifier()
rand_class_model.fit(ht_X, ht_y)
rand_class_model.score(ht_test_X, ht_test_y)

0.8360655737704918

* Categorical - mode and countplot
* Numeric - mean, median and histogram
* Numeric Numeric - correlation, scatter
* Categorical Categorical - pivot
* Categorical Numeric - violin plot 

## Fitting the Model/Algorithm