In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Exercises Ensemble models

1.- Use dataset Autoprice to train several regression models. You can choose your favourites among **Bagging, Stacking and Boosting** models. Train at least 2 regressions models with the best hyperparameters (You can find the best ones using GridSearch)

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix, roc_curve, roc_auc_score
from imblearn.over_sampling import RandomOverSampler
# RepeatedStratifiedKFold for classification
# RepeatedKFold for regressio

from sklearn.model_selection import RepeatedKFold, RepeatedStratifiedKFold, GridSearchCV, cross_val_score
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble import StackingClassifier, StackingRegressor, RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn import metrics

from tabulate import tabulate

import warnings
warnings.filterwarnings("ignore")

In [6]:
autoprice = pd.read_csv('/content/drive/MyDrive/Ironhack/16octubre/autoprice.csv', sep = ';')
autoprice.head()

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,95,109.1,188.8,68.9,55.5,3062,3.78,3.15,9.5,114,5400,19,25,22.625
1,115,98.8,177.8,66.5,55.5,2425,3.39,3.39,8.6,84,4800,26,32,11.245
2,104,99.1,186.6,66.5,56.1,2758,3.54,3.07,9.3,110,5250,21,28,15.51
3,161,93.7,157.3,64.4,50.8,1918,2.97,3.23,9.4,68,5500,37,41,5.389
4,78,96.5,157.1,63.9,58.3,2024,2.92,3.41,9.2,76,6000,30,34,7.295


### Random Forest Regressor

In [7]:
X=autoprice.drop('price', axis=1)
y=autoprice['price']

In [8]:
auto_X_train, auto_X_test, auto_y_train, auto_y_test = train_test_split(X, y, train_size = 0.8, random_state = 0)
auto_X_train.head(3)

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg
211,103,94.5,170.2,63.8,53.5,2037,3.15,3.29,9.4,69,5200,31,37
500,81,95.7,169.7,63.6,59.1,2290,3.05,3.03,9.0,62,4800,27,32
497,93,106.7,187.5,70.3,54.9,3495,3.58,3.64,21.5,123,4350,22,25


In [9]:
RF_Reg = RandomForestRegressor()

grid = dict()
grid['n_estimators'] = [10, 50] # number of trees
grid['criterion'] = ['squared_error','absolute_error']



# define the evaluation procedure
cv = RepeatedKFold(n_splits = 5, random_state = 1)

# define the grid search procedure
grid_search = GridSearchCV(estimator = RF_Reg, param_grid = grid, n_jobs = -1, cv = cv, scoring = 'neg_mean_absolute_error')

# execute the grid search
grid_result = grid_search.fit(auto_X_train, auto_y_train)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# summarize all scores that were evaluated
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: -0.395324 using {'criterion': 'squared_error', 'n_estimators': 50}
-0.414971 (0.105279) with: {'criterion': 'squared_error', 'n_estimators': 10}
-0.395324 (0.091589) with: {'criterion': 'squared_error', 'n_estimators': 50}
-0.430228 (0.097740) with: {'criterion': 'absolute_error', 'n_estimators': 10}
-0.416322 (0.094716) with: {'criterion': 'absolute_error', 'n_estimators': 50}


In [10]:
RF_Reg = RandomForestRegressor(criterion='squared_error', n_estimators = 50)
RF_Reg.fit(auto_X_train, auto_y_train)

In [11]:
auto_X_test['RF_Reg'] = RF_Reg.predict(auto_X_test)
auto_X_train['RF_Reg'] = RF_Reg.predict(auto_X_train)

In [12]:
print("MAE: ", metrics.mean_absolute_error(auto_y_train, auto_X_train['RF_Reg']).round(4))
print("MSE: ", metrics.mean_squared_error(auto_y_train, auto_X_train['RF_Reg']).round(4))
print("RMSE: ", np.sqrt(metrics.mean_squared_error(auto_y_train, auto_X_train['RF_Reg'])).round(4))
print("MAPE: ", metrics.mean_absolute_percentage_error(auto_y_train, auto_X_train['RF_Reg']).round(4))
print("R2: ", metrics.r2_score(auto_y_train, auto_X_train['RF_Reg']).round(4))

MAE:  0.1379
MSE:  0.1211
RMSE:  0.348
MAPE:  0.0105
R2:  0.9962


In [13]:
print("MAE: ", metrics.mean_absolute_error(auto_y_test, auto_X_test['RF_Reg']).round(4))
print("MSE: ", metrics.mean_squared_error(auto_y_test, auto_X_test['RF_Reg']).round(4))
print("RMSE: ", np.sqrt(metrics.mean_squared_error(auto_y_test, auto_X_test['RF_Reg'])).round(4))
print("MAPE: ", metrics.mean_absolute_percentage_error(auto_y_test, auto_X_test['RF_Reg']).round(4))
print("R2: ", metrics.r2_score(auto_y_test, auto_X_test['RF_Reg']).round(4))

MAE:  0.2525
MSE:  0.345
RMSE:  0.5874
MAPE:  0.0206
R2:  0.9901


### Stacking Regression

In [14]:
level0 = list()
level0.append(('lr', LinearRegression()))
level0.append(('RF', RandomForestRegressor()))
level0.append(('svr', SVR()))

level1 = LinearRegression()

# define the stacking ensemble
St_reg = StackingRegressor(estimators = level0, final_estimator = level1)

# fit the model on all available data
St_reg.fit(auto_X_train, auto_y_train)

In [15]:
auto_X_test['St_reg'] = St_reg.predict(auto_X_test)
auto_X_train['St_reg'] = St_reg.predict(auto_X_train)

In [16]:
print("MAE: ", metrics.mean_absolute_error(auto_y_train, auto_X_train['St_reg']).round(4))
print("MSE: ", metrics.mean_squared_error(auto_y_train, auto_X_train['St_reg']).round(4))
print("RMSE: ", np.sqrt(metrics.mean_squared_error(auto_y_train, auto_X_train['St_reg'])).round(4))
print("MAPE: ", metrics.mean_absolute_percentage_error(auto_y_train, auto_X_train['St_reg']).round(4))
print("R2: ", metrics.r2_score(auto_y_train, auto_X_train['St_reg']).round(4))

MAE:  0.1849
MSE:  0.1205
RMSE:  0.3472
MAPE:  0.0162
R2:  0.9962


In [17]:
print("MAE: ", metrics.mean_absolute_error(auto_y_test, auto_X_test['St_reg']).round(4))
print("MSE: ", metrics.mean_squared_error(auto_y_test, auto_X_test['St_reg']).round(4))
print("RMSE: ", np.sqrt(metrics.mean_squared_error(auto_y_test, auto_X_test['St_reg'])).round(4))
print("MAPE: ", metrics.mean_absolute_percentage_error(auto_y_test, auto_X_test['St_reg']).round(4))
print("R2: ", metrics.r2_score(auto_y_test, auto_X_test['St_reg']).round(4))

MAE:  0.2918
MSE:  0.3731
RMSE:  0.6108
MAPE:  0.0254
R2:  0.9892


2.- Use Breast-cancer dataset to train several Classification models. You can choose your favourites among Bagging, Stacking and Boosting models. Train at least 2 classification models with the best hyperparameters (You can find the best ones using GridSearch)

In [18]:
breast_cancer = pd.read_csv('/content/drive/MyDrive/Ironhack/16octubre/breast-cancer.csv', sep = ';')
breast_cancer.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [19]:
breast_cancer['diagnosis'].value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

### Stacking Classifier

In [20]:
breast_cancer.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

In [21]:
X=breast_cancer.drop('diagnosis', axis=1)
y=breast_cancer['diagnosis']

In [22]:
cancer_X_train, cancer_X_test, cancer_y_train, cancer_y_test = train_test_split(X, y, train_size = 0.8, random_state = 0)
cancer_X_train.head(3)

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
338,897880,10.05,17.53,64.41,310.8,0.1007,0.07326,0.02511,0.01775,0.189,...,11.16,26.84,71.98,384.0,0.1402,0.1402,0.1055,0.06499,0.2894,0.07664
427,90745,10.8,21.98,68.79,359.9,0.08801,0.05743,0.03614,0.01404,0.2016,...,12.76,32.04,83.69,489.5,0.1303,0.1696,0.1927,0.07485,0.2965,0.07662
406,905189,16.14,14.86,104.3,800.0,0.09495,0.08501,0.055,0.04528,0.1735,...,17.71,19.58,115.9,947.9,0.1206,0.1722,0.231,0.1129,0.2778,0.07012


In [23]:
level0 = list()
level0.append(('lr', LogisticRegression()))
level0.append(('RF', RandomForestClassifier()))
level0.append(('svc', SVC()))

level1 = LogisticRegression()

# define the stacking ensemble
St_Cl = StackingClassifier(estimators = level0, final_estimator = level1)

# fit the model on all available data
St_Cl.fit(cancer_X_train, cancer_y_train)

In [24]:
print("Train set score (Accuracy) =", St_Cl.score(cancer_X_train, cancer_y_train).round(4))
print("Test set score (Accuracy) =", St_Cl.score(cancer_X_test, cancer_y_test).round(4))

conf_mat = confusion_matrix(cancer_y_test, St_Cl.predict(cancer_X_test))
print(tabulate(conf_mat,headers = ['pred diagnosis B','pred diagnosis M'], showindex = ['real diagnosis B','real diagnosis M'],
               tablefmt = 'fancy_grid'))

print(classification_report(cancer_y_test, St_Cl.predict(cancer_X_test)))

Train set score (Accuracy) = 1.0
Test set score (Accuracy) = 0.9561
╒══════════════════╤════════════════════╤════════════════════╕
│                  │   pred diagnosis B │   pred diagnosis M │
╞══════════════════╪════════════════════╪════════════════════╡
│ real diagnosis B │                 64 │                  3 │
├──────────────────┼────────────────────┼────────────────────┤
│ real diagnosis M │                  2 │                 45 │
╘══════════════════╧════════════════════╧════════════════════╛
              precision    recall  f1-score   support

           B       0.97      0.96      0.96        67
           M       0.94      0.96      0.95        47

    accuracy                           0.96       114
   macro avg       0.95      0.96      0.95       114
weighted avg       0.96      0.96      0.96       114



### XGBoost Classifier

In [25]:
XG_cl = GradientBoostingClassifier()

# define the grid of values to search
grid = dict()
grid['n_estimators'] = [50, 100]
grid['learning_rate'] = [0.01, 0.1, 1.0]
grid['max_depth'] = [3, 5, 8]

# define the evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

# define the grid search procedure
grid_search = GridSearchCV(estimator=XG_cl, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy')
# execute the grid search

grid_result = grid_search.fit(cancer_X_train, cancer_y_train)

# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# summarize all scores that were evaluated
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.956044 using {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
0.938462 (0.020382) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
0.942125 (0.019019) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
0.928205 (0.014942) with: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 50}
0.930403 (0.020721) with: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}
0.926007 (0.017704) with: {'learning_rate': 0.01, 'max_depth': 8, 'n_estimators': 50}
0.927473 (0.020382) with: {'learning_rate': 0.01, 'max_depth': 8, 'n_estimators': 100}
0.950916 (0.016479) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
0.956044 (0.016544) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
0.936264 (0.021683) with: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50}
0.930403 (0.025901) with: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
0.923077 (0.024735) with: {'learning_rate': 0.1, 'max_dept

In [26]:
XG_cl = GradientBoostingClassifier(learning_rate = 0.01, max_depth = 8, n_estimators = 100, subsample = 0.7)
XG_cl.fit(cancer_X_train, cancer_y_train)

In [27]:
print("Train set score (Accuracy) =", XG_cl.score(cancer_X_train, cancer_y_train).round(4))
print("Test set score (Accuracy) =", XG_cl.score(cancer_X_test, cancer_y_test).round(4))

conf_mat = confusion_matrix(cancer_y_test, XG_cl.predict(cancer_X_test))
print(tabulate(conf_mat,headers = ['pred Diagnosis B','pred Diagnosis M'], showindex = ['real Diagnosis B','real Diagnosis M'],
               tablefmt = 'fancy_grid'))

print(classification_report(cancer_y_test, XG_cl.predict(cancer_X_test)))

cancer_X_test['XG_cl'] = XG_cl.predict(cancer_X_test)
cancer_X_train['XG_cl'] = XG_cl.predict(cancer_X_train)

Train set score (Accuracy) = 1.0
Test set score (Accuracy) = 0.9737
╒══════════════════╤════════════════════╤════════════════════╕
│                  │   pred Diagnosis B │   pred Diagnosis M │
╞══════════════════╪════════════════════╪════════════════════╡
│ real Diagnosis B │                 66 │                  1 │
├──────────────────┼────────────────────┼────────────────────┤
│ real Diagnosis M │                  2 │                 45 │
╘══════════════════╧════════════════════╧════════════════════╛
              precision    recall  f1-score   support

           B       0.97      0.99      0.98        67
           M       0.98      0.96      0.97        47

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



## Exercises SVM_Models

3.- Using the code and the datasets of the theory notebooks (SVM_Models.ipynb) and **Grid Search**, find the best hyperparameters for the SVM and for the SVR models

In [28]:
# Data Manipulation
import pandas as pd # for data manipulation
import numpy as np # for data manipulation

# Sklearn
from sklearn.linear_model import LinearRegression # for building a linear regression model
from sklearn.svm import SVR, SVC # for building SVR model
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.model_selection import RepeatedKFold, RepeatedStratifiedKFold, GridSearchCV, cross_val_score
from sklearn.model_selection import train_test_split

# Visualizations
import plotly.graph_objects as go # for data visualization
import plotly.express as px # for data visualization

import warnings
warnings.filterwarnings("ignore")

In [29]:
df = pd.read_csv('/content/drive/MyDrive/Ironhack/16octubre/Real estate.csv', encoding='utf-8')

In [30]:
X_train, X_test, y_train, y_test = train_test_split(df[['X3 distance to the nearest MRT station']],
                                                    df['Y house price of unit area'], train_size = 0.8, random_state = 0)
X_train.head(3)

Unnamed: 0,X3 distance to the nearest MRT station
302,2288.011
20,2275.877
303,439.7105


In [31]:
model1 = LinearRegression()
lr = model1.fit(X_train, y_train)

In [32]:
SVR_model = SVR()

# define the grid of values to search
grid = dict()
grid['kernel'] = ['rbf'] # ,'poly'
grid['C'] = [0.1, 1.0, 10]
grid['epsilon'] = [0.001, 0.01, 0.1, 10]
grid['gamma'] = [0.001, 0.01, 1, 10]

# define the evaluation procedure
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

# define the grid search procedure
grid_search = GridSearchCV(estimator=SVR_model, param_grid=grid, n_jobs=-1, cv=cv, scoring='neg_mean_absolute_percentage_error')
# execute the grid search

grid_result = grid_search.fit(X_train, y_train)

# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# summarize all scores that were evaluated
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: -0.185869 using {'C': 10, 'epsilon': 0.1, 'gamma': 0.001, 'kernel': 'rbf'}
-0.354668 (0.050076) with: {'C': 0.1, 'epsilon': 0.001, 'gamma': 0.001, 'kernel': 'rbf'}
-0.360028 (0.050554) with: {'C': 0.1, 'epsilon': 0.001, 'gamma': 0.01, 'kernel': 'rbf'}
-0.364686 (0.050812) with: {'C': 0.1, 'epsilon': 0.001, 'gamma': 1, 'kernel': 'rbf'}
-0.364997 (0.050702) with: {'C': 0.1, 'epsilon': 0.001, 'gamma': 10, 'kernel': 'rbf'}
-0.354663 (0.050065) with: {'C': 0.1, 'epsilon': 0.01, 'gamma': 0.001, 'kernel': 'rbf'}
-0.360030 (0.050557) with: {'C': 0.1, 'epsilon': 0.01, 'gamma': 0.01, 'kernel': 'rbf'}
-0.364706 (0.050820) with: {'C': 0.1, 'epsilon': 0.01, 'gamma': 1, 'kernel': 'rbf'}
-0.365023 (0.050719) with: {'C': 0.1, 'epsilon': 0.01, 'gamma': 10, 'kernel': 'rbf'}
-0.354629 (0.049998) with: {'C': 0.1, 'epsilon': 0.1, 'gamma': 0.001, 'kernel': 'rbf'}
-0.360151 (0.050480) with: {'C': 0.1, 'epsilon': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}
-0.364950 (0.050932) with: {'C': 0.1, 'epsilon': 0.1, 

In [33]:
model2 = SVR(kernel='rbf', C=10, epsilon=1, gamma = 0.001) # set kernel and hyperparameters
svr = model2.fit(X_train, y_train)




In [34]:
print("MAE: ", metrics.mean_absolute_error(y_train, model2.predict(X_train)).round(4))
print("MSE: ", metrics.mean_squared_error(y_train, model2.predict(X_train)).round(4))
print("RMSE: ", np.sqrt(metrics.mean_squared_error(y_train, model2.predict(X_train))).round(4))
print("MAPE: ", metrics.mean_absolute_percentage_error(y_train, model2.predict(X_train)).round(4))
print("R2: ", metrics.r2_score(y_train, model2.predict(X_train)).round(4))

MAE:  4.6423
MSE:  62.4807
RMSE:  7.9045
MAPE:  0.1356
R2:  0.6667


In [35]:
print("MAE: ", metrics.mean_absolute_error(y_test, model2.predict(X_test)).round(4))
print("MSE: ", metrics.mean_squared_error(y_test, model2.predict(X_test)).round(4))
print("RMSE: ", np.sqrt(metrics.mean_squared_error(y_test, model2.predict(X_test))).round(4))
print("MAPE: ", metrics.mean_absolute_percentage_error(y_test, model2.predict(X_test)).round(4))
print("R2: ", metrics.r2_score(y_test, model2.predict(X_test)).round(4))

MAE:  5.6204
MSE:  62.5867
RMSE:  7.9112
MAPE:  0.1722
R2:  0.6397


In [36]:
def plot(df, model1, model2):
  # Create a scatter plot
  fig = px.scatter(df, x=df['X3 distance to the nearest MRT station'], y=df['Y house price of unit area'],
                  opacity=0.8, color_discrete_sequence=['black'])
  X = df['X3 distance to the nearest MRT station']
  # Predict y values for our set of X values
  x_range = np.linspace(X.min(), X.max(), 100)
  y_lr = model1.predict(x_range.reshape(-1, 1)) # Linear regression
  y_svr = model2.predict(x_range.reshape(-1, 1)) # SVR

  # Add a best-fit line
  fig.add_traces(go.Scatter(x=x_range, y=y_lr, name='Linear Regression', line=dict(color='limegreen')))
  fig.add_traces(go.Scatter(x=x_range, y=y_svr, name='Support Vector Regression', line=dict(color='red')))
  fig.add_traces(go.Scatter(x=x_range, y=y_svr+10, name='+epsilon', line=dict(color='red', dash='dot')))
  fig.add_traces(go.Scatter(x=x_range, y=y_svr-10, name='-epsilon', line=dict(color='red', dash='dot')))

  # Change chart background color
  fig.update_layout(dict(plot_bgcolor = 'white'))

  # Update axes lines
  fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey',
                  zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey',
                  showline=True, linewidth=1, linecolor='black')

  fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey',
                  zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey',
                  showline=True, linewidth=1, linecolor='black')

  # Set figure title
  fig.update_layout(title=dict(text="House Price Based on Distance from the Nearest MRT",
                              font=dict(color='black')))
  # Update marker size
  fig.update_traces(marker=dict(size=3))

  fig.show()

plot(df, model1, model2)

Ejercicio SVC

In [37]:
df0 = pd.read_csv('/content/drive/MyDrive/Ironhack/16octubre/games.csv', sep = ';')
df0['rating_difference']=df0['white_rating']-df0['black_rating']
df0['white_win']=df0['winner'].apply(lambda x: 1 if x=='white' else 0)
df = df0[['rating_difference', 'turns','white_win']]
df.head(3)

Unnamed: 0,rating_difference,turns,white_win
0,309,13,1
1,61,16,0
2,-4,61,1


In [38]:
X_games=df[['rating_difference', 'turns']]#
y_games=df['white_win']

In [39]:
X_train_games, X_test_games, y_train_games, y_test_games = train_test_split(X_games, y_games, test_size=0.2, random_state=0)
X_train_games.head(3)

Unnamed: 0,rating_difference,turns
17843,-678,86
5518,-42,78
7699,61,100


In [40]:
RF_Cl = SVC()

# define the grid of values to search
grid = dict()
grid['C'] = [0.1]
grid['gamma'] = [0.001, 0.1]

# define the evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

# define the grid search procedure
grid_search = GridSearchCV(estimator=RF_Cl, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy')

# execute the grid search
grid_result = grid_search.fit(X_train_games, y_train_games)

# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# summarize all scores that were evaluated
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.642756 using {'C': 0.1, 'gamma': 0.001}
0.642756 (0.006805) with: {'C': 0.1, 'gamma': 0.001}
0.504238 (0.002622) with: {'C': 0.1, 'gamma': 0.1}


In [41]:
RF_Cl = SVC(kernel='rbf', C=0.1, gamma = 0.001, probability = True)
RF_Cl.fit(X_train_games, y_train_games)

In [42]:
def Plot_3D(X, X_test, y_test, clf):

    # Specify a size of the mesh to be used
    mesh_size = 5
    margin = 1

    # # Create a mesh grid on which we will run our model
    x_min, x_max = X.iloc[:, 0].fillna(X.mean()).min() - margin, X.iloc[:, 0].fillna(X.mean()).max() + margin
    y_min, y_max = X.iloc[:, 1].fillna(X.mean()).min() - margin, X.iloc[:, 1].fillna(X.mean()).max() + margin
    xrange = np.arange(x_min, x_max, mesh_size)
    yrange = np.arange(y_min, y_max, mesh_size)
    xx, yy = np.meshgrid(xrange, yrange)

    # Calculate predictions on grid
    Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    Z = Z.reshape(xx.shape)

    # Create a 3D scatter plot with predictions
    fig = px.scatter_3d(x=X_test['rating_difference'], y=X_test['turns'], z=y_test,
                     opacity=0.8, color_discrete_sequence=['black'])

    # Set figure title and colors
    fig.update_layout(#title_text="Scatter 3D Plot with SVM Prediction Surface",
                      paper_bgcolor = 'white',
                      scene = dict(xaxis=dict(backgroundcolor='white',
                                              color='black',
                                              gridcolor='#f0f0f0'),
                                   yaxis=dict(backgroundcolor='white',
                                              color='black',
                                              gridcolor='#f0f0f0'
                                              ),
                                   zaxis=dict(backgroundcolor='lightgrey',
                                              color='black',
                                              gridcolor='#f0f0f0',
                                              )))
    # Update marker size
    fig.update_traces(marker=dict(size=1))

    # Add prediction plane
    fig.add_traces(go.Surface(x=xrange, y=yrange, z=Z, name='SVM Prediction',
                              colorscale='RdBu', showscale=False,
                              contours = {"z": {"show": True, "start": 0.2, "end": 0.8, "size": 0.05}}))
    fig.show()

In [43]:
Plot_3D(X_games, X_test_games, y_test_games, RF_Cl)