# Load & Split

In [None]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Importing the dataset
dataset = pd.read_csv('dataset.csv')
# Features: every column except the last one.
X = dataset.iloc[:, :-1].values
# Target: the last column. Indexing with -1 (instead of a hard-coded 1) keeps
# the target out of X when the file has more than two columns; for a
# two-column file both spellings select the same column.
y = dataset.iloc[:, -1].values

In [None]:
# sklearn.cross_validation was removed from scikit-learn (0.20);
# train_test_split lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# train_test_split returns the same container type it was given. Only pandas
# objects carry an index that needs resetting; NumPy arrays (e.g. the result
# of `.values`) have no reset_index method and are left untouched.
for _split in (X_train, X_test, y_train, y_test):
    if isinstance(_split, (pd.DataFrame, pd.Series)):
        _split.reset_index(inplace=True, drop=True)

# Preprocessing

## Boxplot all features same plot

In [None]:
# Horizontal box plots of the data in `df`, drawn on a single 10x10-inch figure.
# NOTE(review): neither `sns` (presumably seaborn) nor `df` is defined in the
# visible notebook cells — confirm both exist before this cell runs.
plt.figure(figsize=(10,10))
ax = sns.boxplot(data=df, orient="h", palette="Set2")

## Outliers ,...


## Scaling

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

def scale_features(X_train, X_test, type='standard'):
    """
    Fit a scikit-learn scaler on the training features and apply it to both splits.

    Parameters:
    - X_train: Features of the training set; the scaler is fitted on these only.
    - X_test: Features of the test set; transformed with the fitted scaler.
    - type (str): Scaler to use: 'standard' (StandardScaler),
      'robust' (RobustScaler) or 'minmax' (MinMaxScaler).

    Returns:
    - tuple: (scaled X_train, scaled X_test).

    Raises:
    - ValueError: If `type` is not one of the three supported names.
    """
    # Choose the scaler class first; the fit/transform steps are identical
    # for every supported scaler.
    if type == 'standard':
        scaler_cls = StandardScaler
    elif type == 'robust':
        scaler_cls = RobustScaler
    elif type == 'minmax':
        scaler_cls = MinMaxScaler
    else:
        raise ValueError("Invalid scaling type. Use 'standard', 'robust', or 'minmax'.")

    scaler = scaler_cls()
    train_scaled = scaler.fit_transform(X_train)
    test_scaled = scaler.transform(X_test)
    return train_scaled, test_scaled


## Encoding

### One-hot-encoding

### Replace

### Label encoder

In [None]:
# Integer-encode the "Extracurricular Activities" column with sklearn's LabelEncoder.
# The encoder learns its classes from the training split only and the same
# mapping is then applied to the test split.
from sklearn.preprocessing import LabelEncoder
lb_make = LabelEncoder()
lb_make.fit(X_train["Extracurricular Activities"])
X_train["Extracurricular Activities"] = lb_make.transform(X_train["Extracurricular Activities"])
X_test["Extracurricular Activities"] = lb_make.transform(X_test["Extracurricular Activities"])

# Regression

## Learning curve

In [None]:
from sklearn.metrics import mean_squared_error
# from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

def plot_learning_curves(model, X_train, y_train, X_test, y_test, model_type, c):
  """
  Plot train and test RMSE of `model` as the training subset grows.

  For every size m = 1 .. len(X_train) the model is refitted on the first m
  training samples, then scored on those m samples and on the whole test set.
  The two RMSE curves are drawn on the current matplotlib figure.

  Parameters:
  - model: estimator exposing fit(X, y) and predict(X); it is refitted in place.
  - X_train, y_train: training features / targets, sliced as [:m].
  - X_test, y_test: test features / targets.
  - model_type (str): suffix used in the legend labels ("train_<model_type>").
  - c: accepted for backward compatibility but not used; the curve colours
    are fixed (blue for train, green for test).
  """
  train_errors, test_errors = [], []

  # The upper bound is len(X_train) + 1 so that the last point of the curve is
  # trained on the *full* training set (range() excludes its stop value).
  sizes = list(range(1, len(X_train) + 1))

  for m in sizes:
    model.fit(X_train[:m], y_train[:m])
    y_train_predict = model.predict(X_train[:m])
    y_test_predict = model.predict(X_test)

    train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
    test_errors.append(mean_squared_error(y_test, y_test_predict))

  # Plot against the real training-set sizes so the x axis matches its label
  # (plotting the lists alone would start the axis at index 0).
  plt.plot(sizes, np.sqrt(train_errors), 'b', linewidth=2, label="train_"+model_type)
  plt.plot(sizes, np.sqrt(test_errors), 'g' , linewidth=3, label="test_"+model_type)

plt.figure(figsize=(10, 10))

ridge_reg = Ridge(alpha=0.01, solver="cholesky")
# NOTE(review): X_train_poly / X_test_poly are not created in the visible
# notebook cells — confirm they are built (e.g. polynomial features) first.
plot_learning_curves(ridge_reg, X_train_poly, y_train, X_test_poly, y_test, "Ridge", 'b')

plt.xlabel('Training set size')
plt.ylabel('RMSE')
plt.title('RIDGE Regression, learning curve (alpha=0.01)')
plt.legend()

# Usage example
# --------------
# plt.figure(figsize=(10, 10))
# lin_reg = LinearRegression()
# plot_learning_curves(lin_reg, X_train_poly, y_train, X_test_poly, y_test, "LinearRegression", 'g')
# plt.title('Linear Regression, learning curve')
# plt.xlabel('Training set size')
# plt.ylabel('RMSE')
# plt.ylim((0,80))
# plt.legend()

## Linear reg

In [None]:

# Feature scaling is done already by the library
# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""


#fitting linear model on the training set
from sklearn.linear_model import LinearRegression
def simple_lin_reg(x_train, y_train):
    """
    Fit an ordinary least-squares LinearRegression on the given data.

    Parameters:
    - x_train: training features passed to LinearRegression.fit.
    - y_train: training targets passed to LinearRegression.fit.

    Returns:
    - LinearRegression: the fitted estimator.
    """
    regressor = LinearRegression()
    # Fit on the function's own arguments. The previous body fitted the
    # notebook-global X_train and silently ignored the x_train parameter.
    regressor.fit(x_train, y_train)
    return regressor

# r2_score is used below; its import is commented out earlier in the notebook.
from sklearn.metrics import mean_squared_error, r2_score

# Fit the scikit-learn model on the training split.
regressor = simple_lin_reg(X_train, y_train)

# Closed-form least-squares solution (SVD based) on the same training split,
# kept for comparison with regressor.intercept_ / regressor.coef_.
# A leading column of ones models the intercept term.
X_b_train = np.c_[np.ones((len(X_train), 1)), X_train]
theta_best_svd, residuals, rank, s = np.linalg.lstsq(X_b_train, y_train, rcond=1e-6)

# Predicting the test result: predict() takes feature rows, not the
# coefficient vector.
y_pred = regressor.predict(X_test)

print("test RMSE={}".format(np.sqrt(mean_squared_error(y_test, y_pred))))
print("test R2={}".format(r2_score(y_test, y_pred)))

# Train metrics use the same model and the same features it was fitted on.
print("train RMSE={}".format(np.sqrt(mean_squared_error(y_train, regressor.predict(X_train)))))
print("train R2={}".format(r2_score(y_train, regressor.predict(X_train))))

# Visualizing the training set and then the test set results.
# Each figure scatters one split in red; the blue regression line is always
# drawn from the training features, so it is the same line in both figures.
for X_points, y_points, plot_title in (
    (X_train, y_train, 'Salary vs Experience (trainings set)'),
    (X_test, y_test, 'Salary vs Experience (test set)'),
):
    plt.scatter(X_points, y_points, color='red')
    plt.plot(X_train, regressor.predict(X_train), color='blue')
    plt.title(plot_title)
    plt.xlabel('Year of experience')
    plt.ylabel('Salary')
    plt.show()

## Gradient descent

In [None]:
# Search over learning rates (eta) and keep the one with the lowest test RMSE.
from sklearn.metrics import mean_squared_error, r2_score

# Candidate learning rates. Note the grid starts at eta = 0, for which theta
# is never updated.
eta_values = np.round(np.arange(0,0.8,0.05),2)
n_iterations = 100

r2_scores = []
rmse_test_GD, rmse_train_GD = [], []
m=X_train.shape[0]

# Design matrices with x0 = 1 prepended to every instance (intercept term).
# They do not depend on eta or on the iteration, so they are built once.
X_b_train = np.c_[np.ones((len(X_train), 1)), X_train]
X_b_test = np.c_[np.ones((len(X_test), 1)), X_test]

for eta in eta_values:
  print("\neta = {}".format(eta))
  # Random initialisation: one coefficient per feature plus the intercept.
  theta = np.random.randn(X_train.shape[1]+1)

  for iteration in range(n_iterations):
    # Gradient of the MSE cost with respect to theta.
    gradients = (2/m) * X_b_train.T.dot(X_b_train.dot(theta) - y_train)
    theta = theta - eta * gradients

  # Only the final theta is evaluated, so predict once after the loop.
  y_pred_test = X_b_test.dot(theta)
  y_pred_train = X_b_train.dot(theta)

  # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error is
  # deprecated/removed in recent scikit-learn releases.
  test_rmse_eta = np.sqrt(mean_squared_error(y_test, y_pred_test))
  train_rmse_eta = np.sqrt(mean_squared_error(y_train, y_pred_train))
  test_r2_eta = r2_score(y_test, y_pred_test)

  rmse_test_GD.append(test_rmse_eta)
  r2_scores.append(test_r2_eta)
  rmse_train_GD.append(train_rmse_eta)

  print("test RMSE={}".format(test_rmse_eta))
  print("test R2={}".format(test_r2_eta))
  print("train RMSE={}".format(train_rmse_eta))
  print("train R2={}".format(r2_score(y_train, y_pred_train)))

best_eta = eta_values[np.argmin(rmse_test_GD)]
plt.plot(eta_values,rmse_test_GD,'r--+')
plt.title(f"RMSE VS eta - best eta: {best_eta}")
plt.ylabel("RMSE")
plt.xlabel("eta")
plt.grid(True)

In [None]:
# Plot the learning curve for the best eta found by the search cell above.
from sklearn.metrics import mean_squared_error, r2_score

eta = best_eta # learning rate
n_iterations = 100
m=X_train.shape[0]

# Design matrices with a leading column of 1s representing the intercept term.
X_b_train = np.c_[np.ones((len(X_train), 1)), X_train]
X_b_test = np.c_[np.ones((len(X_test), 1)), X_test]

rmse_train, rmse_test = [], []

# Random initialisation: one coefficient per feature plus the intercept.
theta = np.random.randn(X_train.shape[1]+1)

for iteration in range(n_iterations):
  # Calculate the predictions
  y_predict = X_b_train.dot(theta)
  # Calculate the gradients (derivative of MSE)
  gradients = 2/m * X_b_train.T.dot(y_predict - y_train)

  theta = theta - eta * gradients

  # Track the error of the updated theta on both splits.
  y_pred_test = X_b_test.dot(theta)
  y_pred_train = X_b_train.dot(theta)

  rmse_test.append(np.sqrt(mean_squared_error(y_test, y_pred_test)))
  rmse_train.append(np.sqrt(mean_squared_error(y_train, y_pred_train)))

iteration_axis = range(1, 1+n_iterations)
plt.plot(iteration_axis, rmse_test, label='Test Data RMSE', color='blue')
plt.plot(iteration_axis, rmse_train, label='Train Data RMSE', color='red')
plt.xlabel('# iterations')
plt.ylabel('RMSE')
plt.title('RMSE Comparison')
plt.grid(True)
plt.legend()
plt.show()


# Metrics of the final theta on the test set. RMSE is computed as sqrt(MSE)
# because `squared=False` is deprecated/removed in recent scikit-learn.
print("test RMSE={}".format(np.sqrt(mean_squared_error(y_test, y_pred_test))))
print("test R2={}".format(r2_score(y_test, y_pred_test)))

## Batch GD

## Stochastic GD

find best n

In [None]:
# Manual implementation of stochastic gradient descent for linear regression.
from sklearn.metrics import mean_squared_error

eta = 0.15
n_epochs = 100
m = X_train.shape[0]

# Design matrices with x0 = 1 prepended to every instance (intercept term).
X_b_train = np.c_[np.ones((len(X_train), 1)), X_train]
X_b_test = np.c_[np.ones((len(X_test), 1)), X_test]
# Plain array view of the targets so that indexing with a permutation is
# always positional, whatever index a pandas Series might carry.
y_train_values = np.asarray(y_train)

# Random initialisation: one coefficient per feature plus the intercept.
theta = np.random.randn(X_train.shape[1]+1)
rmse_test_SGD, rmse_train_SGD = [], []

for epoch in range(n_epochs):
  # Generate a random permutation index
  permutation = np.random.permutation(m)
  # Shuffle consistently the training set and the target (apply the same permutation)
  X_b_train_shuffled = X_b_train[permutation]
  y_train_shuffled = y_train_values[permutation]

  # One parameter update per training sample.
  for i in range(m):
    xi = X_b_train_shuffled[i]
    yi = y_train_shuffled[i]
    # Gradient of the squared error of this single sample (xi is 1-D).
    gradients = 2 * xi * (xi.dot(theta) - yi)
    theta = theta - eta * gradients

  # Calculate predictions at each epoch
  y_pred_test = X_b_test.dot(theta)
  y_pred_train = X_b_train_shuffled.dot(theta)
  # Calculate RMSE at each epoch as sqrt(MSE); `squared=False` is
  # deprecated/removed in recent scikit-learn releases.
  rmse_test_SGD.append(np.sqrt(mean_squared_error(y_test, y_pred_test)))
  rmse_train_SGD.append(np.sqrt(mean_squared_error(y_train_shuffled, y_pred_train)))


Learning curve

In [None]:
# Per-epoch RMSE of the SGD run above, followed by the final test metrics.
from sklearn.metrics import mean_squared_error, r2_score

epoch_axis = range(1, 1+n_epochs)
plt.plot(epoch_axis, rmse_test_SGD, label='Test Data RMSE', color='blue')
plt.plot(epoch_axis, rmse_train_SGD, label='Train Data RMSE', color='red')
plt.xlabel('# epochs')
plt.ylabel('RMSE')
plt.title('RMSE Comparison')
plt.grid(True)
plt.legend()
plt.show()


# RMSE is computed as sqrt(MSE) because the `squared=False` keyword is
# deprecated/removed in recent scikit-learn releases.
print("test RMSE={}".format(np.sqrt(mean_squared_error(y_test, y_pred_test))))
print("test R2={}".format(r2_score(y_test, y_pred_test)))

## Polynomial reg

In [None]:
# Fitting a plain linear baseline and a degree-4 polynomial regression.
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Linear baseline used by the first plot and the first prediction below;
# it has to be fitted here because no earlier cell fits a `lin_reg`.
lin_reg = LinearRegression()
lin_reg.fit(X, y)

poly_reg =  PolynomialFeatures(degree = 4)
X_poli = poly_reg.fit_transform(X)
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poli, y)

#visualize linear regression results
plt.scatter(X, y, color= 'red')
plt.plot(X, lin_reg.predict(X), color = 'blue')
plt.title('Reality vs Bluff(linear regression)')
plt.xlabel('Position label')
plt.ylabel('Salary')
plt.show()

#visualize polynomial regression results
plt.scatter(X, y, color= 'red')
# transform(), not fit_transform(): the feature expansion is already fitted.
plt.plot(X, lin_reg_2.predict(poly_reg.transform(X)), color = 'blue')
plt.title('Reality vs Bluff(polynomial regression)')
plt.xlabel('Position label')
plt.ylabel('Salary')
plt.show()

# scikit-learn estimators expect a 2-D array of shape (n_samples, n_features),
# so the single query value 6.5 is passed as [[6.5]].
#Predicting a new result with Linear Regression
print(lin_reg.predict([[6.5]]))

#Predicting a new result with Polynomial regression
print(lin_reg_2.predict(poly_reg.transform([[6.5]])))

## Ridge reg

In [None]:
from sklearn.linear_model import Ridge

# Candidate regularisation strengths for Ridge.
alphas = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.08, 0.1, 0.2]

# Test-set RMSE obtained for each alpha, in the same order as `alphas`.
rmse_values = []

for alpha in alphas:
  ridge_model = Ridge(alpha=alpha)
  ridge_model.fit(X_train_poly, y_train)
  y_predict = ridge_model.predict(X_test_poly)
  test_mse = mean_squared_error(y_test, y_predict)
  rmse_values.append(np.sqrt(test_mse))

# Plot RMSE values against alpha values
plt.plot(alphas, rmse_values, marker='o')
plt.xscale('log')  # Use a log scale for better visualization
plt.xlabel('Alpha')
plt.ylabel('RMSE')
plt.title('RMSE vs Alpha (log scale)')
plt.show()

# Position of the smallest test RMSE, then the RMSE and alpha found there.
min_rmse_index = np.argmin(rmse_values)
min_rmse, optimal_alpha = rmse_values[min_rmse_index], alphas[min_rmse_index]

print(f'\nMinimum Test-RMSE: {np.round(min_rmse,4)} at Alpha: {optimal_alpha}')

## Lasso reg

In [None]:
from sklearn.linear_model import Lasso

# Candidate regularisation strengths for Lasso.
alphas = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.08, 0.1, 0.2]

# Test-set RMSE obtained for each alpha, in the same order as `alphas`.
rmse_values = []

for alpha in alphas:
  lasso_model = Lasso(alpha=alpha)
  lasso_model.fit(X_train, y_train)
  y_predict = lasso_model.predict(X_test)
  test_mse = mean_squared_error(y_test, y_predict)
  rmse_values.append(np.sqrt(test_mse))

# Plot RMSE values against alpha values
plt.plot(alphas, rmse_values, marker='o')
plt.xscale('log')  # Use a log scale for better visualization
plt.xlabel('Alpha')
plt.ylabel('RMSE')
plt.title('RMSE vs Alpha (log scale)')
plt.show()

# Position of the smallest test RMSE, then the RMSE and alpha found there.
min_rmse_index = np.argmin(rmse_values)
min_rmse, optimal_alpha = rmse_values[min_rmse_index], alphas[min_rmse_index]

print(f'\nMinimum Test-RMSE: {np.round(min_rmse,4)} at Alpha: {optimal_alpha}')

## Decision tree

In [None]:
#fitting decision tree regression model to the dataset
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(X,y)


#Predicting a new result with decision tree regression
# predict() expects a 2-D array of shape (n_samples, n_features), so the
# single query value is passed as [[6.5]].
y_pred = regressor.predict([[6.5]])


#visualize decision tree regression results (for high resolution and smoother curves)
# Scalar bounds via np.asarray(...).min()/max(): the builtin min()/max() on a
# 2-D array return one-element arrays rather than scalars.
X_values = np.asarray(X)
X_grid = np.arange(X_values.min(), X_values.max(), 0.1)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color= 'red')
plt.plot(X_grid, regressor.predict(X_grid), color = 'blue')
plt.title('Reality vs Bluff(decision tree regression)')
plt.xlabel('Position label')
plt.ylabel('Salary')
plt.show()

## Random forest

In [None]:

#fitting Random forest regression model to the dataset
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
regressor.fit(X,y)


#Predicting a new result with Random forest regression
# predict() expects a 2-D array of shape (n_samples, n_features), so the
# single query value is passed as [[6.5]].
y_pred = regressor.predict([[6.5]])


#visualize Random forest regression results (for high resolution and smoother curves)
# Scalar bounds via np.asarray(...).min()/max(): the builtin min()/max() on a
# 2-D array return one-element arrays rather than scalars.
X_values = np.asarray(X)
X_grid = np.arange(X_values.min(), X_values.max(), 0.1)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color= 'red')
plt.plot(X_grid, regressor.predict(X_grid), color = 'blue')
# Title names the model actually plotted in this cell.
plt.title('Reality vs Bluff(random forest regression)')
plt.xlabel('Position label')
plt.ylabel('Salary')
plt.show()

# Classification

## Logistic regression

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from matplotlib.colors import ListedColormap

def scale_features(X_train, X_test, type='standard'):
    """
    Scale features with a scaler fitted on the training set only.

    An earlier cell of this notebook defines scale_features with a `type`
    argument. This definition replaces it once the cell runs, so it accepts
    the same optional argument: calls that pass `type=` keep working, and
    calls without it still get StandardScaler.

    Parameters:
    - X_train: Features of the training set (the scaler is fitted on these).
    - X_test: Features of the test set (transformed with the fitted scaler).
    - type (str): 'standard' (default), 'robust' or 'minmax'.

    Returns:
    - tuple: Scaled X_train and X_test.

    Raises:
    - ValueError: If `type` is not one of the supported names.
    """
    if type not in ('standard', 'robust', 'minmax'):
        raise ValueError("Invalid scaling type. Use 'standard', 'robust', or 'minmax'.")

    # Imported locally because this cell's import line only brings in StandardScaler.
    from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

    scaler_classes = {
        'standard': StandardScaler,
        'robust': RobustScaler,
        'minmax': MinMaxScaler,
    }
    sc_X = scaler_classes[type]()
    X_train_scaled = sc_X.fit_transform(X_train)
    X_test_scaled = sc_X.transform(X_test)
    return X_train_scaled, X_test_scaled

def fit_logistic_regression(X_train, y_train):
    """
    Train a LogisticRegression classifier (random_state=0) on the training set.

    Parameters:
    - X_train: Features of the training set.
    - y_train: Target variable of the training set.

    Returns:
    - LogisticRegression: The fitted classifier.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    return LogisticRegression(random_state=0).fit(X_train, y_train)

def predict_and_confusion_matrix(classifier, X_test, y_test):
    """
    Predict the test set and compare the predictions with the true labels.

    Parameters:
    - classifier: Fitted model exposing predict().
    - X_test: Features of the test set.
    - y_test: True target values of the test set.

    Returns:
    - tuple: (predictions for X_test, confusion matrix of y_test vs. predictions).
    """
    predictions = classifier.predict(X_test)
    return predictions, confusion_matrix(y_test, predictions)

def plot_decision_boundary(X_set, y_set, classifier, title):
    """
    Plot the decision regions of a fitted classifier over a 2-feature dataset.

    The classifier is evaluated on a dense grid (step 0.01) spanning each
    feature's range padded by 1 on both sides; the predicted regions are
    filled in red/green and the samples of X_set are scattered on top,
    coloured by class. If X_set does not have exactly two features, a
    message is printed and nothing is drawn.

    Parameters:
    - X_set: 2-D array of features, indexed positionally as X_set[:, k].
    - y_set: Class label of every row of X_set.
    - classifier: Fitted model exposing predict().
    - title (str): Plot title.
    """
    if X_set.shape[1] != 2:
        print("Plotting is supported for 2 features only.")
        return

    region_cmap = ListedColormap(('red', 'green'))
    X1, X2 = np.meshgrid(
        np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
        np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01)
    )
    grid_predictions = classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape)
    plt.contourf(X1, X2, grid_predictions, alpha=0.75, cmap=region_cmap)
    # X1 holds the first-feature (x axis) grid values and X2 the second-feature
    # (y axis) ones, so each axis is limited by its own grid.
    plt.xlim(X1.min(), X1.max())
    plt.ylim(X2.min(), X2.max())
    for i, j in enumerate(np.unique(y_set)):
        # `color=` (not `c=`): a single RGBA tuple given as `c` is ambiguous
        # to matplotlib and triggers a warning.
        plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                    color=region_cmap(i), label=j)
    plt.title(title)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    # The scatter calls set per-class labels; show them.
    plt.legend()
    plt.show()

def visualize_results(X_train, y_train, X_test, y_test, classifier):
    """
    Draw the classifier's decision boundary over the training set, then the test set.

    Parameters:
    - X_train, y_train: Features and targets of the training set.
    - X_test, y_test: Features and targets of the test set.
    - classifier: Fitted Logistic Regression model.
    """
    panels = (
        (X_train, y_train, 'Logistic Regression (Training Set)'),
        (X_test, y_test, 'Logistic Regression (Test Set)'),
    )
    for features, labels, panel_title in panels:
        plot_decision_boundary(features, labels, classifier, title=panel_title)


## Naive Bayes

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from matplotlib.colors import ListedColormap

def scale_features(X_train, X_test, type='standard'):
    """
    Scale features with a scaler fitted on the training set only.

    An earlier cell of this notebook defines scale_features with a `type`
    argument. This definition replaces it once the cell runs, so it accepts
    the same optional argument: calls that pass `type=` keep working, and
    calls without it still get StandardScaler.

    Parameters:
    - X_train: Features of the training set (the scaler is fitted on these).
    - X_test: Features of the test set (transformed with the fitted scaler).
    - type (str): 'standard' (default), 'robust' or 'minmax'.

    Returns:
    - tuple: Scaled X_train and X_test.

    Raises:
    - ValueError: If `type` is not one of the supported names.
    """
    if type not in ('standard', 'robust', 'minmax'):
        raise ValueError("Invalid scaling type. Use 'standard', 'robust', or 'minmax'.")

    # Imported locally because this cell's import line only brings in StandardScaler.
    from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

    scaler_classes = {
        'standard': StandardScaler,
        'robust': RobustScaler,
        'minmax': MinMaxScaler,
    }
    sc_X = scaler_classes[type]()
    X_train_scaled = sc_X.fit_transform(X_train)
    X_test_scaled = sc_X.transform(X_test)
    return X_train_scaled, X_test_scaled

def fit_naive_bayes_classifier(X_train, y_train):
    """
    Train a Gaussian Naive Bayes classifier on the training set.

    Parameters:
    - X_train: Features of the training set.
    - y_train: Target variable of the training set.

    Returns:
    - GaussianNB: The fitted classifier.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    return GaussianNB().fit(X_train, y_train)

def predict_and_confusion_matrix(classifier, X_test, y_test):
    """
    Predict the test set and compare the predictions with the true labels.

    Parameters:
    - classifier: Fitted model exposing predict().
    - X_test: Features of the test set.
    - y_test: True target values of the test set.

    Returns:
    - tuple: (predictions for X_test, confusion matrix of y_test vs. predictions).
    """
    predictions = classifier.predict(X_test)
    return predictions, confusion_matrix(y_test, predictions)

def plot_decision_boundary(X_set, y_set, classifier, title):
    """
    Plot the decision regions of a fitted classifier over a 2-feature dataset.

    The classifier is evaluated on a dense grid (step 0.01) spanning each
    feature's range padded by 1 on both sides; the predicted regions are
    filled in red/green and the samples of X_set are scattered on top,
    coloured by class. If X_set does not have exactly two features, a
    message is printed and nothing is drawn.

    Parameters:
    - X_set: 2-D array of features, indexed positionally as X_set[:, k].
    - y_set: Class label of every row of X_set.
    - classifier: Fitted model exposing predict().
    - title (str): Plot title.
    """
    if X_set.shape[1] != 2:
        print("Plotting is supported for 2 features only.")
        return

    region_cmap = ListedColormap(('red', 'green'))
    X1, X2 = np.meshgrid(
        np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
        np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01)
    )
    grid_predictions = classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape)
    plt.contourf(X1, X2, grid_predictions, alpha=0.75, cmap=region_cmap)
    # X1 holds the first-feature (x axis) grid values and X2 the second-feature
    # (y axis) ones, so each axis is limited by its own grid.
    plt.xlim(X1.min(), X1.max())
    plt.ylim(X2.min(), X2.max())
    for i, j in enumerate(np.unique(y_set)):
        # `color=` (not `c=`): a single RGBA tuple given as `c` is ambiguous
        # to matplotlib and triggers a warning.
        plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                    color=region_cmap(i), label=j)
    plt.title(title)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    # The scatter calls set per-class labels; show them.
    plt.legend()
    plt.show()

def visualize_results(X_train, y_train, X_test, y_test, classifier):
    """
    Draw the classifier's decision boundary over the training set, then the test set.

    Parameters:
    - X_train, y_train: Features and targets of the training set.
    - X_test, y_test: Features and targets of the test set.
    - classifier: Fitted Naive Bayes Classifier.
    """
    panels = (
        (X_train, y_train, 'Naive Bayes (Training Set)'),
        (X_test, y_test, 'Naive Bayes (Test Set)'),
    )
    for features, labels, panel_title in panels:
        plot_decision_boundary(features, labels, classifier, title=panel_title)


## KNN

In [None]:
#fitting KNN classifier to the training set
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5, metric = 'minkowski', p=2)
classifier.fit(X_train, y_train)


#predict test set result
y_pred = classifier.predict(X_test)

#making the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

#visualizing the training set results, then the test set results
# Both figures are produced by the same code; only the data and title differ.
from matplotlib.colors import ListedColormap
region_cmap = ListedColormap(('red', 'green'))
for X_set, y_set, plot_title in ((X_train, y_train, 'K-NN  (training set)'),
                                 (X_test, y_test, 'K-NN (test set)')):
    # Dense grid over both features (padded by 1) on which the classifier is evaluated.
    X1, X2 = np.meshgrid(np.arange(start = X_set[:,0].min() - 1, stop = X_set[:,0].max() + 1, step=0.01),
                         np.arange(start = X_set[:,1].min() - 1, stop = X_set[:,1].max() + 1, step=0.01))
    plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
                 alpha = 0.75, cmap = region_cmap)
    # X1 is the x-axis grid and X2 the y-axis grid: limit each axis by its own grid.
    plt.xlim(X1.min(), X1.max())
    plt.ylim(X2.min(), X2.max())
    for i,j in enumerate(np.unique(y_set)):
        # `color=` avoids matplotlib's warning about a single RGBA tuple passed as `c`.
        plt.scatter(X_set[y_set==j, 0], X_set[y_set==j, 1],
                    color = region_cmap(i), label = j)

    plt.title(plot_title)
    plt.xlabel('Age')
    plt.ylabel('Estimated Salary')
    plt.legend()
    plt.show()

## Decision tree

In [None]:

# Splitting the dataset into the Training set and Test set
# (sklearn.cross_validation was removed from scikit-learn; train_test_split
# lives in sklearn.model_selection.)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Feature Scaling: fitted on the training split only, then applied to both.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

#fitting decision tree classifier to the training set
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion= 'entropy',
                                    random_state = 0)
classifier.fit(X_train, y_train)


#predict test set result
y_pred = classifier.predict(X_test)

#making the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

#visualizing the training set results, then the test set results
# Both figures are produced by the same code; only the data and title differ.
from matplotlib.colors import ListedColormap
region_cmap = ListedColormap(('red', 'green'))
for X_set, y_set, plot_title in ((X_train, y_train, 'Decision tree classifier (training set)'),
                                 (X_test, y_test, 'Decision tree classifier (test set)')):
    # Dense grid over both features (padded by 1) on which the classifier is evaluated.
    X1, X2 = np.meshgrid(np.arange(start = X_set[:,0].min() - 1, stop = X_set[:,0].max() + 1, step=0.01),
                         np.arange(start = X_set[:,1].min() - 1, stop = X_set[:,1].max() + 1, step=0.01))
    plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
                 alpha = 0.75, cmap = region_cmap)
    # X1 is the x-axis grid and X2 the y-axis grid: limit each axis by its own grid.
    plt.xlim(X1.min(), X1.max())
    plt.ylim(X2.min(), X2.max())
    for i,j in enumerate(np.unique(y_set)):
        # `color=` avoids matplotlib's warning about a single RGBA tuple passed as `c`.
        plt.scatter(X_set[y_set==j, 0], X_set[y_set==j, 1],
                    color = region_cmap(i), label = j)

    plt.title(plot_title)
    plt.xlabel('Age')
    plt.ylabel('Estimated Salary')
    plt.legend()
    plt.show()

# Plotting the tree
# In the terminal enter: pip install pydot2
from sklearn import tree
# sklearn.externals.six was removed from scikit-learn; the standard library
# provides the same in-memory text buffer.
from io import StringIO
from IPython.display import Image
import pydot
dot_data = StringIO()
# Write the fitted tree in Graphviz DOT format into the in-memory buffer.
tree.export_graphviz(classifier,
                     out_file = dot_data,
                     feature_names = ['Age', 'Estimated Salary'],
                     class_names = ['Yes', 'No'],
                     filled = True,
                     rounded = True,
                     special_characters = True)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
# Depending on the installed pydot version this call returns either a single
# graph or a list of graphs; use the first graph in the latter case.
if isinstance(graph, (list, tuple)):
    graph = graph[0]
Image(graph.create_png())

## Random forest

In [None]:

#fitting random forest classification to the training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10,
                                    criterion = 'entropy',
                                    random_state = 0 )
classifier.fit(X_train, y_train)



#predict test set result
y_pred = classifier.predict(X_test)

#making the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

#visualizing the training set results, then the test set results
# Both figures are produced by the same code; only the data and title differ.
from matplotlib.colors import ListedColormap
region_cmap = ListedColormap(('red', 'green'))
for X_set, y_set, plot_title in ((X_train, y_train, 'Random forest classification (training set)'),
                                 (X_test, y_test, 'Random forest classification  (test set)')):
    # Dense grid over both features (padded by 1) on which the classifier is evaluated.
    X1, X2 = np.meshgrid(np.arange(start = X_set[:,0].min() - 1, stop = X_set[:,0].max() + 1, step=0.01),
                         np.arange(start = X_set[:,1].min() - 1, stop = X_set[:,1].max() + 1, step=0.01))
    plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
                 alpha = 0.75, cmap = region_cmap)
    # X1 is the x-axis grid and X2 the y-axis grid: limit each axis by its own grid.
    plt.xlim(X1.min(), X1.max())
    plt.ylim(X2.min(), X2.max())
    for i,j in enumerate(np.unique(y_set)):
        # `color=` avoids matplotlib's warning about a single RGBA tuple passed as `c`.
        plt.scatter(X_set[y_set==j, 0], X_set[y_set==j, 1],
                    color = region_cmap(i), label = j)

    plt.title(plot_title)
    plt.xlabel('Age')
    plt.ylabel('Estimated Salary')
    plt.legend()
    plt.show()