In [1]:
# import the required packages

import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import zscore

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn. ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
%matplotlib inline

## 1. HR Analytics

### 1.1 Load and Sample the data

In [2]:
# Load the dataset
hr_data = pd.read_csv('hr_data_new.csv')


# Summary of data cleaning and transformation
# (Note: You need to fill this part based on what you have done in your previous assignment)

# Checking for missing values and data types
hr_data.info()

# Basic statistical summary
hr_data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50590 entries, 0 to 50589
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   previous_year_rating          50590 non-null  int64  
 1   KPIs_met >80%                 50590 non-null  int64  
 2   awards_won?                   50590 non-null  int64  
 3   avg_training_score            50590 non-null  int64  
 4   is_promoted                   50590 non-null  int64  
 5   training_effectiveness        50590 non-null  float64
 6   training_effectiveness_level  50590 non-null  int64  
dtypes: float64(1), int64(6)
memory usage: 2.7 MB


Unnamed: 0,previous_year_rating,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,training_effectiveness,training_effectiveness_level
count,50590.0,50590.0,50590.0,50590.0,50590.0,50590.0,50590.0
mean,3.304843,0.359577,0.024056,63.496659,0.086677,56.54095,2.328266
std,1.212867,0.479881,0.153225,13.452907,0.281364,18.215224,0.952628
min,1.0,0.0,0.0,39.0,0.0,4.9,0.0
25%,3.0,0.0,0.0,51.0,0.0,47.0,2.0
50%,3.0,0.0,0.0,60.0,0.0,57.0,2.0
75%,4.0,1.0,0.0,76.0,0.0,70.0,3.0
max,5.0,1.0,1.0,99.0,1.0,99.0,4.0


In [3]:

# Display the first few rows of the dataset
hr_data.head()

Unnamed: 0,previous_year_rating,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,training_effectiveness,training_effectiveness_level
0,5,1,0,49,0,49.0,2
1,5,0,0,60,0,60.0,2
2,3,0,0,50,0,50.0,2
3,1,0,0,50,0,25.0,1
4,3,0,0,73,0,73.0,3


### 1.2 Build the Model(s)

#### 1.2.1 Simple train & test split
We start by using the simple train-test split which we have been using in the past several weeks.

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


hr_X = hr_data.drop('is_promoted', axis=1)
hr_Y = hr_data['is_promoted']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(hr_X, hr_Y, test_size=0.2, random_state=4)


In [5]:
# Logistic Regression Model
log_reg = LogisticRegression(solver='lbfgs',max_iter=10000)
log_reg.fit(X_train, y_train)


In [6]:
# Decision Tree Model
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)


#### 1.2.2 K-folds Cross Validation

In [7]:
# Evaluate the Model using K-folds Cross validation
results  = cross_validate(log_reg, hr_X, hr_Y, scoring='accuracy', cv=5, return_train_score = True)
print('train_score: ', results['train_score'])
print('test_score: ', results['test_score'])

train_score:  [0.9148053  0.91554655 0.91522534 0.91470646 0.91497826]
test_score:  [0.9159913  0.91361929 0.9145088  0.91559597 0.9148053 ]


In [8]:
print('cross val training accuracy is:', sum(results['train_score'])/len(results['train_score']))
print('cross val testing accuracy is:', sum(results['test_score'])/len(results['test_score']))

cross val training accuracy is: 0.9150523818936549
cross val testing accuracy is: 0.9149041312512354


In [9]:
# Evaluate the Model using K-folds Cross validation
results  = cross_validate(tree, hr_X, hr_Y, scoring='accuracy', cv=5, return_train_score = True)
print('train_score: ', results['train_score'])
print('test_score: ', results['test_score'])

train_score:  [0.92814786 0.92891382 0.92874086 0.92812315 0.92832081]
test_score:  [0.92281083 0.92014232 0.92063649 0.92093299 0.92093299]


In [10]:
print('cross val training accuracy is:', sum(results['train_score'])/len(results['train_score']))
print('cross val testing accuracy is:', sum(results['test_score'])/len(results['test_score']))

cross val training accuracy is: 0.9284492982802925
cross val testing accuracy is: 0.9210911247282072


### 1.3 Evaluate and Improve the Model(s)

#### 1.3.1 Tune parameters and Grid Search
To improve our model, we use the grid search to try to find the best parametars, details are shown below. 

In [11]:
from sklearn.model_selection import GridSearchCV

# Logistic Regression with GridSearchCV
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
log_reg_cv = GridSearchCV(log_reg, param_grid, cv=5)
log_reg_cv.fit(X_train, y_train)

print("Best Parameters: ", log_reg_cv.best_params_)


Best Parameters:  {'C': 0.01, 'solver': 'newton-cg'}


In [12]:
# Decision Tree with GridSearchCV
param_grid = {
    'criterion': ['gini', 'entropy'],   # Measures the quality of a split
    'max_depth': [2, 4, 6, 8],          # Maximum depth of the tree
    'min_samples_leaf': [1, 5, 10]      # Minimum number of samples required to be at a leaf node
}

# Initialize the Decision Tree Classifier
tree = DecisionTreeClassifier()

# Initialize GridSearchCV
tree_cv = GridSearchCV(tree, param_grid, cv=5)

# Fit the model
tree_cv.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters: ", tree_cv.best_params_)


Best Parameters:  {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 10}


In [13]:
from sklearn.model_selection import cross_validate

# Initialize the model with the best parameters
log_reg_final = LogisticRegression(C=0.01, solver='newton-cg')

# Fit the model to the training data
log_reg_final.fit(X_train, y_train)

# Evaluate the model on the training and testing sets
print('***Final Logistic Regression Model***')
print('Training Accuracy: ', log_reg_final.score(X_train, y_train))
print('Testing Accuracy: ', log_reg_final.score(X_test, y_test))

# Perform cross-validation
log_reg_results = cross_validate(log_reg_final, hr_X, hr_Y, scoring='accuracy', cv=5, return_train_score=True)
print('Cross-validation Training Accuracy:', sum(log_reg_results['train_score'])/len(log_reg_results['train_score']))
print('Cross-validation Testing Accuracy:', sum(log_reg_results['test_score'])/len(log_reg_results['test_score']))


***Final Logistic Regression Model***
Training Accuracy:  0.9153241747380906
Testing Accuracy:  0.9156948013441392
Cross-validation Training Accuracy: 0.9154328918758647
Cross-validation Testing Accuracy: 0.9154180668116227


In [14]:
# Initialize the model with the best parameters
tree_final = DecisionTreeClassifier(criterion='gini', max_depth=6, min_samples_leaf=1)

# Fit the model to the training data
tree_final.fit(X_train, y_train)

# Evaluate the model on the training and testing sets
print('***Final Decision Tree Model***')
print('Training Accuracy: ', tree_final.score(X_train, y_train))
print('Testing Accuracy: ', tree_final.score(X_test, y_test))

# Perform cross-validation
tree_results = cross_validate(tree_final, hr_X, hr_Y, scoring='accuracy', cv=5, return_train_score=True)
print('Cross-validation Training Accuracy:', sum(tree_results['train_score'])/len(tree_results['train_score']))
print('Cross-validation Testing Accuracy:', sum(tree_results['test_score'])/len(tree_results['test_score']))


***Final Decision Tree Model***
Training Accuracy:  0.9237991697964024
Testing Accuracy:  0.9259735125518878
Cross-validation Training Accuracy: 0.9244267641826447
Cross-validation Testing Accuracy: 0.9236805692824669


## 2. Airbnb

### 2.1 Load and Sample the data

In [15]:
# Load the dataset
listings_new = pd.read_csv('listings_new.csv')

# Basic statistical summary
listings_new.describe()

Unnamed: 0,neighbourhood,room_type,price,average_price_per_night,popularity_level
count,7907.0,7907.0,7907.0,7907.0,7907.0
mean,13.950803,0.622233,169.332996,75.78909,0.979006
std,8.499177,0.578584,340.187599,246.536902,1.082045
min,0.0,0.0,0.0,0.0,0.0
25%,6.0,0.0,65.0,11.0,0.0
50%,14.0,1.0,124.0,39.8,0.0
75%,20.0,1.0,199.0,83.333333,2.0
max,42.0,2.0,10000.0,10000.0,3.0


In [16]:
listings_new.head()

Unnamed: 0,neighbourhood,room_type,price,average_price_per_night,popularity_level
0,0,0,83,0.461111,2
1,1,0,81,0.9,0
2,0,0,69,11.5,0
3,2,0,206,206.0,0
4,2,0,94,94.0,0


In [17]:
listings_new['average_price_per_night'] = listings_new['average_price_per_night'].round(2)

### 2.2 Build the Model(s)

#### 2.2.1 Simple train & test split

In [18]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor

air_X = listings_new.drop('price', axis=1)  # Features
air_Y = listings_new['price']               # Target

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(air_X, air_Y, test_size=0.2, random_state=4)


In [19]:
from sklearn.preprocessing import Normalizer
scaler = Normalizer()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [25]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


# Example models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'XGBoost': xgb.XGBRegressor()
}

# Cross-validation to evaluate models
for name, model in models.items():
    mse_scores = -cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
    print(f'{name}: Average MSE: {np.mean(mse_scores)}')


Linear Regression: Average MSE: 94970.15226643399
Ridge: Average MSE: 94962.61628155565
Lasso: Average MSE: 95284.12444069472
Random Forest: Average MSE: 52626.643777249286
Gradient Boosting: Average MSE: 49976.89151543087
XGBoost: Average MSE: 76565.02999255413


In [20]:
# Build Random Forest Model and Evaluate both training and testing accuracy
rf = RandomForestRegressor(n_estimators=40)
rf.fit(X_train,y_train)

In [21]:
from sklearn.ensemble import GradientBoostingRegressor

# Initialize Gradient Boosting Regressor
gb_model = GradientBoostingRegressor(random_state=42)

# Train the Gradient Boosting model with the training data
gb_model.fit(X_train, y_train)


In [22]:
from sklearn.metrics import mean_squared_error
print('the training mean squared error is: ', mean_squared_error(rf.predict(X_train), y_train))
print('the testing mean squared error is: ', mean_squared_error(rf.predict(X_test), y_test))

the training mean squared error is:  8716.50530083976
the testing mean squared error is:  99673.73217141979


#### 2.2.2 K-folds Cross Validation

In [None]:
# Evaluate the Model using K-folds Cross validation
results  = cross_validate(rf, air_X, air_Y.ravel(), scoring='neg_mean_squared_error', cv=5, return_train_score = True)
print('train_score: ', results['train_score'])
print('test_score: ', results['test_score'])

In [None]:
print('cross val training mean_squared_error is:', sum(-results['train_score'])/len(results['train_score']))
print('cross val testing mean_squared_error is:', sum(-results['test_score'])/len(results['test_score']))

In [None]:
# Evaluate the Model using K-folds Cross validation
results  = cross_validate(gb_model, air_X, air_Y.ravel(), scoring='neg_mean_squared_error', cv=5, return_train_score = True)
print('train_score: ', results['train_score'])
print('test_score: ', results['test_score'])

In [None]:
print('cross val training mean_squared_error is:', sum(-results['train_score'])/len(results['train_score']))
print('cross val testing mean_squared_error is:', sum(-results['test_score'])/len(results['test_score']))

### 2.3 Evaluate and Improve the Model(s)

#### 2.3.1 Tuning Parameters  and Grid Search
To improve our model, we use the grid search to try to find the best parametars, details are shown below. 

#### Random Forest regressor

In [None]:
# Setting max_depth from 2 to 9 (8 different values)
# Training & evaluating the model 8 times with different max_depth value
# Store both simple split measures (train_mse & test_mse) and 
# cross validation measures (cross_val_train_mse & cross val_test_mse) into arrays

depth_range = range(2,10)

train_mse = []
test_mse = []
cross_val_train_mse =[]
cross_val_test_mse =[]

for d in depth_range:
    # Create Random Forest Model
    rf = RandomForestRegressor(max_depth=d,random_state=1, n_estimators=20)
    rf.fit(X_train, y_train.ravel()) 
    train_mse.append(mean_squared_error(rf.predict(X_train), y_train))
    test_mse.append(mean_squared_error(rf.predict(X_test), y_test))
    
    results  = cross_validate(rf, air_X, air_Y.ravel(), scoring='neg_mean_squared_error', cv=5, return_train_score = True)
    cross_val_train_mse.append(sum(-results['train_score'])/len(results['train_score']))
    cross_val_test_mse.append(sum(-results['test_score'])/len(results['test_score']))
    


In [None]:
# Plot the simple split measures (train_mse & test_mse) 
plt.plot(depth_range, train_mse, 'bo-', label ='training mse')
plt.plot(depth_range, test_mse, 'ro-', label = 'testing mse')

plt.xlabel('max_depth', fontsize='x-large')
plt.ylabel('mean squared error (MSE)', fontsize='x-large')

plt.legend(loc='best', shadow=True, fontsize='x-large')
plt.show()

In [None]:
# Plot the cross validation measures (cross_val_train_mse & cross val_test_mse) 
plt.plot(depth_range, cross_val_train_mse, 'bo-', label ='cross val training mse')
plt.plot(depth_range, cross_val_test_mse, 'ro-', label = 'cross val testing mse')

plt.xlabel('max_depth', fontsize='x-large')
plt.ylabel('mean squared error (MSE)', fontsize='x-large')

plt.legend(loc='best', shadow=True, fontsize='x-large')
plt.show()

In [None]:
print('***Final Model***')
rf_final = RandomForestRegressor(max_depth=8,random_state=1, n_estimators=20)
rf_final.fit(X_train, y_train.ravel()) 
print('training mse: ', mean_squared_error(rf_final.predict(X_train), y_train))
print('testing mse: ', mean_squared_error(rf_final.predict(X_test), y_test))
    
results  = cross_validate(rf_final, air_X, air_Y.ravel(), scoring='neg_mean_squared_error', cv=5, return_train_score = True)
print('cross val training mse:', sum(-results['train_score'])/len(results['train_score']))
print('cross val testing mse:', sum(-results['test_score'])/len(results['test_score']))

#### Gradient Boosting regressor

In [None]:
# Define the parameter grid for XGBoost
xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

# Initialize the XGBoost Regressor
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror')

# Perform Grid Search
xgb_grid_search = GridSearchCV(xgb_reg, xgb_param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
xgb_grid_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters for XGBoost Regressor: ", xgb_grid_search.best_params_)


In [None]:
# Initialize XGBoost Regressor with the best parameters
xgb_best = xgb.XGBRegressor(colsample_bytree=1.0, learning_rate=0.2, max_depth=3, n_estimators=50, subsample=1.0)

print('***Final Model***')


xgb_best.fit(X_train, y_train)

# Training and Testing MSE
print('Training MSE: ', mean_squared_error(y_train, xgb_best.predict(X_train)))
print('Testing MSE: ', mean_squared_error(y_test, xgb_best.predict(X_test)))

# Cross-validation MSE
results = cross_validate(xgb_best, air_X, air_Y, scoring='neg_mean_squared_error', cv=5, return_train_score=True)
print('Cross-validation Training MSE:', sum(-results['train_score']) / len(results['train_score']))
print('Cross-validation Testing MSE:', sum(-results['test_score']) / len(results['test_score']))
