## Model 1 - Linear Regression (based on the linear regression class module)

In [None]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [None]:
# Silence every warning category for the rest of the session.
import warnings

warnings.simplefilter("ignore")


In [None]:
# Data preparation: load the dataset into a DataFrame.
# The path below is a placeholder; replace it with our CSV file
# (or switch to a JSON reader if we go that route).
df = pd.read_csv(Path('./_________'))
df.head()  # preview the first rows

In [None]:
# Scatter plot of the raw data.
# TODO: change the variables/features to match our data.
plt.scatter(df["YearsExperience"], df["Salary"])
plt.xlabel('Years of Experience')
plt.ylabel('Salary in USD')
plt.show()

In [None]:
# Feature array: reshape(-1, 1) turns the 1-D column into a
# single-column 2-D array with one row per sample.
# TODO: change the variables/features to match our data.
X = df["YearsExperience"].values.reshape(-1, 1)

In [None]:
X[:5]  # peek at the first five rows of the feature array

In [None]:
X.shape  # dimensions of the feature array

In [None]:
# Target values the model should learn to predict.
# TODO: change the variables/features to match our data.
y = df["Salary"]

In [None]:
# Create the estimator and train it on the data. The following cells call
# model.predict() and read model.coef_ / model.intercept_, which are only
# available once the model has been fitted.
model = LinearRegression()
model.fit(X, y)

In [None]:
# Predict a target value for every row of X and report the result's shape.
y_pred = model.predict(X)
print(y_pred.shape)

In [None]:
# Plot the observed points and overlay the model's predictions as a red line.
plt.scatter(X, y)
plt.plot(X, y_pred, color='red')
plt.show()

In [None]:
# Learned parameters of the linear model: coefficient(s) first, then intercept.
print(model.coef_, model.intercept_, sep="\n")

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Generate a synthetic dataset with two cluster centers (fixed seed 42).
# NOTE: this rebinds X and y, replacing the regression data defined earlier.
from sklearn.datasets import make_blobs
X, y = make_blobs(centers=2, random_state=42)

# Show the first ten labels and the first ten samples.
print(f"Labels: {y[:10]}")
print(f"Data: {X[:10]}")

In [None]:
plt.scatter(X[:, 0], X[:, 1], c=y)  # first column vs. second column, coloured by label

In [None]:
# Split into training and test sets; the seed is fixed at 1 and the
# labels are passed as the stratification key.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
)

In [None]:
from sklearn.linear_model import LogisticRegression
# Build the (not yet fitted) logistic-regression classifier with a fixed seed.
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier  # display the estimator's configuration

In [None]:
# Fully spelled-out configuration of the classifier, kept for reference.
# The result is not assigned, so this cell does not change `classifier`.
# Fixed: penalty is 'l2' (letter L, not the number 12) and multi_class uses
# the valid 'auto' setting instead of the 'warn' placeholder.
# NOTE(review): written against the scikit-learn version pinned in this
# notebook - confirm `multi_class` is still accepted if that pin changes.
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
   intercept_scaling=1, max_iter=100, multi_class='auto', penalty='l2',
   random_state=1, solver='lbfgs', tol=0.0001, warm_start=False)

In [None]:
classifier.fit(X_train, y_train)  # train on the training split

In [None]:
# Predict labels for the test split and show them next to the true labels.
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

In [None]:
from sklearn.metrics import accuracy_score
# Score the test-set predictions against the true test labels.
accuracy_score(y_test, predictions)

In [None]:
import numpy as np
# A single new sample (one row, two values) to classify.
new_data = np.array([[-2, 6]])
# Plot the existing samples coloured by label, then overlay the new point in red.
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.scatter(new_data[0, 0], new_data[0, 1], c="r", marker="o", s=100)
plt.show()

In [None]:
# Classify the new point.
# NOTE(review): this rebinds `predictions`, so the test-set predictions
# computed earlier are no longer available under that name.
predictions = classifier.predict(new_data)
print("Classes are either 0 (purple) or 1 (yellow)")
print(f"The new point was classified as: {predictions}")

In [None]:
# Evaluate the logistic-regression classifier on the held-out test split.
# The predictions are recomputed here because `y_pred` still holds the
# linear-regression output from earlier and `predictions` was overwritten
# with the single new-point prediction, so neither lines up with y_test.
from sklearn.metrics import classification_report, confusion_matrix

y_pred = classifier.predict(X_test)

# print() is required: only a cell's last expression is displayed automatically.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

## Model 2 - Random Forest (based on challenge 17 homework; the template below uses a classifier, BalancedRandomForestClassifier)

In [None]:
pip install imbalanced-learn==0.9.0

In [None]:
pip install scikit-learn==1.0

In [None]:
# Fit a BalancedRandomForestClassifier on the training split.
# An AttributeError seen here earlier was resolved by pip-installing the
# two pinned packages in the cells above (per Ask BCS).
from imblearn.ensemble import BalancedRandomForestClassifier

rf_model = BalancedRandomForestClassifier(random_state=1, n_estimators=100)
rf_model.fit(X_train, y_train)


In [None]:
# Calculate predictions for the test split with the random-forest model.
# This rebinds y_pred.

y_pred = rf_model.predict(X_test)
y_pred

In [None]:
# Calculate the balanced accuracy score.
# balanced_accuracy_score is imported here explicitly; it was previously
# called without being brought into scope.
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix

balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix of true test labels vs. current predictions.

from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report.
# The report function lives in imbalanced-learn and must be imported first.
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# List the features sorted in descending order by feature importance.
# X may be a DataFrame (which has .columns) or a plain NumPy array such as
# the one returned by make_blobs; arrays get positional placeholder names.
feature_names = getattr(X, "columns", None)
if feature_names is None:
    feature_names = [
        f"feature_{index}" for index in range(len(rf_model.feature_importances_))
    ]

importances = sorted(zip(rf_model.feature_importances_, feature_names), reverse=True)
for score, name in importances:
    print(f'{name}:  {score*100:.1f}%')

## Model 3 - XGBoost Regression (Gradient Boosting) (we didn't cover gradient boosting in class; maybe we should only have two models?)

In [None]:
## Reference for random search on xgboost
## https://gist.github.com/wrwr/3f6b66bf4ee01bf48be965f60d14454d
# Both names below were used without being imported anywhere in the notebook.
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

# Candidate values the randomized search samples from.
tuned_params = {
    'max_depth': [1, 2, 3, 4, 5],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300, 400, 500],
    'reg_lambda': [0.001, 0.1, 1.0, 10.0, 100.0],
}

# 20 sampled combinations, 5-fold cross-validation, scored by negative
# mean absolute error, n_jobs=-1.
# NOTE: this rebinds `model`.
model = RandomizedSearchCV(
    XGBRegressor(),
    tuned_params,
    n_iter=20,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
)
model.fit(X_train, y_train)

In [None]:
model.best_estimator_  # display the best estimator found by the search


In [None]:
## Predict Train results (predictions on the data the model was fitted on)
y_train_pred = model.predict(X_train)


In [None]:
## Predict Test results (rebinds y_pred with predictions for the test split)
y_pred = model.predict(X_test)


In [None]:

# Training-set metrics. The helper names used below (sqrt, mse, mae and the
# R-squared function) were never imported, so they are brought in here.
from math import sqrt

from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score

print("Train Results for XGBoost Regression:")
print("*******************************")
# y_train is passed as-is rather than via .values, so it works whether it is
# a pandas Series or a NumPy array (as returned by make_blobs).
print("Root mean squared error: ", sqrt(mse(y_train, y_train_pred)))
print("R-squared: ", r2_score(y_train, y_train_pred))
print("Mean Absolute Error: ", mae(y_train, y_train_pred))


In [None]:

# Test-set metrics. sqrt, mse, mae and r2_score were never imported in the
# notebook, so they are brought in here.
from math import sqrt

from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score

print("Test Results for XGBoost Regression:")
print("*******************************")
print("Root mean squared error: ", sqrt(mse(y_test, y_pred)))
print("R-squared: ", r2_score(y_test, y_pred))
print("Mean Absolute Error: ", mae(y_test, y_pred))


In [None]:
# Feature importance, again, not sure if we'll need this part...

## Building the model again with the best hyperparameters
# XGBRegressor is imported here so the cell does not rely on a name that
# was never brought into scope.
from xgboost import XGBRegressor

# NOTE(review): these values are hard-coded - confirm they match the
# parameters actually selected by the randomized search.
model = XGBRegressor(max_depth=2, learning_rate=0.05, n_estimators=400, reg_lambda=0.001)
model.fit(X_train, y_train)



In [None]:
## Function to include figsize parameter
## Reference: https://stackoverflow.com/questions/40081888/xgboost-plot-importance-figure-size
def my_plot_importance(booster, figsize, **kwargs):
    """Plot XGBoost feature importances on a figure of the requested size.

    Args:
        booster: Model object handed to ``xgboost.plot_importance``.
        figsize: Figure size passed to ``matplotlib.pyplot.subplots``.
        **kwargs: Extra keyword arguments forwarded to ``plot_importance``.

    Returns:
        Whatever ``xgboost.plot_importance`` returns.
    """
    from matplotlib import pyplot as plt
    from xgboost import plot_importance

    # Create the axes ourselves so the caller controls the figure size.
    _, axes = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    return plot_importance(booster=booster, ax=axes, **kwargs)

In [None]:
# Not sure if we want a feature importance horizontal bar plot.
# Draws the importance plot for `model` with figsize (10, 10).
my_plot_importance(model, (10,10))

