In [None]:
!apt-get update && apt-get install -y build-essential
!pip install xgboost
!pip install shap

In [None]:
import pandas as pd
import xgboost as xgb

# Utility Functions

## Data Preparation

In [None]:
from sklearn.model_selection import train_test_split
def prepare_data(data, target, test_size=0.2, random_state=42):
    """Split a DataFrame into train/test features and labels.

    Args:
        data: DataFrame holding the feature columns and the target column.
        target: Name of the column to predict; it is removed from the features.
        test_size: Fraction of rows held out for testing (0.2 matches the
            previously hard-coded value).
        random_state: Seed passed to ``train_test_split``. A fixed default makes
            the split, and every metric computed from it, reproducible across
            runs; pass ``None`` for a different random split on each call.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X = data.drop(target, axis=1)
    y = data[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

## Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, plot_confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt

def evaluate(model, X_test, y_test):
    """Print classification metrics for a fitted model and plot its confusion matrix.

    Args:
        model: Fitted classifier with ``predict``; ``predict_proba`` is optional.
        X_test: Feature matrix to score the model on.
        y_test: True labels for ``X_test``.

    Prints accuracy and support-weighted precision, recall and F1. ROC AUC is
    printed when the model exposes ``predict_proba``; if computing it fails,
    the reason is printed instead of being silently dropped. Returns ``None``.
    """
    pred = model.predict(X_test)

    # accuracy = correct_predictions / all_predictions
    acc = accuracy_score(y_test, pred)

    # true_positives / (true_positives + false_positives)
    # how many positive predictions were true
    prec = precision_score(y_test, pred, average='weighted')

    # true_positives / (true_positives + false_negatives)
    # how many positives out of all were identified
    rec = recall_score(y_test, pred, average='weighted')

    # harmonic mean of precision and recall
    f1 = f1_score(y_test, pred, average='weighted')

    print(f"accuracy: {acc}")
    print(f"precision: {prec}")
    print(f"recall: {rec}")
    print(f"f1: {f1}")

    # Some estimators do not provide predict_proba; skip ROC AUC for those
    # rather than treating it as an error.
    if hasattr(model, "predict_proba"):
        try:
            prob = model.predict_proba(X_test)
            if prob.ndim == 2 and prob.shape[1] == 2:
                # Binary target: roc_auc_score expects a 1-D score, so pass the
                # probability column of the second class only. Passing the full
                # two-column matrix raises, which previously went unnoticed.
                roc_auc = roc_auc_score(y_test, prob[:, 1])
            else:
                roc_auc = roc_auc_score(y_test, prob, multi_class='ovo')
            print(f"roc_auc: {roc_auc}")
        except Exception as err:
            # Still best-effort (the other metrics and the plot are produced),
            # but report why the score is missing.
            print(f"roc_auc: not available ({err})")

    fig, ax = plt.subplots(figsize=(10, 10))
    # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2 in favour
    # of ConfusionMatrixDisplay.from_estimator - confirm the installed version.
    plot_confusion_matrix(model, X_test, y_test, xticks_rotation='vertical', ax=ax)
    

# Binary Classification

## Data Preparation

In [None]:
# Read the heart dataset from 'heart.csv' (path relative to the working directory).
heart_data = pd.read_csv('heart.csv')
# Bare last expression: the notebook renders the DataFrame as the cell output.
heart_data

In [None]:
# Split the heart data into train/test sets, predicting the 'target' column.
X_train, X_test, y_train, y_test = prepare_data(heart_data, 'target')

## Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression with default settings on the training split
# (fit returns the estimator itself), then report its test-split metrics.
model = LogisticRegression().fit(X_train, y_train)
evaluate(model, X_test, y_test)

In [None]:
# Count the two kinds of misclassification on the test split. The code treats
# label 1 as the positive class and label 0 as the negative class.
pred = model.predict(X_test)
actual = y_test.to_numpy()

# Element-wise comparisons replace the explicit Python loop; summing the
# boolean mask counts the matching rows.
false_positives = int(((actual == 0) & (pred == 1)).sum())
false_negatives = int(((actual == 1) & (pred == 0)).sum())

print(f"False Positives: {false_positives}")
print(f"False Negatives: {false_negatives}")

In [None]:
from sklearn.metrics import roc_curve, plot_roc_curve
# Compute ROC curve points from column index 1 of predict_proba.
# NOTE: the return value is neither assigned nor the cell's last expression,
# so it is discarded.
roc_curve(y_test, model.predict_proba(X_test)[:,1])
# Draw the ROC curve for the fitted model on the test split.
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2 in favour of
# RocCurveDisplay.from_estimator - confirm the installed version.
plot_roc_curve(model, X_test, y_test)

# Classification (Crop Recommendation)

## Data Preparation

In [None]:
# Read the crop recommendation dataset from 'crops.csv' (relative path).
crop_data = pd.read_csv('crops.csv')
# Bare last expression: the notebook renders the DataFrame as the cell output.
crop_data

In [None]:
# Split the crop data into train/test sets, predicting the 'label' column.
# This rebinds X_train/X_test/y_train/y_test, replacing the heart-data split.
X_train, X_test, y_train, y_test = prepare_data(crop_data, 'label')

## Naive Bayes Model

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings: fit on the training split
# (fit returns the estimator), then report test-split metrics.
model = GaussianNB().fit(X_train, y_train)
evaluate(model, X_test, y_test)

## Stochastic Gradient Descent Model

In [None]:
from sklearn.linear_model import SGDClassifier
# Linear classifier trained with SGD, default settings: fit on the training
# split (fit returns the estimator), then report test-split metrics.
model = SGDClassifier().fit(X_train, y_train)
evaluate(model, X_test, y_test)

## Perceptron Model

In [None]:
from sklearn.linear_model import Perceptron
# Perceptron with default settings: fit on the training split
# (fit returns the estimator), then report test-split metrics.
model = Perceptron().fit(X_train, y_train)
evaluate(model, X_test, y_test)

## Decision Tree Model

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default settings: fit on the training split
# (fit returns the estimator), then report test-split metrics.
model = DecisionTreeClassifier().fit(X_train, y_train)
evaluate(model, X_test, y_test)

## XGBoost Model

In [None]:
# Gradient-boosted trees with default settings: fit on the training split
# (fit returns the estimator), then report test-split metrics.
# NOTE(review): recent xgboost releases expect integer class labels
# 0..n_classes-1; if crop_data['label'] holds strings this fit may raise -
# confirm against the installed version and encode the labels if needed.
model = xgb.XGBClassifier().fit(X_train, y_train)
evaluate(model, X_test, y_test)

# Regression

In [None]:
# Read the house price dataset from 'house_prices.csv' (relative path).
house_data = pd.read_csv('house_prices.csv')
# Bare last expression: the notebook renders the DataFrame as the cell output.
house_data

## Categorical Encoding

In [None]:
# Remove the 'date' and 'country' columns.
# NOTE: house_data is rebound here, so re-running this cell without reloading
# the CSV fails: the dropped columns no longer exist.
house_data = house_data.drop(['date', 'country'], axis=1)

# Keep only the text after the first space of each street value (drops the
# leading space-delimited token; a value with no space becomes '').
house_data['street'] = house_data['street'].apply(lambda street: street.partition(' ')[2])

from sklearn.preprocessing import LabelEncoder

# One encoder per text column, kept under its own name so the integer codes
# can be mapped back to the original values later.
street_encoder = LabelEncoder()
city_encoder = LabelEncoder()
zip_encoder = LabelEncoder()

for column, encoder in (
    ('street', street_encoder),
    ('city', city_encoder),
    ('statezip', zip_encoder),
):
    house_data[column] = encoder.fit_transform(house_data[column])

house_data


## Split Dataset

In [None]:
# Split the encoded house data into train/test sets, predicting 'price'.
# This rebinds X_train/X_test/y_train/y_test, replacing the crop-data split.
X_train, X_test, y_train, y_test = prepare_data(house_data, 'price')

## Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Fit a linear regression on the training split and predict the test split.
model = LinearRegression()
model.fit(X_train, y_train)
pred = model.predict(X_test)

mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred)
# RMSE is the square root of MSE. Deriving it from `mse` avoids the
# `squared=False` keyword, which newer scikit-learn releases deprecate and
# remove, and avoids recomputing the error a second time.
rmse = mse ** 0.5

print(f"Mean absolute error: {mae}")
print(f"Mean squared error: {mse}")
print(f"Root mean squared error: {rmse}")

# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
# Candidate SGDClassifier settings: 3 penalties x 3 alphas x 3 initial
# learning rates x 2 schedules = 54 combinations.
param_grid = dict(
    penalty=['l1', 'l2', 'elasticnet'],
    alpha=[0.0001, 0.001, 0.01],
    eta0=[0.001, 0.01, 0.1],
    learning_rate=['constant', 'adaptive'],
)

# Exhaustive search with 5-fold cross-validation, ranked by weighted F1,
# using all available cores.
grid_cv = GridSearchCV(
    estimator=SGDClassifier(),
    param_grid=param_grid,
    scoring="f1_weighted",
    cv=5,
    n_jobs=-1,
)

In [None]:
# The search tunes a classifier, so rebuild the crop-data split first: when the
# notebook runs top to bottom, X_train/y_train would otherwise still hold the
# house-price regression split created in the Regression section.
X_train, X_test, y_train, y_test = prepare_data(crop_data, 'label')
grid_cv.fit(X_train, y_train)

In [None]:
# Mean cross-validated score of the best parameter combination
# (weighted F1, per the `scoring` passed to GridSearchCV).
grid_cv.best_score_

In [None]:
# Parameter combination from param_grid that achieved the best score.
grid_cv.best_params_

In [None]:
# Rebind `model` to the search's best estimator for the evaluation below.
model = grid_cv.best_estimator_

In [None]:
# Report metrics for the tuned estimator on the current X_test / y_test split.
evaluate(model, X_test, y_test)

# Feature Scaling

## Data Prep

In [None]:
# Re-split the crop data, predicting 'label'.
# NOTE: when cells run top to bottom, this split is replaced by the
# train_test_split call in the "Unscaled Data" cell before any model uses it.
X_train, X_test, y_train, y_test = prepare_data(crop_data, 'label')

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Separate the features from the 'label' column of the crop data.
X = crop_data.drop("label", axis=1)
y = crop_data["label"]

# Standardisation. The scaler is fitted on every row of X (before any
# train/test split) and the result is wrapped back into a DataFrame with the
# original column names.
standard_scaler = StandardScaler()
X_s_scaled = pd.DataFrame(standard_scaler.fit_transform(X), columns=X.columns)

# Min-max normalisation, fitted on every row of X in the same way.
minmax_scaler = MinMaxScaler()
X_mm_scaled = pd.DataFrame(minmax_scaler.fit_transform(X), columns=X.columns)

# Summary statistics of the three variants, shown with three decimals.
summaries = (
    ("Unscaled Data:", X),
    ("Standardized Data:", X_s_scaled),
    ("Normalized Data:", X_mm_scaled),
)
with pd.option_context('display.float_format', lambda x: '%.3f' % x):
    for heading, frame in summaries:
        print(heading)
        display(frame.describe())

## Unscaled Data

In [None]:
# Baseline: seeded 80/20 split of the raw (unscaled) features, default
# SGDClassifier fitted on the training rows, metrics reported on the test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = SGDClassifier().fit(X_train, y_train)
evaluate(model, X_test, y_test)

## Standardized Data

In [None]:
# Split the raw features first and let a pipeline fit the StandardScaler on the
# training rows only. Splitting data that was scaled on the full dataset lets
# test-set statistics leak into training and makes the scores optimistic.
from sklearn.pipeline import make_pipeline

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = make_pipeline(StandardScaler(), SGDClassifier())
model.fit(X_train, y_train)
# The pipeline applies the fitted scaler to X_test before predicting.
evaluate(model, X_test, y_test)

## Normalized Data

In [None]:
# Split the raw features first and let a pipeline fit the MinMaxScaler on the
# training rows only. Splitting data that was scaled on the full dataset lets
# test-set statistics leak into training and makes the scores optimistic.
from sklearn.pipeline import make_pipeline

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = make_pipeline(MinMaxScaler(), SGDClassifier())
model.fit(X_train, y_train)
# The pipeline applies the fitted scaler to X_test before predicting.
evaluate(model, X_test, y_test)

# SHAP Values

In [None]:
import shap

In [None]:
# Split the unscaled crop data again, predicting 'label', for the SHAP example.
X_train, X_test, y_train, y_test = prepare_data(crop_data, 'label')

In [None]:
# Train the model that will be explained: default XGBClassifier fitted on the
# training split (fit returns the estimator), with test-split metrics reported.
# NOTE(review): recent xgboost releases expect integer class labels
# 0..n_classes-1; if crop_data['label'] holds strings this fit may raise -
# confirm against the installed version and encode the labels if needed.
model = xgb.XGBClassifier().fit(X_train, y_train)
evaluate(model, X_test, y_test)

In [None]:
# Build a SHAP explainer for the fitted model, passing X_train as its second
# argument (presumably the background/masker data - confirm against shap docs).
explainer = shap.Explainer(model, X_train)
# Compute SHAP values for the training rows themselves.
shap_values = explainer(X_train)