## Preliminary code

In [None]:
import pandas as pd

In [None]:
pip install tldextract

## Load unprocessed file

In [None]:
# Mount Google Drive so files under MyDrive are reachable at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Load the not-yet-preprocessed export; bytes that are not valid UTF-8 are
# replaced instead of raising a decoding error.
data = pd.read_csv(
    '/content/drive/MyDrive/Machine Learning/Google-Playstore_cleaned_V3.csv',
    encoding='utf-8',
    encoding_errors='replace',
)


## Load processed file

In [None]:
# Mount Google Drive so files under MyDrive are reachable at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Load the frame written by the preprocessing section (skips re-running it);
# bytes that are not valid UTF-8 are replaced instead of raising.
data = pd.read_csv(
    '/content/drive/MyDrive/Machine Learning/Google-Playstore_preprocessed.csv',
    encoding='utf-8',
    encoding_errors='replace',
)


Mounted at /content/drive


In [None]:
from sklearn.model_selection import train_test_split

# Draw a stratified subsample: the proportions of each 'Minimum Installs' class
# are preserved and the remainder of the split is discarded.
sample_size = 1000000  # Number of rows to keep (must be smaller than len(data))
sampled_data, _unused_rest = train_test_split(
    data,
    train_size=sample_size,
    stratify=data['Minimum Installs'],
    random_state=42,
)
del _unused_rest  # only the sampled part is needed

# The split already returns a DataFrame with data's columns, so it is written as-is.
# The file name is derived from sample_size so the two cannot drift apart.
sampled_data.to_csv(f'/content/drive/MyDrive/Machine Learning/sampled_data_{sample_size}.csv', index=False)

## Preprocess

### General

In [None]:
# Drop rows that contain any missing value.
data = data.dropna(axis=0)

# Keep only rows whose 'Currency' is 'USD'; the column is then constant, so drop it.
column = 'Currency'
data = data[data[column].isin(['USD'])].drop(columns=column)

# Dropping 'Installs' and 'Maximum Installs' is currently disabled.
# data.drop(inplace=True, columns=['Installs', 'Maximum Installs'], axis=1)

# Keep rows whose 'Minimum Android' value contains the literal text 'and up'.
# .copy() makes `data` an independent frame, so later cells can add columns to
# it without triggering pandas' SettingWithCopyWarning.
data = data[data['Minimum Android'].str.contains('and up', regex=False)].copy()

NameError: ignored

### Dates

In [None]:
updated = 'Updated'
last_updated = 'Last Updated'
released = 'Released'

# Turn the 'Last Updated' string into the boolean 'Updated' column.
# An app counts as updated when its last-update date differs from its release
# date. (The previous code set the flag when the two dates were equal, i.e.
# exactly for the apps that were never updated.)
data[updated] = data[last_updated] != data[released]

data = data.drop(columns=last_updated)

# Disabled: turn the 'Released' column into 'Month', 'Year' and 'MonthYear' columns
#released_split=data[released].str.split(expand=True)

#data['Month']     = released_split[0]
#data['Year']      = released_split[2]
#data['MonthYear'] = released_split[0] + released_split[2]

#data.drop(inplace=True, columns=released, axis=1)

### URLs

In [None]:
# developer email => TLD + SLD (e.g.: com+ gmail)
#	developer website => TLD + SLD (e.g.: io + github)
import tldextract

def extract_domains(elem):
    """Return the ``(domain, suffix)`` pair tldextract finds in a URL or e-mail address.

    When ``elem`` contains an "@", only the text after the first "@" is handed
    to ``tldextract.extract``; otherwise ``elem`` is passed through unchanged.
    """
    host = elem.split('@', 1)[1] if "@" in elem else elem
    parts = tldextract.extract(host)
    return parts.domain, parts.suffix

# extract_domains returns (domain, suffix): the registrable name (e.g. 'github')
# followed by the public suffix (e.g. 'io'), i.e. (SLD, TLD). The previous
# assignment stored them under the opposite column names.
# Mapping once per column and indexing into the resulting tuples avoids
# constructing a pd.Series for every single row.
website_parts = data['Developer Website'].map(extract_domains)
data['TLD'] = website_parts.str[1]
data['SLD'] = website_parts.str[0]

email_parts = data['Developer Email'].map(extract_domains)
data['mailTLD'] = email_parts.str[1]
data['mailSLD'] = email_parts.str[0]

In [None]:
# The raw URL / e-mail strings are no longer needed once their domain parts are extracted.
data = data.drop(columns=['Developer Website', 'Developer Email'])

### Strings

In [None]:
def total_length(text) -> int:
    """Return the length of ``text`` as reported by the built-in ``len``."""
    return len(text)

# Replace the app name by its length, then discard the name and id columns.
data['TotalLength'] = data['App Name'].apply(total_length)
data = data.drop(columns=['App Name', 'App Id'])

### Factorization

In [None]:
# Replace each listed column by integer codes: pd.factorize assigns one code
# per distinct value, numbered in order of first appearance.
columns_to_encode = [
    'Category', 'Content Rating', 'Minimum Android', 'Size',
    'Released_Month', 'Released_Year', 'Released', 'Rating Count',
    'TLD', 'SLD', 'mailTLD', 'mailSLD',
]
for column_name in columns_to_encode:
    data[column_name] = pd.factorize(data[column_name])[0]

In [None]:
# Show the first rows. `print(data.head)` never called the method, so it
# printed the bound method's repr instead of a five-row preview.
data.head()

In [None]:
# Persist the preprocessed frame (row index omitted) so it can be reloaded directly.
data.to_csv(
    '/content/drive/MyDrive/Machine Learning/Google-Playstore_preprocessed.csv',
    index=False,
)

## Dummy Test

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

# 80% train / 10% validation / 10% test; seeded so the baseline is reproducible.
train_data = data.sample(frac=0.80, random_state=42)
test_data = data.drop(train_data.index)
val_data = test_data.sample(frac=0.5, random_state=42)
test_data = test_data.drop(val_data.index)

train_label = train_data['Minimum Installs']
val_label = val_data['Minimum Installs']
test_label = test_data['Minimum Installs']

# The label column must not be part of the feature matrix handed to the model.
train_features = train_data.drop(columns='Minimum Installs')
test_features = test_data.drop(columns='Minimum Installs')

# Baseline: always predict the most frequent class seen during training.
dummy_classifier = DummyClassifier(strategy="most_frequent")
dummy_classifier.fit(train_features, train_label)

y_pred = dummy_classifier.predict(test_features)
accuracy = accuracy_score(test_label, y_pred)

print(f"Accuracy: {accuracy:.5f}")

Accuracy: 0.19304






## CART

### No parameter tuning

In [None]:
from numpy                   import median
from sklearn                 import tree
from sklearn.model_selection import train_test_split


# Split into training and test data (stratified on the target; the fixed seed
# makes the split itself reproducible)
train_data, test_data, train_targets, test_targets = train_test_split(
    data.drop('Minimum Installs', axis=1),
    data['Minimum Installs'],
    test_size=0.2,
    stratify=data['Minimum Installs'],
    random_state=42,
)
score_train = 0
score_test  = 0
trials = 3

score_list_train = []
score_list_test  = []

clf = tree.DecisionTreeClassifier()

for i in range(0, trials):
    # The tree itself is left unseeded, so each refit may differ slightly.
    clf = clf.fit(train_data, train_targets)

    # Score each split once and reuse the value for both the list and the sum;
    # scoring the full frames twice per trial doubled the evaluation time.
    trial_score_train = clf.score(train_data, train_targets)
    trial_score_test  = clf.score(test_data, test_targets)

    score_list_train.append(trial_score_train)
    score_list_test.append(trial_score_test)
    score_train += trial_score_train
    score_test  += trial_score_test
    print(str(i) + " ", end='')


0 1 2 

In [None]:
# "avg" is the accumulated score divided by the number of trials; "med" is the
# median of the per-trial scores. (The two labels used to be swapped, and the
# fields ran together without separators.)
print('train avg ' + str(score_train/trials) + ' train med ' + str(median(score_list_train)) +
      ' test avg ' + str(score_test/trials) + ' test med ' + str(median(score_list_test)))

train avg 0.9808795702992147train med 0.9808795702992147test avg 0.33840631827495893test med 0.3385861231357968


### With Tuning

In [None]:
from sklearn                 import tree
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

# Split into training and test data (stratified 80/20). The fixed seed keeps the
# split identical across runs, so the tuning cells below stay comparable.
train_data, test_data, train_targets, test_targets = train_test_split(
    data.drop('Minimum Installs', axis=1),
    data['Minimum Installs'],
    test_size=0.2,
    stratify=data['Minimum Installs'],
    random_state=42,
)



With the following test we find that a max_depth of 13 is the best choice.

In [None]:
trials = 10
first_depth = 10

# Fit one tree per max_depth in first_depth .. first_depth + trials - 1 and
# report train/test accuracy for each (13 was picked as the best depth).
for depth in range(first_depth, first_depth + trials):
    clf = tree.DecisionTreeClassifier(max_depth=depth).fit(train_data, train_targets)
    score_train = clf.score(train_data, train_targets)
    score_test = clf.score(test_data, test_targets)
    print(f"{depth} score train: {score_train}\tscore_test: {score_test}")


10 score train: 0.44947173706355864	score_test: 0.4458176710860214
11 score train: 0.45412038305317914	score_test: 0.447720888575267
12 score train: 0.46018626813213653	score_test: 0.44883364318573504
13 score train: 0.4674166470888077	score_test: 0.44856223962220626
14 score train: 0.4770455611410174	score_test: 0.4478973008915607
15 score train: 0.4894572169359647	score_test: 0.44615692554043235
16 score train: 0.5048661934620412	score_test: 0.4429781113026014
17 score train: 0.5243725260866136	score_test: 0.43824890420811224
18 score train: 0.548044914011499	score_test: 0.4316979006934361
19 score train: 0.5755067838341449	score_test: 0.4246210527744229


Try with RandomizedSearchCV

In [None]:
# Candidate values the randomized search samples from
param_dist = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 13, 15, 20],  # None for no maximum depth
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 10 sampled parameter combinations, each scored with 5-fold cross-validation
clf = tree.DecisionTreeClassifier()
random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    random_state=42,
)
random_search.fit(train_data, train_targets)

# Winning combination and the estimator fitted with it
best_params = random_search.best_params_
best_clf = random_search.best_estimator_

# Score the selected model on the held-out test split
accuracy = best_clf.score(test_data, test_targets)

print(f"Best Hyperparameters: {best_params}")
print(f"Accuracy on Test Data: {accuracy}")

Best Hyperparameters: {'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 13, 'criterion': 'entropy'}
Accuracy on Test Data: 0.44982765873715924


In [None]:
# Accuracy of the selected tree on its own training split (compare with the test accuracy).
train_accuracy = best_clf.score(train_data, train_targets)
print(f"Train Accuracy: {train_accuracy}")

Train Accuracy: 0.4625398518980911


Try with GridSearchCV

In [None]:
# Exhaustive grid: every combination of these values is evaluated
# (2 * 6 * 3 * 3 = 108 candidates, 5 folds each).
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 13, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

clf = tree.DecisionTreeClassifier()
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
grid_search.fit(train_data, train_targets)

# Winning combination and the estimator fitted with it
best_params = grid_search.best_params_
best_clf = grid_search.best_estimator_

# Score the selected model on the held-out test split
accuracy = best_clf.score(test_data, test_targets)

print(f"Best Hyperparameters: {best_params}")
print(f"Accuracy on Test Data: {accuracy}")

In [None]:
# Accuracy of the selected tree on its own training split (compare with the test accuracy).
train_accuracy = best_clf.score(train_data, train_targets)
print(f"Train Accuracy: {train_accuracy}")

A more sophisticated approach with RandomizedSearchCV

In [None]:
import numpy as np
import pandas as pd
# RandomizedSearchCV is used below; importing it here keeps this cell runnable
# without relying on an import made by an earlier cell.
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Split the data into training (80%), validation (10%), and test (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(
  data.drop('Minimum Installs', axis=1),
  data['Minimum Installs'],
  test_size=0.2,
  random_state=42,
  stratify=data['Minimum Installs']  # Stratified split for class balance
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp  # Stratified split for class balance
)

# 1. Data Preprocessing
# Standardize features (useful for some models, not needed for Decision Trees).
# The scaler is fitted on the training split only and then applied to the others.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# 2. Feature Selection
# Use SelectKBest with ANOVA F-statistic for feature selection (adjust k as needed)
selector = SelectKBest(score_func=f_classif, k=10)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_val_selected = selector.transform(X_val_scaled)  # NOTE(review): computed but never scored below
X_test_selected = selector.transform(X_test_scaled)

# 3. Decision Tree Model with Cross-Validation
clf = DecisionTreeClassifier(random_state=42)

# One seeded splitter shared by the plain CV run and the search, so both use
# the same five stratified folds.
cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform 5-fold cross-validation on the training data (default, unconstrained tree)
cv_scores = cross_val_score(clf, X_train_selected, y_train, cv=cv_folds)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", np.mean(cv_scores))

# 4. Hyperparameter Tuning (RandomizedSearchCV)
# max_depth is pinned to 13; the search samples 10 of the 18 remaining combinations.
param_dist = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [13],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

random_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=10,
    cv=cv_folds,
    random_state=42,
    n_jobs=-1  # Use all available CPU cores
)

random_search.fit(X_train_selected, y_train)

print("Best Hyperparameters:", random_search.best_params_)

# 5. Final Model Evaluation on Test Data
best_clf = random_search.best_estimator_
y_pred = best_clf.predict(X_test_selected)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)

Cross-Validation Scores: [0.32373808 0.32639699 0.32522656 0.32532971 0.32472329]
Mean CV Score: 0.3250829266259918
Best Hyperparameters: {'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 13, 'criterion': 'entropy'}
Test Accuracy: 0.44179750580125116


## Random Forest

In [None]:
#!/usr/bin/python3

from pandas                  import read_csv
from sklearn                 import tree
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble        import RandomForestClassifier
from sklearn.metrics         import accuracy_score

#PUT THE RIGHT DATASET
#data = read_csv("./preprocessed.csv")

# Split the data into training and testing sets
X_train,X_test,y_train,y_test = train_test_split(data.drop('Minimum Installs', axis=1),
                                                 data['Minimum Installs'],
                                                 test_size=0.2,
                                                 random_state=42,
                                                 stratify=data['Minimum Installs']  # Stratified split for class balance
                                                 )

# Stage 1: choose max_depth with every other setting at its default
rf_model = RandomForestClassifier(random_state=42, verbose=1, n_jobs=-1)

param_grid = {
    'max_depth': [3, 4, 5, 6, 10, 12, 15],
}

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best max_depth from Grid Search
best_max_depth = grid_search.best_params_['max_depth']
print("Best max_depth:", best_max_depth)

# Stage 2: with max_depth fixed, tune forest size and leaf/split constraints
rf_model = RandomForestClassifier(max_depth=best_max_depth, random_state=42, verbose=1, n_jobs=-1)

param_grid = {
    'n_estimators': [30, 50, 100, 500],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# (was `GridSearchCV(max estimator=...)`, which is a syntax error)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best parameters from Grid Search
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Create a Random Forest model with the best parameters.
# best_params only holds the stage-2 settings, so the stage-1 max_depth has to
# be passed explicitly; otherwise the final forest is grown with no depth limit.
best_rf_model = RandomForestClassifier(
    max_depth=best_max_depth,
    random_state=42,
    **best_params,
    verbose=1,
    n_jobs=-1,
)

# Train the model with the best parameters on the entire training set
best_rf_model.fit(X_train, y_train)

# Test
y_pred = best_rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Train: store the score under the name that is printed (it used to be written
# to `test_accuracy` while a stale `train_accuracy` from another cell was shown)
y_train_pred = best_rf_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Train accuracy:", train_accuracy)


## XGBoost

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc


# Function to determine the value mapping
def determine_mapping(values):
    """Map each distinct value in ``values`` to its rank in ascending order.

    Args:
        values: pandas Series (or any object exposing ``.unique()``) holding
            the labels to encode.

    Returns:
        dict: ``{value: index}`` where the indices 0..k-1 follow the sorted
        order of the distinct values.
    """
    # Work on the argument itself; the previous body ignored it and read a
    # global frame, so the result did not depend on what the caller passed.
    unique_values = sorted(values.unique())
    mapping = {val: idx for idx, val in enumerate(unique_values)}
    return mapping

# Get the mapping dictionary (sorted distinct install counts -> 0..k-1)
# NOTE(review): `smaller_dataset` is not defined in any cell visible here; it is
# presumably the stratified sample written earlier. Confirm it is loaded first.
value_mapping = determine_mapping(smaller_dataset['Minimum Installs'])

# The mapping is built from this column's own distinct values, so every entry
# has a key; one dictionary lookup per row (map) encodes the whole column.
smaller_dataset['Minimum Installs'] = smaller_dataset['Minimum Installs'].map(value_mapping)

# Define the target variable
target = 'Minimum Installs'

# List of predictors (exclude the target column)
predictors = [col for col in smaller_dataset.columns if col != 'Minimum Installs']


# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(smaller_dataset[predictors],
                                                    smaller_dataset['Minimum Installs'],
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=smaller_dataset['Minimum Installs'])

# Remove training rows whose label is missing
X_train = X_train[~y_train.isnull()]
y_train = y_train.dropna()


def _run_grid_search(features, labels, fixed_params, param_grid):
    """Grid-search ``param_grid`` around an XGBClassifier built from ``fixed_params``.

    Every stage shares the same device / thread / seed settings and the same
    5-fold, accuracy-scored search; only the fixed and searched parameters vary.

    Args:
        features: training feature matrix passed to ``fit``.
        labels: training labels passed to ``fit``.
        fixed_params: keyword arguments held constant for this stage.
        param_grid: parameter grid explored by the search.

    Returns:
        The fitted GridSearchCV object (after printing its best parameters and score).
    """
    estimator = XGBClassifier(device="cuda", nthread=64, seed=27, **fixed_params)
    search = GridSearchCV(estimator=estimator, param_grid=param_grid, scoring='accuracy', cv=5, verbose=2)
    search.fit(features, labels)
    print("Best Parameters:", search.best_params_)
    print("Best Accuracy Score:", search.best_score_)
    return search


# Stage 1: tree shape (max_depth, min_child_weight)
param_grid = {
    'max_depth':[3,4,5,6],
    'min_child_weight':[4,5,6,8,10,12]
}
grid_search = _run_grid_search(
    X_train, y_train,
    dict(learning_rate=0.2, n_estimators=70, gamma=0, subsample=0.8, colsample_bytree=0.8),
    param_grid,
)
best_params_1 = grid_search.best_params_
best_score_1 = grid_search.best_score_

# Stage 2: gamma, with the stage-1 tree shape fixed
param_grid = {
    'gamma':[i/10.0 for i in range(0,5)]
}
grid_search = _run_grid_search(
    X_train, y_train,
    dict(learning_rate=0.2, n_estimators=70, subsample=0.8, colsample_bytree=0.8, **best_params_1),
    param_grid,
)
best_params_2 = grid_search.best_params_
best_score_2 = grid_search.best_score_

# Stage 3: row / column subsampling (lower learning rate, more trees from here on)
param_grid = {
    'subsample':[i/10.0 for i in range(6,10)],
    'colsample_bytree':[i/10.0 for i in range(6,10)]
}
grid_search = _run_grid_search(
    X_train, y_train,
    dict(learning_rate=0.1, n_estimators=100, subsample=0.8, colsample_bytree=0.8,
         **best_params_1, **best_params_2),
    param_grid,
)
best_params_3 = grid_search.best_params_
best_score_3 = grid_search.best_score_

# Stage 4: L1 regularisation strength (reg_alpha)
param_grid = {
    'reg_alpha':[1e-5, 1e-2,0.05, 0.1, 0.5, 1, 100]
}
grid_search = _run_grid_search(
    X_train, y_train,
    dict(learning_rate=0.1, n_estimators=100, **best_params_1, **best_params_2, **best_params_3),
    param_grid,
)
best_params_4 = grid_search.best_params_
best_score_4 = grid_search.best_score_


# Function to determine the value mapping
def determine_mapping(values):
    """Map each distinct value in ``values`` to its rank in ascending order.

    Args:
        values: pandas Series (or any object exposing ``.unique()``) holding
            the labels to encode.

    Returns:
        dict: ``{value: index}`` where the indices 0..k-1 follow the sorted
        order of the distinct values.
    """
    # Work on the argument itself; the previous body ignored it and read a
    # global frame, so the result did not depend on what the caller passed.
    unique_values = sorted(values.unique())
    mapping = {val: idx for idx, val in enumerate(unique_values)}
    return mapping

# Get the mapping dictionary (sorted distinct install counts -> 0..k-1)
# NOTE(review): `dataset` is not defined in any cell visible here; it is
# presumably the full preprocessed frame. Confirm it is loaded first.
value_mapping = determine_mapping(dataset['Minimum Installs'])

# The mapping is built from this column's own distinct values, so every entry
# has a key; one dictionary lookup per row (map) encodes the whole column.
dataset['Minimum Installs'] = dataset['Minimum Installs'].map(value_mapping)

# Define the target variable
target = 'Minimum Installs'

# List of predictors (exclude the target column)
predictors = [col for col in dataset.columns if col != 'Minimum Installs']


# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(dataset[predictors],
                                                    dataset['Minimum Installs'],
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=dataset['Minimum Installs'])

# Settings found by the four tuning stages, merged once so both final models
# are guaranteed to use the same values.
tuned_params = {**best_params_1, **best_params_2, **best_params_3, **best_params_4}

# Train the final model with the best parameters
final_xgb_model = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.1,
    device="cuda",
    nthread=64,
    seed=27,
    **tuned_params,
)
final_xgb_model.fit(X_train, y_train)

# Evaluate the final model on the test data
y_pred = final_xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on Test Data:", accuracy)

# Same settings with a 10x smaller learning rate and 5x more trees
final_final_xgb_model = XGBClassifier(
    n_estimators=5000,
    learning_rate=0.01,
    device="cuda",
    nthread=64,
    seed=27,
    **tuned_params,
)
final_final_xgb_model.fit(X_train, y_train)

# Evaluate the final model on the test data
y_pred = final_final_xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on Test Data:", accuracy)

# Confusion Matrix
confusion_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", confusion_mat)

# Classification Report
class_report = classification_report(y_test, y_pred, output_dict=True)
print("Classification Report:\n", class_report)

# ROC Curve and AUC for class 1 against all other classes.
# roc_curve is a binary metric: when the target has more than two classes it
# rejects y_test unless the positive class is named, so pos_label=1 is passed
# to match the probability column taken from predict_proba.
y_probs = final_final_xgb_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_probs, pos_label=1)
roc_auc = auc(fpr, tpr)

print("False Positive Rate (FPR):", fpr)
print("True Positive Rate (TPR):", tpr)
print("Thresholds:", thresholds)
print("Area Under the Curve (AUC):", roc_auc)

In [None]:
# Train the final model with the best parameters
# (this cell repeats the last model of the previous cell on an explicit GPU ordinal)
final_final_xgb_model = XGBClassifier(
    n_estimators=5000,
    max_depth=best_params_1['max_depth'],
    min_child_weight=best_params_1['min_child_weight'],
    gamma=best_params_2['gamma'],
    subsample=best_params_3['subsample'],
    colsample_bytree=best_params_3['colsample_bytree'],
    reg_alpha=best_params_4['reg_alpha'],
    learning_rate=0.01,
    device="cuda:0",  # was `device=cuda:0` without quotes or a trailing comma (syntax error)
    nthread=64,
    seed=27
)
final_final_xgb_model.fit(X_train, y_train)

# Evaluate the final model on the test data
y_pred = final_final_xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on Test Data:", accuracy)

# Confusion Matrix
confusion_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", confusion_mat)

# Classification Report
class_report = classification_report(y_test, y_pred, output_dict=True)
print("Classification Report:\n", class_report)

# ROC Curve and AUC for class 1 against all other classes.
# roc_curve is a binary metric: when the target has more than two classes it
# rejects y_test unless the positive class is named, so pos_label=1 is passed
# to match the probability column taken from predict_proba.
y_probs = final_final_xgb_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_probs, pos_label=1)
roc_auc = auc(fpr, tpr)

print("False Positive Rate (FPR):", fpr)
print("True Positive Rate (TPR):", tpr)
print("Thresholds:", thresholds)
print("Area Under the Curve (AUC):", roc_auc)

/cluster/datastore/paoloc/.local/lib/python3.8/site-packages/xgboost/core.py:160: UserWarning: [16:34:31] WARNING: /workspace/src/common/error_msg.cc:45: `gpu_id` is deprecated since2.0.0, use `device` instead. E.g. device=cpu/cuda/cuda:0
  warnings.warn(smsg, UserWarning)




  /cluster/datastore/paoloc/.local/lib/python3.8/site-packages/xgboost/core.py:160: UserWarning: [16:34:31] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.

    E.g. tree_method = "hist", device = "cuda"

  warnings.warn(smsg, UserWarning)
/cluster/datastore/paoloc/.local/lib/python3.8/site-packages/xgboost/core.py:160: UserWarning: [16:34:37] WARNING: /workspace/src/common/error_msg.cc:27: The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` parameter to CUDA instead.

    E.g. tree_method = "hist", device = "cuda"

  warnings.warn(smsg, UserWarning)
/cluster/datastore/paoloc/.local/lib/python3.8/site-packages/xgboost/core.py:160: UserWarning: [16:34:37] WARNING: /workspace/src/common/error_msg.cc:58: Falling back to prediction using DMatrix due to mismatched devices. This might lead to higher memory usage and slower performance. XGBoost is running on: cuda:0, while the input data is on: cpu.
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.

This warning will only be shown once.