In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# 1. Read the data
# Assume the data is stored in the file "atm_data.csv"
data = pd.read_csv("atm_transactions.csv")

# 2. Data preprocessing
# Remove columns that are not used for prediction or do not contain numerical values
data.drop(['atmId', 'atmName', 'atmCity', 'atmAddress', 'transactionTime'], axis=1, inplace=True)

# Convert the day column to numerical format (Monday=0, Sunday=6)
data['day'] = pd.Categorical(data['day'], categories=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], ordered=True)
data['day'] = data['day'].cat.codes

# Define the input variables (X) and the target variable (y)
X = data.drop(['totalBalance'], axis=1)  # Input variables
y = data['totalBalance']                # Target variable

# 3. Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Build the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 5. Make predictions on the test set
y_pred = model.predict(X_test)

# 6. Evaluate performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

# 7. Predict ATMs that are running low on cash (e.g., threshold < 10,000)
threshold = 10000
atm_test = X_test.copy()
atm_test['Predicted_Balance'] = y_pred
atm_test['Low_Balance'] = atm_test['Predicted_Balance'] < threshold

print("List of ATMs running low on cash:")
print(atm_test[atm_test['Low_Balance']])

Mean Squared Error (MSE): 133631624.73918009
R-squared (R2): 0.10244115015675015
Danh sách ATM sắp hết tiền:
       numberIncomeTransaction  numberOutcomeTransaction  totalIncome  \
26811                        3                        18           65   
15922                       56                        99         3755   
12320                       40                        94         2465   
12574                       30                        63         1870   
10772                       63                        97         4350   
27090                        0                        15            0   
12022                       55                        66         3615   
12893                       46                        54         2395   

       totalOutcome  totalNumberTransaction  day  Predicted_Balance  \
26811          1485                      21    1            9300.45   
15922          6740                     155    5            9670.20   
12320          6595 

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import randint

# 1. Load data
data = pd.read_csv("atm_transactions.csv")

# 2. Data preprocessing
data.drop(['atmId', 'atmName', 'atmCity', 'atmAddress', 'transactionTime'], axis=1, inplace=True)

# Convert the day to numeric form (Monday=0, Sunday=6)
data['day'] = pd.Categorical(data['day'], categories=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], ordered=True)
data['day'] = data['day'].cat.codes

# Define input features (X) and target variable (y)
X = data.drop(['totalBalance'], axis=1)  # Input features
y = data['totalBalance']                # Target variable

# 3. Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Build the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standardize data
    ('rf', RandomForestRegressor(random_state=42))  # Random Forest model
])

# 5. Configure RandomizedSearchCV to find the best parameters
param_dist = {
    'rf__n_estimators': [50, 100, 150, 200],
    'rf__max_depth': [10, 20, None],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__max_features': ['sqrt', 'log2', None],
    'rf__bootstrap': [True, False]
}
# Perform randomized search
random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42, n_jobs=-1)

# 6. Train the model with randomized parameter search
random_search.fit(X_train, y_train)

# 7. Retrieve the best model from RandomizedSearchCV
best_model = random_search.best_estimator_

# 8. Make predictions on the test set
y_pred = best_model.predict(X_test)

# 9. Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Best Parameters: {random_search.best_params_}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")

# 10. Evaluate the model using cross-validation
cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='neg_mean_squared_error')
print(f"Cross-validation MSE scores: {-cv_scores}")
print(f"Mean Cross-validation MSE: {-cv_scores.mean()}")


Best Parameters: {'rf__n_estimators': 50, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1, 'rf__max_features': None, 'rf__max_depth': 10, 'rf__bootstrap': True}
Mean Squared Error (MSE): 123604942.72742197
Mean Absolute Error (MAE): 9119.729251099925
Root Mean Squared Error (RMSE): 11117.775979368444
R-squared (R2): 0.16978701377086658
Cross-validation MSE scores: [1.71106214e+08 1.23297643e+08 1.59705314e+08 1.53594879e+08
 1.41518465e+08]
Mean Cross-validation MSE: 149844503.1192026
