In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler

# 1. Load the cleaned training data
df = pd.read_csv("train_cleaned.csv")

# 2. Data preprocessing
# Columns dropped as unnecessary or difficult to use as model inputs
columns_to_drop = [
    'trans_date_trans_time', 'cc_num', 'first', 'last',
    'street', 'city', 'state', 'zip', 'lat', 'long', 'dob',
    'trans_num', 'unix_time', 'merchant',
]

# Parse the two date columns, derive `age`, then discard the unused columns.
# NOTE: `age` is the difference between calendar years only, so it does not
# account for whether the birthday has already passed at transaction time.
df = (
    df.assign(
        trans_date_trans_time=lambda frame: pd.to_datetime(frame['trans_date_trans_time']),
        dob=lambda frame: pd.to_datetime(frame['dob']),
    )
    .assign(age=lambda frame: frame['trans_date_trans_time'].dt.year - frame['dob'].dt.year)
    .drop(columns=columns_to_drop)
)

# Encode categorical variables
# Each LabelEncoder maps a column's distinct values to integer codes.
# The fitted encoders are kept in `label_encoders`, keyed by column name.
label_encoders = {}
for col in ['category', 'gender', 'job']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# 3. Split the data into training and test sets
# The split is done BEFORE scaling so that the scaler's mean/variance are
# estimated from the training rows only; fitting it on the full frame would
# let the held-out rows influence the preprocessing (data leakage).
X = df.drop(columns=['is_fraud'])
y = df['is_fraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Standardize numerical variables
scaler = StandardScaler()
scaled_columns = ['amt', 'city_pop', 'merch_lat', 'merch_long', 'age']

# Work on explicit copies so the assignments below neither write into `X`
# nor trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit on the training rows, then reuse the same statistics for the test rows.
X_train[scaled_columns] = scaler.fit_transform(X_train[scaled_columns])
X_test[scaled_columns] = scaler.transform(X_test[scaled_columns])

# 4. Train the Random Forest model
rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",
    random_state=42,
).fit(X_train, y_train)

# 5. Evaluate the model on the held-out rows
y_pred = rf_model.predict(X_test)

# Print each metric under its heading. The classes are heavily imbalanced
# (see the support column of the report), so accuracy should be read together
# with the per-class precision and recall.
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(heading, metric(y_test, y_pred))

# 6. Observe feature importances, most important first
feature_importances = (
    pd.DataFrame({'Feature': X.columns, 'Importance': rf_model.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

print("\nFeature Importances:\n", feature_importances)

Accuracy: 0.9974396084348964
Confusion Matrix:
 [[386538    213]
 [   783   1469]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    386751
           1       0.87      0.65      0.75      2252

    accuracy                           1.00    389003
   macro avg       0.94      0.83      0.87    389003
weighted avg       1.00      1.00      1.00    389003


Feature Importances:
       Feature  Importance
1         amt    0.709995
0    category    0.125265
5   merch_lat    0.034422
6  merch_long    0.034144
7         age    0.033991
3    city_pop    0.031271
4         job    0.024700
2      gender    0.006213


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from scipy.stats import randint
import joblib

# 1. Load the cleaned training data
df = pd.read_csv("train_cleaned.csv")

# 2. Process the data
# Columns removed as unnecessary for the model
columns_to_drop = [
    'trans_date_trans_time', 'cc_num', 'first', 'last',
    'street', 'city', 'state', 'zip', 'lat', 'long', 'dob',
    'trans_num', 'unix_time', 'merchant',
]

# Parse the date columns, derive `age` as the difference between the
# transaction year and the birth year, then drop the unused columns.
df = (
    df.assign(
        trans_date_trans_time=lambda frame: pd.to_datetime(frame['trans_date_trans_time']),
        dob=lambda frame: pd.to_datetime(frame['dob']),
    )
    .assign(age=lambda frame: frame['trans_date_trans_time'].dt.year - frame['dob'].dt.year)
    .drop(columns=columns_to_drop)
)

# Separate the features from the target, then make a stratified 70/30 split
X, y = df.drop(columns=['is_fraud']), df['is_fraud']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# 3. Create steps in the Pipeline
categorical_columns = ['category', 'gender', 'job']
numerical_columns = ['amt', 'city_pop', 'merch_lat', 'merch_long', 'age']

# Preprocessing: scale the numerical columns and one-hot encode the
# categorical ones (categories unseen during fit are ignored at transform time)
numeric_step = ('num', StandardScaler(), numerical_columns)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# 4. Build Pipeline with preprocessing step and RandomForestClassifier
forest = RandomForestClassifier(random_state=42, class_weight='balanced', n_jobs=-1)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # runs first on the raw feature frame
    ('classifier', forest),          # Random Forest fitted on the transformed features
])

# 5. Hyperparameter search using RandomizedSearchCV
# `randint(low, high)` draws integers from low up to high - 1 (high is exclusive).
param_dist = dict(
    classifier__n_estimators=randint(50, 100),      # number of trees: 50..99
    classifier__max_depth=randint(5, 15),           # maximum tree depth: 5..14
    classifier__min_samples_split=randint(2, 10),   # minimum samples to split a node: 2..9
    classifier__min_samples_leaf=randint(1, 10),    # minimum samples at a leaf: 1..9
)

# A deliberately small search: 5 sampled candidates x 3 folds = 15 fits.
# No `scoring` is passed, so candidates are ranked by the pipeline's default score.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=5,
    cv=3,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
random_search.fit(X_train, y_train)

# 6. Evaluate the model
# `predict` on the search object uses the refitted best pipeline
y_pred = random_search.predict(X_test)

# Print the results
print("Best parameters found: ", random_search.best_params_)
for heading, metric in (
    ("\nAccuracy: ", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(heading, metric(y_test, y_pred))

# 7. Cross-validation
# 3-fold accuracy of the best pipeline, refitted per fold. Note that this runs
# on the full X, y, i.e. it includes the rows held out as X_test above.
cv_scores = cross_val_score(
    random_search.best_estimator_,
    X,
    y,
    cv=3,
    scoring='accuracy',
)
print("\nCross-validation accuracy scores:", cv_scores)
print("Mean cross-validation accuracy: ", cv_scores.mean())

# 8. Feature Selection
# Wrap the already-trained forest in SelectFromModel. `prefit=True` declares
# that the estimator is fitted, which is the documented way to call
# `transform` without calling `fit` on the selector first; without it the call
# relies on version-specific behaviour and can raise a not-fitted error.
# Features are kept when their importance is at least the mean importance,
# capped at the 10 most important.
selector = SelectFromModel(
    random_search.best_estimator_.named_steps['classifier'],
    threshold="mean",
    max_features=10,
    prefit=True,
)

# The forest was trained on the preprocessor's output, so the selector must be
# given data in that same transformed feature space.
X_train_preprocessed = random_search.best_estimator_.named_steps['preprocessor'].transform(X_train)
X_train_selected = selector.transform(X_train_preprocessed)

# 9. Persist the fitted best pipeline (preprocessing + classifier) with joblib
joblib.dump(random_search.best_estimator_, 'random_forest_model.pkl')

# 10. Observe feature importances
best_rf_model = random_search.best_estimator_.named_steps['classifier']

# Rebuild the transformed feature names in the preprocessor's output order:
# the numerical columns first, then the one-hot columns produced by the
# second transformer (the categorical encoder).
fitted_encoder = random_search.best_estimator_.named_steps['preprocessor'].transformers_[1][1]
encoded_names = list(fitted_encoder.get_feature_names_out(categorical_columns))

feature_importances = (
    pd.DataFrame({
        'Feature': numerical_columns + encoded_names,
        'Importance': best_rf_model.feature_importances_,
    })
    .sort_values(by='Importance', ascending=False)
)

print("\nFeature Importances:\n", feature_importances)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best parameters found:  {'classifier__max_depth': 9, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 9, 'classifier__n_estimators': 93}

Accuracy:  0.9722315766202317
Confusion Matrix:
 [[376474  10277]
 [   525   1727]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.99    386751
           1       0.14      0.77      0.24      2252

    accuracy                           0.97    389003
   macro avg       0.57      0.87      0.61    389003
weighted avg       0.99      0.97      0.98    389003


Cross-validation accuracy scores: [0.97100584 0.97182255 0.97553358]
Mean cross-validation accuracy:  0.9727873214182429

Feature Importances:
                                  Feature  Importance
0                                    amt    0.480338
16                 category_shopping_net    0.076623
9                   category_grocery_pos    