In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load data
df = pd.read_csv('customer_data.csv')

# Preprocessing and Feature Engineering
# Clean data: drop every row that contains at least one missing value
df = df.dropna()

# Encode categorical variables as one-hot indicator columns
df = pd.get_dummies(df, columns=['gender', 'education'])

# Split data into training and testing sets BEFORE scaling, so the scaler
# below never sees the held-out rows (fitting it on the full frame would leak
# test-set means/variances into the training features).
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('buy', axis=1), df['buy'], test_size=0.2, random_state=42
)
# Work on explicit copies so the column assignments below cannot trigger
# pandas' SettingWithCopyWarning or write through to `df`.
X_train = X_train.copy()
X_test = X_test.copy()

# Scale numerical features: learn mean/std on the training split only, then
# apply that same fitted transform to the test split.
scaler = StandardScaler()
numerical_cols = ['age', 'income', 'website_activity']
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Train and Evaluate Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, max_depth=5, max_features='sqrt', random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.3f}')
# Bare string as the cell's last expression: Jupyter displays it as the cell result.
# NOTE(review): a perfect score can also come from a feature that trivially
# encodes the target — worth confirming against the raw columns.
('Just for educational purposes, the dataset is really small, that is why we are getting 1.00 accuracy')

Accuracy: 1.000


'Just for educational purposes, the dataset is really small, that is why we are getting 1.00 accuracy'

In [None]:
# Feature Selection with Random Forest
# Rank the features by the importance the fitted forest `rf` assigned to them.
importance = rf.feature_importances_
features = X_train.columns
feat_imp = pd.DataFrame({'Feature': features, 'Importance': importance}).sort_values(
    'Importance', ascending=False
)
print(feat_imp)

# Grid Search with Cross-Validation to find best hyperparameters
# (GridSearchCV is imported in the notebook's top import cell.)
# Every combination of the values below is tried: 3 * 3 * 2 = 18 candidates.
params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'max_features': ['sqrt', 'log2']
}

rf_cv = RandomForestClassifier(random_state=42)
# GridSearchCV tunes the hyperparameters of the random forest classifier:
#   param_grid - the hyperparameters and the candidate values to try
#   cv         - the number of cross-validation folds used to score each candidate
grid_search = GridSearchCV(rf_cv, param_grid=params, cv=5)
grid_search.fit(X_train, y_train)
print('')
print(f'Best Parameters: {grid_search.best_params_}')
# best_score_ is the mean cross-validated score of the best candidate on the
# training data; it is not a held-out test-set score.
print(f'Best Score: {grid_search.best_score_:.3f}')

                 Feature  Importance
1                 income    0.425609
0                    age    0.335373
2       website_activity    0.157084
3               gender_F    0.024002
8          education_PhD    0.016209
5   education_Bachelor's    0.012998
4               gender_M    0.011628
6  education_High School    0.011248
7     education_Master's    0.005850

Best Parameters: {'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 50}
Best Score: 0.971
