## Lab | Random Forests

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load the three CSV inputs of the lab into DataFrames in a single pass.
categorical, numerical, target = (
    pd.read_csv(csv_path)
    for csv_path in ('categorical.csv', 'numerical.csv', 'target.csv')
)

In [None]:
# Render the frame read from categorical.csv in the notebook output.
display(categorical)

In [None]:
# Render the frame read from numerical.csv in the notebook output.
display(numerical)

In [None]:
# Render the frame read from target.csv in the notebook output.
display(target)

In [None]:
# Place the three frames side by side: column-wise concatenation, with rows
# aligned on the index.
data = pd.concat(objs=[categorical, numerical, target], axis='columns')

In [None]:
# Bare expression at the end of the cell: the notebook renders the combined frame.
data

In [None]:
# Count the rows per TARGET_B value to see how the classes are distributed.
target_counts = data.loc[:, 'TARGET_B'].value_counts()

# Heading and counts are emitted on separate lines by a single print call.
print("Class Distribution:", target_counts, sep="\n")

In [None]:
# Columns to one-hot encode; every other column except the label is treated
# as numerical.
categorical_cols = [
    'STATE', 'HOMEOWNR', 'GENDER', 'DATASRCE', 'RFA_2R',
    'RFA_2A', 'GEOCODE2', 'DOMAIN_A', 'DOMAIN_B',
]
_non_numerical = set(categorical_cols) | {'TARGET_B'}
numerical_cols = [col for col in data.columns if col not in _non_numerical]

# Expand each categorical column into indicator columns.
encoded_data = pd.get_dummies(data, columns=categorical_cols)

# Features are everything but the label; the label is TARGET_B.
y = encoded_data['TARGET_B']
X = encoded_data.drop(columns='TARGET_B')

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = _split

In [None]:
# Columns that must be numeric; entries that cannot be parsed become NaN
# (errors='coerce').
numeric_cols = ['MAXRAMNT', 'LASTGIFT', 'TIMELAG', 'AVGGIFT', 'CONTROLN']
data[numeric_cols] = data[numeric_cols].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)

# Rebuild the one-hot encoded frame from the converted data.
encoded_data = pd.get_dummies(data, columns=categorical_cols)



In [None]:
# Columns to one-hot encode; every other column except the label is treated
# as numerical.
categorical_cols = [
    'STATE', 'HOMEOWNR', 'GENDER', 'DATASRCE', 'RFA_2R',
    'RFA_2A', 'GEOCODE2', 'DOMAIN_A', 'DOMAIN_B',
]
_non_numerical = set(categorical_cols) | {'TARGET_B'}
numerical_cols = [col for col in data.columns if col not in _non_numerical]

# Convert all numerical columns to a numeric dtype; entries that cannot be
# parsed become NaN (errors='coerce').
data[numerical_cols] = data[numerical_cols].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)

# Rebuild the one-hot encoded frame from the converted data.
encoded_data = pd.get_dummies(data, columns=categorical_cols)

In [None]:
from sklearn.utils import resample

# Split features from the label and hold out 20% of the rows for evaluation.
y = encoded_data['TARGET_B']
X = encoded_data.drop(columns='TARGET_B')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Only the training partition is rebalanced; the test partition is left as is.
train_data = pd.concat([X_train, y_train], axis=1)
_train_label = train_data['TARGET_B']
majority_class = train_data[_train_label == 0]
minority_class = train_data[_train_label == 1]

# Sample the TARGET_B == 1 rows with replacement until they match the number
# of TARGET_B == 0 rows.
minority_upsampled = resample(
    minority_class,
    replace=True,
    n_samples=majority_class.shape[0],
    random_state=42,
)
upsampled_data = pd.concat([majority_class, minority_upsampled])

y_upsampled = upsampled_data['TARGET_B']
X_upsampled = upsampled_data.drop(columns='TARGET_B')

# Fit a 100-tree forest on the balanced training data.
rf_classifier_upsampled = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier_upsampled.fit(X_upsampled, y_upsampled)

# Score the model on the untouched test partition.
y_pred_upsampled = rf_classifier_upsampled.predict(X_test)
accuracy_upsampled = accuracy_score(y_test, y_pred_upsampled)
report_upsampled = classification_report(y_test, y_pred_upsampled)

print(f"Accuracy after upsampling: {accuracy_upsampled}")
print("Classification Report after upsampling:")
print(report_upsampled)

In [None]:
# The model's performance appears to have improved significantly after upsampling the minority class: accuracy, precision,
# recall, and F1-score for both classes ('0' and '1') are notably better.

# Overall, these metrics suggest that upsampling significantly improved the model's ability to predict the minority
# class without sacrificing overall performance on the majority class.
# NOTE: no model trained without upsampling is fitted in the cells above, so this comparison should be confirmed
# against such a baseline evaluated on the same test split.

#### Variance Threshold

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Fit a variance filter (threshold 0.1) on the upsampled training features.
threshold = 0.1
variance_threshold = VarianceThreshold(threshold=threshold).fit(X_upsampled)

# Positional indices of the columns the filter keeps.
selected_by_variance = variance_threshold.get_support(indices=True)

# Select by position so the result stays a DataFrame with its column names.
X_variance_selected = X_upsampled.iloc[:, selected_by_variance]

#### RFE with Random Forest

In [None]:
from sklearn.feature_selection import RFE

# Recursive feature elimination down to 10 features, using the random forest
# configuration from the upsampling step as the ranking estimator.
rfe = RFE(rf_classifier_upsampled, n_features_to_select=10).fit(
    X_variance_selected, y_upsampled
)

# Positional indices (within the variance-filtered frame) of the kept columns.
selected_by_rfe = rfe.get_support(indices=True)

# Select by position so the result stays a DataFrame with its column names.
X_rfe_selected = X_variance_selected.iloc[:, selected_by_rfe]

#### PCA for Dimensionality Reduction

In [None]:
from sklearn.decomposition import PCA

# Number of principal components to keep from the RFE-selected features.
n_components = 5

# Pin the seed like every other stochastic step in this notebook
# (train_test_split, resample, RandomForestClassifier). Depending on the
# scikit-learn version, svd_solver='auto' may pick the randomized solver for
# this shape, which would otherwise make the projection vary between runs.
# random_state is ignored by the deterministic solvers.
pca = PCA(n_components=n_components, random_state=42)

# Learn the projection on the training features and apply it in one step.
X_pca = pca.fit_transform(X_rfe_selected)

In [None]:
# Train the final forest on the PCA-projected, upsampled training features.
rf_classifier_feature_selected = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier_feature_selected.fit(X_pca, y_upsampled)

# Apply the same selection steps to the test set as to the training data.
# Columns are picked with the index arrays taken from the fitted selectors, so
# the intermediate results stay DataFrames with their column names. RFE and
# PCA were fitted on named columns; passing them the bare NumPy arrays that
# .transform() returns makes scikit-learn warn that X "does not have valid
# feature names" and skips its column-name consistency check.
X_test_variance_selected = X_test.iloc[:, selected_by_variance]
X_test_rfe_selected = X_test_variance_selected.iloc[:, selected_by_rfe]
X_test_pca = pca.transform(X_test_rfe_selected)

# Score the feature-selected model on the held-out test partition.
y_pred_feature_selected = rf_classifier_feature_selected.predict(X_test_pca)

accuracy_feature_selected = accuracy_score(y_test, y_pred_feature_selected)
print(f"Accuracy after feature selection: {accuracy_feature_selected}")

report_feature_selected = classification_report(y_test, y_pred_feature_selected)
print("Classification Report after feature selection:")
print(report_feature_selected)