In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Steps 1-2: Read the training and testing CSV files.
# Both files use the same dialect, so the parser options are declared once.
csv_options = {'sep': ',', 'encoding': 'ISO-8859-1', 'quotechar': '"'}

# Training data: at most the first 1000 rows of the file are read.
path = 'credit_training.csv'
df = pd.read_csv(path, nrows=1000, **csv_options)

# Preview the training frame (header line, then the first five rows)
print("Training Data (First 5 rows):", df.head(), sep="\n")

# Testing data: the whole file is read.
testing_path = 'credit_testing.csv'
testing_data = pd.read_csv(testing_path, **csv_options)

# Preview the testing frame (header line, then the first five rows)
print("\nTesting Data (First 5 rows):", testing_data.head(), sep="\n")

# Steps 3-4: Keep only the required fields and drop rows with missing values.
# The cleaned frame holds the identifier, every model feature and the target,
# so the rows that survive dropna() are exactly the rows the model is trained on.
# (Previously only an OBS_ID-only frame was cleaned while the features were taken
# from the unfiltered `df`, so the missing-value step never reached the model.)
FEATURE_COLUMNS = ['CHK_ACCT', 'DURATION', 'HISTORY', 'NEW_CAR', 'USED_CAR',
                   'FURNITURE', 'RADIO/TV', 'EDUCATION', 'RETRAINING']
TARGET_COLUMN = 'RESPONSE'

data = df[['OBS_ID'] + FEATURE_COLUMNS + [TARGET_COLUMN]].dropna()

# Display rows without missing values
print("\nData after dropping missing values (First 5 rows):")
print(data.head())

# Step 5: Extract features and target variable from the cleaned frame
X_train = data[FEATURE_COLUMNS]
y_train = data[TARGET_COLUMN]

# Display features and target variable
print("\nFeatures (First 5 rows):")
print(X_train.head())
print("\nTarget variable (First 5 rows):")
print(y_train.head())

# Step 6: Split the data into training and testing sets
# 10% of the cleaned rows are held out; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1, random_state=9)

# Display shapes of training and testing sets
print(f"\nX_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# Step 7: Perform feature scaling
# The scaler is fitted on the training split only and then applied to the
# held-out split, so no held-out statistics leak into the scaling parameters.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display scaled training data
print("\nScaled Training Features (First 5 rows):")
print(X_train_scaled[:5])

# Step 8: Define the parameter grid for hyperparameter tuning
# 3 x 3 x 3 x 3 = 81 candidate configurations.
param_grid = {
    'n_estimators': [100, 250, 300],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Step 9: Create the RandomForestClassifier
# random_state is fixed so every candidate fit is reproducible.
random_forest_classifier = RandomForestClassifier(class_weight='balanced', random_state=42)

# Step 10: Instantiate the grid search with the RandomForestClassifier and parameter grid
# n_jobs=-1 runs the 81 x 5 cross-validation fits in parallel on all available cores;
# the fixed random_state keeps each fit deterministic, so the selected parameters are unchanged.
# refit=True (the default, spelled out here) retrains the best configuration on the
# full training split once the search finishes.
grid_search = GridSearchCV(
    random_forest_classifier,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    refit=True,
)

# Step 11: Fit the grid search to the data
grid_search.fit(X_train_scaled, y_train)

# Step 12: Get the best parameters
best_params = grid_search.best_params_

# Display best hyperparameters
print("\nBest Hyperparameters:")
print(best_params)

# Step 13: Take the model trained with the best parameters
# The refit estimator is a clone of `random_forest_classifier` (same class_weight and
# random_state) with `best_params` applied and fitted on X_train_scaled / y_train,
# so building and fitting a second forest by hand would only repeat that work.
best_random_forest_classifier = grid_search.best_estimator_

# Step 14: Make predictions on the test set
y_pred = best_random_forest_classifier.predict(X_test_scaled)

# Step 15: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)

# Display accuracy and classification report
print(f"\nAccuracy: {accuracy}")
print("\nClassification Report:")
print(classification_report_result)

# Step 16: Make predictions on the testing set
# Select the testing columns from X_train.columns instead of a second hand-typed list,
# so the testing features always match the names and order the scaler was fitted on.
testing_features = testing_data[list(X_train.columns)]
testing_data_scaled = scaler.transform(testing_features)
y_pred_testing = best_random_forest_classifier.predict(testing_data_scaled)

# Step 17: Create a DataFrame with the predictions
# One row per testing observation: its OBS_ID and the predicted class.
predictions_df = pd.DataFrame({'ID': testing_data['OBS_ID'], 'score_level': y_pred_testing})

# Step 18: Save predictions to a CSV file
# Written relative to the working directory, like the input CSVs are read.
# The previous target '/submission.csv' is the filesystem root, which is not
# writable for non-root users.
predictions_df.to_csv('submission.csv', index=False)

# Step 19: Display the predictions DataFrame
print("\nPredictions on Testing Data (First 5 rows):")
print(predictions_df.head())


Training Data (First 5 rows):
   OBS_ID  CHK_ACCT  DURATION  HISTORY  NEW_CAR  USED_CAR  FURNITURE  \
0     860         3         9        2        1         0          0   
1     704         1        30        3        0         0          0   
2     791         1        21        2        0         0          0   
3     385         3        30        3        0         0          0   
4     406         1        24        2        0         0          0   

   RADIO/TV  EDUCATION  RETRAINING  ...  AGE  OTHER_INSTALL  RENT  OWN_RES  \
0         0          0           0  ...   26              0     1        0   
1         0          0           1  ...   41              1     0        1   
2         0          0           1  ...   39              0     0        1   
3         0          0           1  ...   26              0     0        1   
4         1          0           0  ...   22              0     0        1   

   NUM_CREDITS  JOB  NUM_DEPENDENTS  TELEPHONE  FOREIGN  RESPONSE  


In [None]:
# Mount Google Drive into the runtime at /content/drive.
# NOTE(review): depends on the google.colab package, so this cell presumably only
# works inside Google Colab. It has no execution count and sits after the analysis
# cell — confirm whether it is still needed, and if so whether it should run first.
from google.colab import drive
drive.mount('/content/drive')