# Text Classification Using Random Forest

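In this notebook, we classify complaint narratives (the `narrative` column) into product categories (the `product` column) using a **Random Forest** classifier trained on TF-IDF features. The data is loaded from a CSV file named **complaints_processed.csv**, which is read from the notebook's working directory.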


In [None]:
# Import necessary libraries
import os
import re
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

try:
    # Removed in scikit-learn 1.2 in favour of ConfusionMatrixDisplay; guarded so
    # this import cell still runs on current releases.
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None


In [None]:
# Read the processed complaints file; its first CSV column is used as the index
csv_path = 'complaints_processed.csv'
df = pd.read_csv(csv_path, index_col=0)
df.head()

In [None]:
# Summarise the dataset: dimensions, nulls per column, and label frequencies
separator = '\n===========================================================\n'
missing_per_column = df.isna().sum()
product_counts = df['product'].value_counts()

print(f'--> There are {df.shape[0]} rows and {df.shape[1]} columns')
print(separator)
print('--> Missing Values:\n\n', missing_per_column)
print(separator)
print('Product Counts:\n\n', product_counts)


In [None]:
# Rows containing any missing value are few, so discard them outright
df = df.dropna(axis=0)

# Percentage share of each product class in the full (cleaned) dataset
class_share = df['product'].value_counts().mul(100).div(len(df))
print('For Actual dataset:\n\n', class_share)


In [None]:
# Sampling the data for faster processing.
# The seed makes the sample (and therefore every downstream metric) repeatable
# across kernel restarts; the size is capped so that a dataset with fewer than
# 10000 rows does not make sample() raise ValueError.
sample_size = min(10000, len(df))
data = df[['product', 'narrative']].sample(n=sample_size, random_state=42)

# Display class distribution in sample dataset
print('Sample dataset:\n\n', data['product'].value_counts() * 100 / len(data))


In [None]:
# Stop words used by the text-cleaning function below.
# Stored as a set so each membership test is a hash lookup rather than a scan
# of the whole list. The corpus is distributed separately from the nltk
# package, so it is downloaded on first use if it is not installed yet.
try:
    stopwords = set(nltk.corpus.stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stopwords = set(nltk.corpus.stopwords.words('english'))

def text_clean(text, stop_words=None, min_length=4):
    """
    Split a document into cleaned, lowercase word tokens.

    The text is split on whitespace and each token is lowercased. A token is
    kept only if it:
    1. consists solely of alphabetic characters (so digits, and words with
       punctuation attached such as "account,", are dropped entirely),
    2. has at least ``min_length`` characters (default 4, i.e. words of three
       or fewer characters are removed),
    3. is not a stop word.

    Parameters
    ----------
    text : str
        The raw document. Any non-string value (e.g. None or NaN) yields an
        empty token list instead of raising.
    stop_words : collection of str, optional
        Words to exclude. Defaults to the module-level ``stopwords``.
    min_length : int, optional
        Minimum number of characters a token must have to be kept.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    if not isinstance(text, str):
        return []
    if stop_words is None:
        stop_words = stopwords

    # split() with no separator never yields tokens with surrounding
    # whitespace, so no extra strip() is needed.
    lowered = (word.lower() for word in text.split())
    return [
        token
        for token in lowered
        if token.isalpha() and len(token) >= min_length and token not in stop_words
    ]


In [None]:
# Build the TF-IDF document-term matrix; text_clean acts as the analyzer, so it
# alone decides how each narrative is turned into tokens
narratives = data['narrative']
tfidf = TfidfVectorizer(analyzer=text_clean)
x_tfidf = tfidf.fit_transform(narratives)

# Report the matrix dimensions
matrix_shape = x_tfidf.shape
print("Shape of Term Document Matrix:", matrix_shape)


In [None]:
# Train Test Split
# NOTE: x_tfidf was fitted on the whole sample before this split, so the IDF
# weights already include the test documents.
X_train, X_test, y_train, y_test = train_test_split(
    x_tfidf, data['product'], test_size=0.2, random_state=42
)

# Train RandomForest Classifier.
# Seeded so that reruns build the same forest and report the same metrics;
# n_jobs=-1 uses all available cores.
rfc = RandomForestClassifier(n_jobs=-1, random_state=42)
rfc_model = rfc.fit(X_train, y_train)  # fit() returns the estimator itself

# Predict on test set
prediction = rfc_model.predict(X_test)

# Display Confusion Matrix and Classification Report
print("Confusion Matrix:\n\n", confusion_matrix(y_test, prediction))
print("\n")
print("Classification Report:\n\n", classification_report(y_test, prediction))


In [None]:
# Plotting the normalized confusion matrix.
# Axis labels come from the fitted classifier so they always match the order
# of the matrix rows/columns, rather than relying on a hand-written list.
class_names = list(rfc.classes_)

# Create the figure explicitly and hand its axes to the display; otherwise the
# display opens a figure of its own and the figsize requested here is ignored.
fig, ax = plt.subplots(figsize=(10, 10))
ConfusionMatrixDisplay.from_estimator(
    rfc,
    X_test,
    y_test,
    labels=rfc.classes_,
    display_labels=class_names,
    normalize='true',  # each row (true class) sums to 1
    cmap=plt.cm.Blues,
    xticks_rotation='vertical',
    ax=ax,
)
ax.set_title("Normalized Confusion Matrix")
plt.show()
