In [60]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
import json
import re
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


In [69]:
def load_data(file_path, chunk_size=1000):
    data = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            chunk = []
            for i, line in enumerate(f):
                chunk.append(json.loads(line.strip()))
                if (i + 1) % chunk_size == 0:
                    data.extend(chunk)
                    chunk = []  
            if chunk:
                data.extend(chunk)
    except Exception as e:
        print(f"Error loading data: {e}")
    return data


In [70]:
def prepare_dataframe(data):
    rows = []
    labels = []

    for document in data:
        for row in document:
            values = [cell['value'] for cell in row['values']]
            rows.append(values)
            labels.append(1 if row.get('type') == 'HEADERS' else 0)

    df = pd.DataFrame(rows)
    labels = np.array(labels)

    return df, labels

In [77]:
def extract_text_features(df):
    features = pd.DataFrame()

    features['num_columns'] = df.apply(lambda row: sum(1 for cell in row if pd.notna(cell) and cell != ''), axis=1)
    features['text_length'] = df.apply(lambda row: sum(len(str(cell)) for cell in row), axis=1)
    features['digit_count'] = df.apply(lambda row: sum(char.isdigit() for cell in row for char in str(cell)), axis=1)
    features['capitalization'] = df.apply(lambda row: sum(cell.isupper() for cell in row if isinstance(cell, str)), axis=1)
    features['special_chars'] = df.apply(lambda row: sum(1 for cell in row if isinstance(cell, str) and re.search(r'[^a-zA-Z0-9\s]', cell)), axis=1)

    return features


In [78]:
def handle_class_imbalance(X, y):
    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X, y)
    return X_res, y_res

In [79]:
def train_model(X, y):
    model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, class_weight='balanced')
    pipeline = Pipeline([('smote', SMOTE(random_state=42)), ('rf', model)])
    pipeline.fit(X, y)
    return pipeline

In [80]:
def predict_headers(documents, model):
    predictions = []

    for document in documents:
        rows = [row['values'] for row in document]
        df = pd.DataFrame([[cell['value'] for cell in row] for row in rows])

        X_test = extract_text_features(df)

        y_pred = model.predict(X_test)

        for row, pred in zip(document, y_pred):
            row['predicted_type'] = 'HEADER' if pred == 1 else 'NON-HEADER'

        predictions.append(document)

    return predictions


In [81]:
def evaluate_model(X_test, y_test, model):
    y_pred = model.predict(X_test)
    print("Evaluation Results:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


In [82]:
if __name__ == "__main__":
    training_file_path = 'C:\\Users\\Susma\\Documents\\Spacequant\\document-standardization-training-dataset\\document-standardization-training-dataset.txt'

    training_data = load_data(training_file_path, chunk_size=1000)
    df_train, labels = prepare_dataframe(training_data)

    X_train = extract_text_features(df_train)

    X_resampled, y_resampled = handle_class_imbalance(X_train, labels)

    model = train_model(X_resampled, y_resampled)

    joblib.dump(model, 'C:\\Users\\Susma\\Documents\\Spacequant\\document-standardization-training-dataset\\header_recognition_model.pkl')

    print("Model trained and ready for future test data.")

    evaluate_model(X_resampled, y_resampled, model)

    

Model trained and ready for future test data.
Evaluation Results:
Accuracy: 0.922562582372171
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.92      0.92   2441662
           1       0.92      0.93      0.92   2441662

    accuracy                           0.92   4883324
   macro avg       0.92      0.92      0.92   4883324
weighted avg       0.92      0.92      0.92   4883324

Confusion Matrix:
 [[2234424  207238]
 [ 170914 2270748]]


In [None]:
# Future Testing: when test data is available, load and predict as below
    # test_data = load_data(test_file_path)  
    # predictions = predict_headers(test_data, model)
    # for doc in predictions:
    #     print(doc)  