# Text Classification using Classical ML algorithms

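Train and evaluate traditional machine learning classifiers (decision tree, SVM, naive Bayes, logistic regression, random forest, gradient boosting) on TF-IDF features of text data for prediction  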

Author: Jenna Kim & Jinseok Kim 
Created: 2022/1/12  
Last Modified: 2023/10/08 



# 1. Set up

In [None]:
import timeit
import pandas as pd
import nltk

from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Download the NLTK 'punkt' tokenizer data (the nltk package itself must
# already be installed; this only fetches the data files).
# NOTE(review): recent NLTK releases may need the 'punkt_tab' resource for
# nltk.word_tokenize instead of 'punkt' - confirm against the installed version.
import nltk
nltk.download('punkt')

# Install the Imbalanced-Learn library (imported as `imblearn` by sample_data);
# pip leaves an existing installation in place.
!pip install imbalanced-learn

In [None]:
# Hide warning messages from display.
# NOTE: the 'ignore' filter is installed without any category or module
# restriction, so it applies to every warning raised after this cell runs,
# including warnings that may point at real problems (e.g. deprecations).
import warnings
warnings.filterwarnings('ignore')

# 2. Functions

In [None]:
def load_data(filename, colname, record):
    """
    Read in input file and load data

    The csv file must contain the columns 'pmid', 'title', 'abstract' and
    'pubtype'. 'pubtype' is converted to integer labels ('RCT' -> 1,
    'Other' -> 0). Rows with a missing value in any of the four columns, or
    with a 'pubtype' outside that mapping, are removed.

    filename: csv file
    colname: column name for texts; 'title' (title), 'abs' (abstract)
             or 'mix' (title and abstract joined by one space)
    record: text file to save summary

    Returns (X, y): X is a dataframe with the columns 'pmid' and 'sentence',
    y is the integer series 'label'. Both keep the row index of the csv file
    (the index is not reset after rows are removed).

    Raises ValueError when colname is not 'title', 'abs' or 'mix'.

    """

    # Text column to use for each supported colname value
    text_columns = {"title": "title", "abs": "abstract", "mix": "mix"}
    if colname not in text_columns:
        raise ValueError(
            "colname must be one of {}, got {!r}".format(sorted(text_columns), colname)
        )

    def report(*args):
        # Write the same message to the summary file and to the screen
        print(*args, file=record)
        print(*args)

    df = pd.read_csv(filename, encoding='utf-8')

    # No of rows and columns
    report("No of Rows: {}".format(df.shape[0]))
    report("No of Columns: {}".format(df.shape[1]))

    # Select data needed for processing & convert labels.
    # .copy() gives an independent frame, so the label conversion is a plain
    # column assignment instead of a write into a slice of the raw data.
    df = df[['pmid', 'title', 'abstract', 'pubtype']].copy()
    df['pubtype'] = df['pubtype'].map({'RCT': 1, 'Other': 0})

    # Remove null values (this also removes rows whose label was not mapped).
    # Cast the labels explicitly: depending on the pandas version the mapped
    # column can otherwise end up with a float or object dtype.
    df = df.dropna().astype({'pubtype': int})

    report("No of rows (After removing null): {}".format(df.shape[0]))
    report("No of columns: {}".format(df.shape[1]))

    # Select text columns
    if colname == "mix":
        # Vectorised equivalent of '{} {}'.format(title, abstract) per row
        df = df.assign(mix=df['title'].astype(str) + ' ' + df['abstract'].astype(str))
    text_column = text_columns[colname]
    df = df[['pmid', text_column, 'pubtype']].rename(
        columns={text_column: "sentence", "pubtype": "label"}
    )

    # Check the first few instances
    report("\n<Data View: First Few Instances>")
    report("\n", df.head())

    # No of labels and rows
    report('\nClass Counts(label, row): Total')
    report(df.label.value_counts())

    # Split into X and y
    X, y = df.iloc[:, :-1], df.iloc[:, -1]

    return X, y

In [None]:
def sample_data(X_train, y_train, record, sampling=0, sample_method='over'):
    """
       Sampling input train data

       X_train: dataframe of X train data
       y_train: series of y train data
       record: text file to save summary
       sampling: indicator of sampling function is on (truthy) or off (falsy)
       sample_method: method of sampling; 'over' (random oversampling) or
                      'under' (random undersampling). Only read when
                      sampling is on.

       Returns (X_train_sam, y_train_sam): the resampled data, or the inputs
       themselves when sampling is off.

       Raises ValueError when sampling is on and sample_method is neither
       'over' nor 'under'.

    """

    if not sampling:
        print('\n****** Data Sampling ******', file=record)
        print('\nNo Sampling Performed\n', file=record)
        return X_train, y_train

    if sample_method not in ('over', 'under'):
        raise ValueError(
            "sample_method must be 'over' or 'under', got {!r}".format(sample_method)
        )

    # imbalanced-learn is imported only here, so the library is not required
    # when sampling is switched off.
    if sample_method == 'over':
        from imblearn.over_sampling import RandomOverSampler
        sampler = RandomOverSampler(random_state=42)
        title = 'Oversampled Data (class, Rows)'
    else:
        from imblearn.under_sampling import RandomUnderSampler
        sampler = RandomUnderSampler(random_state=42)
        title = 'Undersampled Data (class,Rows)'

    X_train_sam, y_train_sam = sampler.fit_resample(X_train, y_train)

    # Class counts after resampling, written to the summary file and the screen
    summary = '\n{}:\n{}'.format(title, y_train_sam.value_counts())
    print('\n****** Data Sampling ******', file=record)
    print(summary, file=record)
    print(summary)

    return X_train_sam, y_train_sam

In [None]:
def preprocess_data(X_data_raw):
    """
       Preprocess data with lowercase conversion, punctuation removal, tokenization, stemming

       X_data_raw: X data in dataframe; the text is read from the last column

       Returns a series of strings in which each text is replaced by its
       stemmed tokens joined by single spaces.

    """

    # Every value is converted to str so the string operations below apply
    X_data = X_data_raw.iloc[:, -1].astype(str)

    # 1. convert all characters to lowercase
    X_data = X_data.str.lower()

    # 2. remove punctuation
    # regex=True is required: with a literal replacement the character-class
    # pattern would never match and punctuation would be kept.
    X_data = X_data.str.replace(r'[^\w\s]', '', regex=True)

    # 3. word tokenize
    X_data = X_data.apply(nltk.word_tokenize)

    # 4. stemming, then 5. join the stems with single spaces
    #    (this also removes unnecessary space between words)
    stemmer = PorterStemmer()
    X_data = X_data.apply(lambda words: ' '.join(stemmer.stem(word) for word in words))

    return X_data

In [None]:
def fit_model(X_train, y_train, model='DT'):

    """
      Model fitting with options of classifiers:
      decision tree, svm, naive bayes, logistic regression, random forest,
      and gradient boosting

      X_train: X train data
      y_train: y train data
      model: name of classifier; 'DT', 'SVM', 'NB', 'LR', 'RF' or 'GB'

      Returns the fitted classifier.

      Raises ValueError when model is not one of the names above.

    """

    # Classifiers are built lazily so only the requested one is instantiated
    builders = {
        'DT': lambda: DecisionTreeClassifier(max_depth=2),
        # probability=True so that predict_proba is available on the SVM
        'SVM': lambda: SVC(kernel='linear', probability=True),
        'NB': lambda: MultinomialNB(),
        'LR': lambda: LogisticRegression(),
        'RF': lambda: RandomForestClassifier(max_depth=2, random_state=0),
        'GB': lambda: GradientBoostingClassifier(),
    }

    # Fail here with a clear message rather than returning the name itself,
    # which would only break later when the "model" is asked to predict.
    if model not in builders:
        raise ValueError(
            "Unknown model {!r}; choose one of {}".format(model, sorted(builders))
        )

    return builders[model]().fit(X_train, y_train)

In [None]:
def evaluate_model(y_test, y_pred, record, eval_model=0):
    """
      Report the confusion matrix and the classification report

      y_test: actual y values of the test data
      y_pred: predicted y values
      record: text file to save summary
      eval_model: indicator if this function is on or off; nothing is
                  computed or written when it is off

    """

    if not eval_model:
        return

    def show(*args):
        # Same message to the summary file first, then to the screen
        print(*args, file=record)
        print(*args)

    show('\n************** Model Evaluation **************')

    matrix = confusion_matrix(y_test, y_pred)
    show('\nConfusion Matrix:\n')
    show(matrix)

    report = classification_report(y_test, y_pred, digits=4)
    show('\nClassification Report:\n')
    show(report)

In [None]:
def predict_proba(model, X_test_trans, X_test, y_test, y_pred, proba_file, proba_out=0):
    """
       Save the predicted probability of each class for the test data

       model: trained model with a selected classifier
       X_test_trans: transformed X test data passed to model.predict_proba
       X_test: original X test data (its columns are copied to the output)
       y_test: original y test data (written as column 'act')
       y_pred: predicted y values (written as column 'pred')
       proba_file: output csv file of probability scores
       proba_out: indicator if the probability output is expected; nothing
                  is computed or written when it is off

    """
    if not proba_out:
        return

    ## One probability column per class, named after model.classes_
    scores = pd.DataFrame(data=model.predict_proba(X_test_trans), columns=model.classes_)

    ## Original test rows side by side with the scores (rows aligned by position)
    features = X_test.reset_index(drop=True)
    table = pd.concat([features, scores], axis=1)

    ## Predicted and actual class
    table['pred'] = pd.Series(y_pred)
    table['act'] = y_test.reset_index(drop=True)

    ## Save output
    table.to_csv(proba_file, index=False, encoding='utf-8')

# 3. Main Function

In [None]:
def main(input_file, 
         colname,   
         sample_on, 
         sample_type, 
         model_method, 
         eval_on, 
         proba_file,
         proba_on,
         result_file,
         datasize_change,
         ratio):
    
    """
       Main function for processing data, model fitting, and prediction

       Steps: load data, split it 80/10/10 into train/validation/test
       (stratified), optionally shrink the train data, optionally resample it,
       preprocess the text, build TF-IDF features, fit the classifier, predict
       on the test data, then optionally evaluate and save probabilities.
       The validation split is created and its shape reported, but it is not
       used for fitting or evaluation.

       input_file: input file
       colname: column name for selection between title and abstract
       sample_on: indicator of sampling on or off
       sample_type: sample type to choose if sample_on is 1
       model_method: name of classifier to be applied for model fitting
       eval_on: indicator of model evaluation on or off
       proba_file: name of output file of probability
       proba_on: indicator of getting probability
       result_file: name of output file of evaluation (opened in append mode)
       datasize_change: indication of data size change
       ratio: proportion of the train data to keep when datasize_change is on

    """
    ## 0. open result file for records
    # The context manager closes the file even when a later step fails.
    with open(result_file, "a", encoding="utf-8") as f:

        def tee(*args):
            # Same message to the result file first, then to the screen
            print(*args, file=f)
            print(*args)

        ## 1. Load data

        tee("\n************** Loading Data ************\n")
        X, y = load_data(input_file, colname, record=f)

        # testing
        # Positional access: load_data drops rows without resetting the index,
        # so the first row's index label is not necessarily 0.
        tee("\n<First Sentence>\n{}".format(X.sentence.iloc[0]))

        ## 2. Train and test split

        tee("\n************** Spliting Data **************\n")

        # 80% train; the remaining 20% is halved into validation and test
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)

        tee("Train Data: {}".format(X_train.shape))
        tee("Val Data: {}".format(X_val.shape))
        tee("Test Data: {}".format(X_test.shape))

        print('\nClass Counts(label, row): Train', file=f)
        print(y_train.value_counts(), file=f)
        print('\nClass Counts(label, row): Test', file=f)
        print(y_test.value_counts(), file=f)

        tee("\n<X_test Data>")
        tee(X_test.head())

        ## 3. Data size change

        if datasize_change:

            tee("\n************** Data Size Change *************\n")
            print("Data Ratio (size): {} ({})".format(ratio, int(X_train.shape[0]*ratio)), file=f)
            print("Data Size: {} ({})".format(ratio, int(X_train.shape[0]*ratio)))

            # Keep a stratified `ratio` share of the train data, discard the rest
            X_train, _, y_train, _ = train_test_split(X_train, y_train, test_size=(1-ratio), random_state=42, stratify=y_train)

        # Reset index
        X_train = X_train.reset_index(drop=True)
        X_test = X_test.reset_index(drop=True)
        y_train = y_train.reset_index(drop=True)
        y_test = y_test.reset_index(drop=True)

        tee("\n************** Processing Data **************")
        tee("\nTrain Data: {}".format(X_train.shape))
        tee("Test Data: {}".format(X_test.shape))

        print('\nClass Counts(label, row): Train', file=f)
        print(y_train.value_counts(), file=f)
        print('\nClass Counts(label, row): Test', file=f)
        print(y_test.value_counts(), file=f)

        tee("\n<X_test Data>")
        tee(X_test.head())

        ## 4. Sampling
        # Use the sample_on argument (not a module-level variable) so the
        # function does not depend on names defined by the calling cell.
        X_train_samp, y_train_samp = sample_data(X_train, y_train, record=f, sampling=sample_on, sample_method=sample_type)

        ## 5. Preprocessing
        X_train_pro = preprocess_data(X_train_samp)

        # TFIDF transformation (vocabulary and idf weights come from train data only)
        count_vect = CountVectorizer()
        counts = count_vect.fit_transform(X_train_pro)
        transformer = TfidfTransformer(smooth_idf=True, use_idf=True).fit(counts)
        X_train_trans = transformer.transform(counts)
        y_train_trans = y_train_samp

        ## 6. Model Fitting
        tee("\n************** Training Model: " + model_method + " **************")

        # Check training time
        start_time = timeit.default_timer()

        # Fit the model
        model = fit_model(X_train_trans, y_train_trans, model=model_method)

        elapsed = timeit.default_timer() - start_time
        tee("\nTraining Time: {}".format(round(elapsed, 2)))

        ## 7. Prediction
        tee("\n\n************** Getting predictions **************")

        # Transform X_test data with the vectorizer/transformer fitted on train data
        X_test_pro = preprocess_data(X_test)
        counts_test = count_vect.transform(X_test_pro)
        X_test_trans = transformer.transform(counts_test)

        # Predict output
        y_pred = model.predict(X_test_trans)

        ## 8. Evaluating model performance
        tee("\n************** Evaluating performance **************")
        evaluate_model(y_test, y_pred, record=f, eval_model=eval_on)

        ## 9. Probability prediction
        predict_proba(model, X_test_trans, X_test, y_test, y_pred, proba_file=proba_file, proba_out=proba_on)

        tee("\nOutput file:'" + result_file + "' Created")

# 4. Run code for implementation


In [None]:
%%time

if __name__ == "__main__":

    ###### 1. Set Parameter Values ######

    #### 1-1. Input file name & which column
    input_filename = "rct_sample.csv"
    # 'title' for title text; 'abs' for abstract; 'mix' for title + abstract
    column_name = "abs"

    #### 1-2. Data size change?
    # 0 for no change; 1 to run once per ratio in ratio_list
    datachange_on = 1
    # share of the train data kept in each run (use e.g. [0.1] for a quick test)
    ratio_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    #### 1-3. Sampling applied?
    # 0 for no sampling; 1 for sampling
    sampling_on = 0
    # Used when sampling_on=1; 'over' (oversampling), 'under' (undersampling)
    sampling_type = 'over'

    #### 1-4. Which model to use?
    # 'DT' (Decision Tree); 'LR' (Logistic Regression); 'SVM' (SVM);
    # 'NB' (Naive Bayes); 'RF' (Random Forest); 'GB' (Gradient Boosting)
    model_type = 'LR'

    #### 1-5. Evaluation & probability file
    # 0 for no; 1 for yes (confusion matrix/classification report)
    eval_on = 1
    # 0 for no; 1 for yes (probability output)
    proba_on = 0

    ###### 2. Run Main Function ######

    # Output names are "<prefix>_<ratio or 'all'>_<model>[_<sampling>]_<column>"
    if sampling_on:
        name_tail = [model_type, sampling_type, column_name]
    else:
        name_tail = [model_type, column_name]

    # One run per ratio when the data size changes, otherwise a single run
    # on all the train data (tagged 'all', ratio 1)
    if datachange_on:
        runs = [(str(value), value) for value in ratio_list]
    else:
        runs = [("all", 1)]

    for tag, ratio in runs:
        stem = "_".join([tag] + name_tail)
        proba_file = "result_ml_" + stem + ".csv"
        eval_file = "eval_ml_" + stem + ".txt"
        main(input_file=input_filename,
             colname=column_name,
             sample_on=sampling_on,
             sample_type=sampling_type,
             model_method=model_type,
             eval_on=eval_on,
             proba_file=proba_file,
             proba_on=proba_on,
             result_file=eval_file,
             datasize_change=datachange_on,
             ratio=ratio)

    print("\n************** Processing Completed **************\n")