# Hello, 
### Welcome to the Document Classification Notebook.

In this project we will classify text document files from labelled folders and then predict for a single document.
I have made this project as simple as possible to explain.
 



Firstly lets import all our required packages...


In [2]:
import pandas as pd
import os
import spacy
import codecs
import pickle as pkl
import pandas as pd
from datetime import datetime

from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


## Labelling and TfidfVectorizer

Here firstly we will take the folder in which the labelled folders are located and run LabelEncoder()

<h3> LabelEncoder() </h3>
- Basically what you need to understand here is this function converts labels into numeric form  so  
    our ML models can work properly with them. This is an important Preprocessing Step.
  

<h3> TfidfVectorizer() </h3>  
 - Term Frequency Inverse Document Frequency.
             In TfidfVectorizer we consider overall document weightage of a word. It helps us in dealing with   
            most frequent words. TfidfVectorizer weights the word counts by a measure of how often they appear in the documents.    
 
     


In [3]:
#Label Encoding
TRAIN_DATA_DIR = "data"
data_dir = TRAIN_DATA_DIR 

def set_labels():
    categories = [f.lower() for f in os.listdir(data_dir)
                    if os.path.isdir(os.path.join(data_dir, f))]
    le = LabelEncoder()
    le.fit(categories)
    print( "Categories (Labels) : ", categories)
    return le

label_encoder = set_labels()

# TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=2000, decode_error='replace', min_df=2)



Categories (Labels) :  ['business', 'entertainment', 'politics', 'sport', 'tech']


### Text Processing
We will now preprocess the text data.  
We have two methods inside a DataProcessing Class.  

process_data() - In this function we are taking the text files and the labels and storing them in a dataframe.  
We are applying tfidfVectorizer on our dataset.

process_test_data() - same for testing for sample input

In [4]:
class DataProcessing:

    def process_data(self):
        print(" Processing data ")
        dataset = pd.DataFrame()
        categories = label_encoder.classes_
        # Loop through the Directory , loop through the files, process them like removing spaces, making them lowercase 
        for cat in categories: # folders
            nlp = spacy.load('en_core_web_sm')
            for f in os.listdir(os.path.join(TRAIN_DATA_DIR, cat)): # files in a folder
                try:
                    f_read = open(os.path.join(TRAIN_DATA_DIR, cat, f), 'r') 
                    
                    doc = " ".join( [ line.strip().lower() for line in f_read ] )
                    doc_vec = " ".join([ word.lemma_ for word in nlp(doc)
                                         if not (word.is_space or word.is_stop or word.is_punct or ('\n' in word.text) ) ] )
                
                    label = pd.Series(label_encoder.transform([cat]), dtype='Int64')
        
                    if len(doc_vec.strip()) > 0:
                        dataset = dataset.append({'sentences': doc_vec, 'labels': label[0]}, ignore_index=True)
              
                except ValueError as e:
                    print(f)
                    print(e)

        #Creating the DataFrame and applying tfidf to the texts.    
        vectors = pd.DataFrame(tfidf_vectorizer.fit_transform(dataset['sentences']).toarray())
        dataset = dataset.join(vectors)
        #Splitting the Dataset into Train and Test
        data_train, data_test = train_test_split(dataset, test_size=0.3, random_state=1234)
        #print(data_train.head())
        #print(data_test.head())
        return data_train, data_test

    # process test phrases on new data
    def process_test_data(self, test_data):
        test_files = []
        if os.path.isdir(test_data):
            test_files += [os.path.join(test_data, f) for f in os.listdir(test_data)]
        else:
            test_files.append(test_data)
        
        test_dataset = pd.DataFrame()
        nlp = spacy.load('en_core_web_sm')
        
        for f in test_files:
            doc = " ".join([line.lower() for line in open(f, 'r', encoding='utf-8')])
            doc_vec = " ".join([word.lemma_ for word in nlp(doc)
                                if not (word.is_space or word.is_stop or word.is_punct or ('\n' in word.text))])
            
            test_dataset = test_dataset.append({'sentences': doc_vec, 'file_names': f}, ignore_index=True)
        
        return test_dataset.join(pd.DataFrame(tfidf_vectorizer.transform(test_dataset['sentences']).toarray()))

## Now lets call our Class
 and call the process_data() method on the dataset
 

In [5]:
proc = DataProcessing()
encoder = label_encoder
vectorizer = tfidf_vectorizer
data_train, data_test = proc.process_data()  


 Processing data 


In [6]:
data_train.head(3)

Unnamed: 0,sentences,labels,0,1,2,3,4,5,6,7,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
555,howl help boost japan cinema japan box office ...,1.0,0.0,0.0,0.0,0.0,0.0,0.060623,0.0,0.0,...,0.0,0.0,0.0,0.116785,0.422422,0.0,0.0,0.0,0.0,0.0
1825,china net cafe culture crackdown chinese autho...,4.0,0.029559,0.0,0.040124,0.0,0.076752,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.082146,0.0,0.0,0.0
263,french suitor hold lse meeting european stock ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.038002,0.0,0.0,0.0,0.0


In [7]:
data_test.head(3)

Unnamed: 0,sentences,labels,0,1,2,3,4,5,6,7,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
1014,lord scarman 93 die peacefully distinguished l...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017538,0.0,0.0,0.042417,0.0,0.0,0.0
2136,high definition dvds humble home video dvd hol...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.009927,0.0,0.0,0.0,0.0,0.0,0.0
441,lesotho textile worker lose job foreign own te...,0.0,0.043979,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Model Generation
Lets create a class and make our models as methods.
We will use GridSearchCV for Hyperparameter Tuning.

We have used Naive Bayes, KNN , SVM , RandomForest.




In [8]:
class ModelGenerator:

    def __init__(self, data_train, data_test=None):
        self.data_train = data_train
        self.data_test = data_test

    def naive_bayes(self):
        model = MNB()
        params = {}
        return self.cross_validation(model,params)

    def knn(self): 
        model = KNN()
        params = {'n_neighbors': [3, 4, 5, 6],
                  'weights': ['uniform', 'distance'], 
                  'metric': ['euclidean', 'manhattan']}
        return self.cross_validation(model, params) 

    def svm(self):
        model = SVC()
        params = {'C': [0.1, 1, 10, 100, 1000],
                  'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                  'kernel': ['rbf']}
        return self.cross_validation(model, params)

    def random_forest(self):
        model = RFC() 
        params = {'n_estimators': [100, 200, 400],
                  'criterion': ['gini', 'entropy'],
                  'max_features': ['auto', 'sqrt', 'log2']}
        return self.cross_validation(model, params) 

    def cross_validation(self, model, params):
        cols = self.data_train.columns.difference(['labels', 'sentences'])
        cv = KFold(n_splits=10, random_state=1, shuffle=True)
        gcv = GridSearchCV(estimator=model, param_grid=params, scoring='accuracy', n_jobs=-1, cv=cv)
        gcv.fit(self.data_train[cols], self.data_train['labels'])
        return gcv
        

## Model Initialization
Creating our models now .

In [9]:
# generate and train models 
model_gen = ModelGenerator(data_train) 
nb = model_gen.naive_bayes()
knn = model_gen.knn()      
svm = model_gen.svm() 
rf = model_gen.random_forest() 

models = {'nb': nb, 'knn': knn, 'svm': svm, 'rf':rf}

## Saving our models and Evaluations
Save all the models by just calling the save function.  
We will print a classification report to see the details of the models

In [28]:
SAVE_MODEL_PATH = "save_model"  



def evaluate_model(cl_model, test_data):
    print(cl_model.best_estimator_) 
    cols = test_data.columns.difference(['sentences', 'labels'])
    predictions = list(map(int, cl_model.predict(test_data[cols])))
    c_report = classification_report(test_data['labels'], predictions)
    print(c_report)

def save_model(name, model_files):
    filename = os.path.join(SAVE_MODEL_PATH, name + '.pkl')
    with open(filename, 'wb') as fh:
        pkl.dump(model_files, fh)



In [29]:
print("Naive Bayes: ")
evaluate_model(models['nb'], data_test )

Naive Bayes: 
MultinomialNB()
              precision    recall  f1-score   support

         0.0       0.97      0.92      0.95       172
         1.0       0.97      0.95      0.96       113
         2.0       0.90      0.97      0.93       115
         3.0       0.98      0.99      0.99       140
         4.0       0.95      0.95      0.95       128

    accuracy                           0.96       668
   macro avg       0.95      0.96      0.95       668
weighted avg       0.96      0.96      0.96       668



In [30]:
print("KNN: ")
evaluate_model(models['knn'], data_test )

KNN: 
KNeighborsClassifier(metric='euclidean', n_neighbors=6, weights='distance')
              precision    recall  f1-score   support

         0.0       0.95      0.86      0.91       172
         1.0       0.95      0.89      0.92       113
         2.0       0.84      0.97      0.90       115
         3.0       0.95      0.99      0.97       140
         4.0       0.95      0.94      0.94       128

    accuracy                           0.93       668
   macro avg       0.93      0.93      0.93       668
weighted avg       0.93      0.93      0.93       668



In [31]:
print("SVM: ")
evaluate_model(models['svm'], data_test )

SVM: 
SVC(C=1, gamma=1)
              precision    recall  f1-score   support

         0.0       0.98      0.91      0.94       172
         1.0       0.97      0.99      0.98       113
         2.0       0.91      0.96      0.93       115
         3.0       0.97      0.99      0.98       140
         4.0       0.97      0.95      0.96       128

    accuracy                           0.96       668
   macro avg       0.96      0.96      0.96       668
weighted avg       0.96      0.96      0.96       668



In [32]:
print("Random Forest: ")
evaluate_model(models['rf'], data_test )

Random Forest: 
RandomForestClassifier(max_features='log2', n_estimators=200)
              precision    recall  f1-score   support

         0.0       0.95      0.94      0.95       172
         1.0       0.95      0.94      0.95       113
         2.0       0.93      0.94      0.94       115
         3.0       0.95      0.99      0.97       140
         4.0       0.97      0.94      0.95       128

    accuracy                           0.95       668
   macro avg       0.95      0.95      0.95       668
weighted avg       0.95      0.95      0.95       668



In [11]:
# save models - save to pickle file : model object & processor object
for name,model in models.items():
    save_model(name , { 'model': model, 'processor': proc })



## Testing the data
for samples.

In [14]:
TEST_DATA_DIR = "test_data"
   

# load a save model and predict from given test files
def run_save_model(model_file_path):
    model_files = pkl.load(open(model_file_path, 'rb'))
    processor = model_files['processor']
    model = model_files['model']
    
    test_data = processor.process_test_data(TEST_DATA_DIR) # get file names
    cols = test_data.columns.difference(['sentences', 'file_names']) #take all colms except 2
    predict = list(map(int, model.predict(test_data[cols]))) 

    results = pd.DataFrame({'file_name': test_data['file_names'], 
                        'classification': label_encoder.inverse_transform(predict)}) # inverse transform on encoder to get classes label 
    return results


saved_model_path = 'save_model/nb.pkl'
results = run_save_model(saved_model_path)


## Results!
Woohoo we have successfully classified the text files and trained our models with model tuning.
 

In [15]:
results

Unnamed: 0,file_name,classification
0,test_data\008.txt,entertainment
1,test_data\business.txt,business
2,test_data\entertainment.txt,entertainment
3,test_data\sport.txt,sport
4,test_data\tech.txt,tech
