In [34]:
import glob
import pandas as pd
import re
import nltk
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import accuracy_score

nltk.download('stopwords')
nltk.download('wordnet')
np.random.seed(1000)

from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pierrelardet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pierrelardet/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Text Classification of Textbooks
### Pierre Lardet

This code is presented as a python notebook, using python 3.11.2. My thoughts are presented chronologically

Versions of libraries used are listed below.


### Data Cleaning

Before anything else, we need to be able to read in the text and convert it into a format which is easy to manipulate. I'm going to use a [Pandas dataframe](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html). I noticed that the Computer Science text files were nested in an extra directory so manually moved them to make the structure of text files consistent. Next, I read in all of the text files in an easily extensible manner, stored them in a 2d array labelled with their subject and created a new Pandas dataframe to make further manipulation easy.

In [2]:
subjects = ['Computer_Science', 'History', 'Maths']
raw_texts = []

def read_subject_data(subject:str)->None:
    for dir in glob.glob(f'./data/{subject}/*.txt'):
        f = open(dir, 'r')
        text = f.read()
        raw_texts.append([subject, text, dir])
        f.close()

for subject in subjects:
    read_subject_data(subject)

In [3]:
textbooks = pd.DataFrame(raw_texts, columns=['subject', 'text' ,'dir'])

textbooks = textbooks[textbooks['text'] != '']

print('-'*50)
print(f'Sample of the data: \n {textbooks.head()}')
print('-'*50)
print(f'Dimensions: {textbooks.shape}')
print('-'*50)
print(f'Counts of each subject: \n {textbooks.groupby("subject").count()}')
print('-'*50)



--------------------------------------------------
Sample of the data: 
             subject                                               text   
0  Computer_Science  4.8 Exercises 275\n4.15 [IS) <§§4.2, 4.3> One ...  \
1  Computer_Science  4.5 Fallacies and Pitfalls 26.\nFirst we find ...   
2  Computer_Science  518 Chapter 7 Large and Fast: Exploiting Memor...   
3  Computer_Science  Computers\nReconstructing the\nin the\nAncient...   
4  Computer_Science  230 Chapter 3 Arithmetic: for Computers\n3.9 [...   

                                               dir  
0  ./data/Computer_Science/Computer_Science291.txt  
1  ./data/Computer_Science/Computer_Science285.txt  
2  ./data/Computer_Science/Computer_Science534.txt  
3  ./data/Computer_Science/Computer_Science252.txt  
4  ./data/Computer_Science/Computer_Science246.txt  
--------------------------------------------------
Dimensions: (1341, 3)
--------------------------------------------------
Counts of each subject: 
               

Now we have a dataframe to work with. The text is currently very messy with lots of extra characters and spacing etc. In order to use the text as an input into a ML classification model, it needs to be much cleaner. The desired format will be a list of lower-case words in each sample which can later be converted to numeric values.

In [133]:
stopwords = nltk.corpus.stopwords.words('english')
lem = WordNetLemmatizer()
in_stopwords = lambda word: word not in stopwords

def clean_text(str:str) -> str:
    str = re.sub(r'\n', ' ', str)
    str = re.sub(r'[^a-zA-Z]+', ' ', str)
    str = str.lower()

    lst_words = str.split(' ')

    lst_words = list(filter(in_stopwords, lst_words))
    lst_words = [lem.lemmatize(word) for word in lst_words]

    return ' '.join(lst_words)

textbooks['text_clean'] = textbooks['text'].apply(clean_text)

print(textbooks.head())

            subject                                               text   
0  Computer_Science  4.8 Exercises 275\n4.15 [IS) <§§4.2, 4.3> One ...  \
1  Computer_Science  4.5 Fallacies and Pitfalls 26.\nFirst we find ...   
2  Computer_Science  518 Chapter 7 Large and Fast: Exploiting Memor...   
3  Computer_Science  Computers\nReconstructing the\nin the\nAncient...   
4  Computer_Science  230 Chapter 3 Arithmetic: for Computers\n3.9 [...   

                                               dir   
0  ./data/Computer_Science/Computer_Science291.txt  \
1  ./data/Computer_Science/Computer_Science285.txt   
2  ./data/Computer_Science/Computer_Science534.txt   
3  ./data/Computer_Science/Computer_Science252.txt   
4  ./data/Computer_Science/Computer_Science246.txt   

                                          text_clean  
0   exercise one user told three program exercise...  
1   fallacy pitfall first find execution time two...  
2   chapter large fast exploiting memory hierarch...  
3  compute

In [146]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(textbooks['text_clean'], textbooks['subject'], test_size=0.3, random_state=4)

Encoder = LabelEncoder()
train_y = Encoder.fit_transform(train_y)
test_y = Encoder.fit_transform(test_y)

Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(textbooks['text_clean'])

train_x = Tfidf_vect.transform(train_x)
test_x = Tfidf_vect.transform(test_x)

selector = SelectKBest(chi2, k = 26900)
train_x = selector.fit_transform(train_x, train_y)
test_x = selector.fit_transform(test_x, test_y)

print(train_x.shape[1])


26900


In [155]:
SVM = svm.SVC(C=1.0, kernel='linear', gamma='auto')
SVM.fit(train_x, train_y)
predictions = SVM.predict(test_x)
print(accuracy_score(test_y, predictions))
print(SVM.coef_.max())

0.9950372208436724


ValueError: WRITEBACKIFCOPY base is read-only