In [1]:
#import nltk
#nltk.download('averaged_perceptron_tagger')

In [2]:
#CSC620
#HA9 -- sklearn's Pipeline and FeatureUnion
#Paula Abigail Tam
#a. Uses Logistic Regression instead of Random Forest classifier.
#b. Adds three new features: Number of adjectives, Number of nouns, Number of verbs. 
#c. At the end of the blocks 9 and 13, add the function 'classification_report()' to print the detailed evaluation report (prec, recall, f1 etc.).

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

df = pd.read_csv('./HA9_csv/train.csv') #My path to the train.csv file is this
#('../input/train.csv') 

df.dropna(axis=0) #.dropna removes any missing values, so there is no empty values in the data
    #documentation: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html
df.set_index('id', inplace = True)

df.head()

Unnamed: 0_level_0,text,author
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id26305,"This process, however, afforded me no means of...",EAP
id17569,It never once occurred to me that the fumbling...,HPL
id11008,"In his left hand was a gold snuff box, from wh...",EAP
id27763,How lovely is spring As we looked from Windsor...,MWS
id12958,"Finding nothing else, not even gold, the Super...",HPL


In [3]:
import re #regex
import nltk
from nltk.corpus import stopwords

stopWords = set(stopwords.words('english')) #makes a set of the stopwords 

#creating a function to encapsulate preprocessing, to make it easy to replicate on  submission data
def processing(df):
    #lowering and removing punctuation
    df['processed'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]','', x.lower()))
    #numerical feature engineering
    #total length of sentence
    df['length'] = df['processed'].apply(lambda x: len(x))
    #get number of words
    df['words'] = df['processed'].apply(lambda x: len(x.split(' '))) #total words
    df['words_not_stopword'] = df['processed'].apply(lambda x: len([t for t in x.split(' ') if t not in stopWords])) #words that arent stopwords
    #get the average word length
    df['avg_word_length'] = df['processed'].apply(lambda x: np.mean([len(t) for t in x.split(' ') if t not in stopWords]) if len([len(t) for t in x.split(' ') if t not in stopWords]) > 0 else 0)
    df['commas'] = df['text'].apply(lambda x: x.count(',')) #count the number of commas in the text
    
    return(df)

df = processing(df)

df.head()

Unnamed: 0_level_0,text,author,processed,length,words,words_not_stopword,avg_word_length,commas
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
id26305,"This process, however, afforded me no means of...",EAP,this process however afforded me no means of a...,224,41,21,6.380952,4
id17569,It never once occurred to me that the fumbling...,HPL,it never once occurred to me that the fumbling...,70,14,6,6.166667,0
id11008,"In his left hand was a gold snuff box, from wh...",EAP,in his left hand was a gold snuff box from whi...,195,36,19,5.947368,4
id27763,How lovely is spring As we looked from Windsor...,MWS,how lovely is spring as we looked from windsor...,202,34,21,6.47619,3
id12958,"Finding nothing else, not even gold, the Super...",HPL,finding nothing else not even gold the superin...,170,27,16,7.1875,2


In [4]:
from sklearn.model_selection import train_test_split

features= [c for c in df.columns.values if c not in ['id','text','author']] #if it is not in the text, author, id columns
numeric_features= [c for c in df.columns.values if c not in ['id','text','author','processed']] #all the numeric features
target = 'author'
#splitting the train set using just the processed and author columns, iterates the whole processed column each time?
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.33, random_state=42) #test will be 1/3 and train will be 2/3
X_train.head()

Unnamed: 0_level_0,processed,length,words,words_not_stopword,avg_word_length,commas
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
id19417,this panorama is indeed glorious and i should ...,91,18,6,6.666667,1
id09522,there was a simple natural earnestness about h...,240,44,18,6.277778,4
id22732,who are you pray that i duc de lomelette princ...,387,74,38,5.552632,9
id10351,he had gone in the carriage to the nearest tow...,118,24,11,5.363636,0
id24580,there is no method in their proceedings beyond...,71,13,5,7.0,1


In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
#BaseEstimator documentation: https://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html
#this is the base for all estimators

#TransformerMixin documentation: https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html
#a mixin is a class that provides methods for other classes
#so the TransformerMixin specifically is a mixin class for all transformers

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
#these class definitions is so we are able to pipeline the columns in our dataframe
    

In [6]:
from sklearn.pipeline import Pipeline #documentation: https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
#pipelines sequentially apply a list transform + give a final estimator
#in this case the transforms are either TextSelector or NumberSelector
from sklearn.feature_extraction.text import TfidfVectorizer

#start creating the first pipeline with the processed text
#since text is... well text, we use the TextSelector for our transformation
text = Pipeline([
                ('selector', TextSelector(key='processed')),
                ('tfidf', TfidfVectorizer( stop_words='english'))
            ])

text.fit_transform(X_train)

<13117x21516 sparse matrix of type '<class 'numpy.float64'>'
	with 148061 stored elements in Compressed Sparse Row format>

In [7]:
from sklearn.preprocessing import StandardScaler #documentation: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
#standardizing the feature(s) -> removes mean and scaling to unit variance
#weird behavior might happen if the feature data doesnt look like a standard normal distribution 

#length is numeric, so use the NumberSelector transformer
length =  Pipeline([
                ('selector', NumberSelector(key='length')),
                ('standard', StandardScaler())
            ])

length.fit_transform(X_train)

array([[-0.50769254],
       [ 0.88000324],
       [ 2.24907223],
       ...,
       [-0.46112557],
       [-0.14447015],
       [-0.39593181]])

In [8]:
#more pipelines to make the features

words =  Pipeline([
                ('selector', NumberSelector(key='words')),
                ('standard', StandardScaler())
            ])
words_not_stopword =  Pipeline([
                ('selector', NumberSelector(key='words_not_stopword')),
                ('standard', StandardScaler())
            ])
avg_word_length =  Pipeline([
                ('selector', NumberSelector(key='avg_word_length')),
                ('standard', StandardScaler())
            ])
commas =  Pipeline([
                ('selector', NumberSelector(key='commas')),
                ('standard', StandardScaler()),
            ])

In [9]:
from sklearn.pipeline import FeatureUnion #documentation: https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.FeatureUnion.html
#FeatureUnion concatenates transformers together
#in this case it can concatenate all of our features

#make all our features all in one variable ==> creates one transformer
feats = FeatureUnion([('text', text), 
                      ('length', length),
                      ('words', words),
                      ('words_not_stopword', words_not_stopword),
                      ('avg_word_length', avg_word_length),
                      ('commas', commas)])

feature_processing = Pipeline([('feats', feats)]) #using pipeline on feats transformer
feature_processing.fit_transform(X_train)

<13117x21521 sparse matrix of type '<class 'numpy.float64'>'
	with 213646 stored elements in Compressed Sparse Row format>

In [10]:
#from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression #replacing the RandomForestClassifier with Logistic Regression
from sklearn.metrics import classification_report #https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html

#using the feats transformer + classifier in the new pipeline
pipeline = Pipeline([
    ('features',feats),
    ('classifier', LogisticRegression(random_state = 42, max_iter=500)), # replaced RandomForestClassifier(random_state = 42)) with the LogisticRegression version
])

pipeline.fit(X_train, y_train)

preds = pipeline.predict(X_test)
print(np.mean(preds == y_test)) #the prediction
print(classification_report(y_test, preds)) #the classification report

0.7808727948003714
              precision    recall  f1-score   support

         EAP       0.74      0.85      0.79      2587
         HPL       0.81      0.74      0.77      1852
         MWS       0.83      0.73      0.78      2023

    accuracy                           0.78      6462
   macro avg       0.79      0.77      0.78      6462
weighted avg       0.79      0.78      0.78      6462



In [11]:
#skipping the CV part

#imports to get the part-of-speech tags and tokenizer
from nltk import pos_tag
from nltk import word_tokenize

#add the pos tags to the words in each text in each row
def add_pos_tags(df):
    df['pos'] = df['processed'].apply(lambda x: nltk.pos_tag(word_tokenize(x)))
    return df

df = add_pos_tags(df)
df.head()

In [12]:
#adding the 3 features: noun, adj, verb
def process_pos(df, pos): 
    full_list = []
    for row in df['pos']:
        #new_list = [] #initially had a list of tuples
        count = 0
        for key, val in row:
            if val == pos: #if the value matches with the given string (e.g. 'JJ', NN', 'VB')
                #new_list.append((key, val)) #put tuple with the word (key) and pos (val)
                count += 1 #increment count
        full_list.append(count) #put whole list of that row
    return full_list #returns list of lengths of the number of whatever pos was specified

#checking if the lengths are all the same
#print(len(df))
#print(len(process_pos(df, 'NN')))
#print(len(process_pos(df, 'JJ')))
#print(len(process_pos(df, 'VB')))

In [13]:
pos_adj = process_pos(df, 'JJ') #JJ is adjective
pos_n = process_pos(df, 'NN') #NN is noun
pos_v = process_pos(df, 'VB') #VB is verb

#making the new columns for our new features
df['adjectives'] = pos_adj
df['nouns'] = pos_n
df['verbs'] = pos_v

df.head() #to check our new dataframe w/ adj, noun, and verb

Unnamed: 0_level_0,text,author,processed,length,words,words_not_stopword,avg_word_length,commas,pos,adjectives,nouns,verbs
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
id26305,"This process, however, afforded me no means of...",EAP,this process however afforded me no means of a...,224,41,21,6.380952,4,"[(this, DT), (process, NN), (however, RB), (af...",2,10,1
id17569,It never once occurred to me that the fumbling...,HPL,it never once occurred to me that the fumbling...,70,14,6,6.166667,0,"[(it, PRP), (never, RB), (once, RB), (occurred...",1,2,1
id11008,"In his left hand was a gold snuff box, from wh...",EAP,in his left hand was a gold snuff box from whi...,195,36,19,5.947368,4,"[(in, IN), (his, PRP$), (left, JJ), (hand, NN)...",4,9,0
id27763,How lovely is spring As we looked from Windsor...,MWS,how lovely is spring as we looked from windsor...,202,34,21,6.47619,3,"[(how, WRB), (lovely, RB), (is, VBZ), (spring,...",5,6,0
id12958,"Finding nothing else, not even gold, the Super...",HPL,finding nothing else not even gold the superin...,170,27,16,7.1875,2,"[(finding, VBG), (nothing, NN), (else, RB), (n...",1,5,1


In [14]:
#copying the previous code to make the test and train sets, but it now includes the adj, noun, and verb columns and excludes the pos column
features= [c for c in df.columns.values if c not in ['id','text','author', 'pos']] #everything else other than the stuff we dont care about (not data)
numeric_features= [c for c in df.columns.values if c not in ['id','text','author','processed', 'pos']] #the features with numeric values
target = 'author'
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.33, random_state=42) #test will be 1/3 and train will be 2/3
X_train.head()

Unnamed: 0_level_0,processed,length,words,words_not_stopword,avg_word_length,commas,adjectives,nouns,verbs
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
id19417,this panorama is indeed glorious and i should ...,91,18,6,6.666667,1,1,4,1
id09522,there was a simple natural earnestness about h...,240,44,18,6.277778,4,6,6,0
id22732,who are you pray that i duc de lomelette princ...,387,74,38,5.552632,9,1,15,4
id10351,he had gone in the carriage to the nearest tow...,118,24,11,5.363636,0,0,6,0
id24580,there is no method in their proceedings beyond...,71,13,5,7.0,1,0,3,0


In [15]:
#text = Pipeline([
                #('selector', TextSelector(key='processed')),
                #('tfidf', TfidfVectorizer( stop_words='english'))
            #])

#copied this line since X_train is different this time
text.fit_transform(X_train)

<13117x21516 sparse matrix of type '<class 'numpy.float64'>'
	with 148061 stored elements in Compressed Sparse Row format>

In [16]:
#making new pipelines for the pos following the previous examples
#used NumberSelector since the values of these columns are all numeric
adjectives =  Pipeline([
                ('selector', NumberSelector(key='adjectives')),
                ('standard', StandardScaler()),
            ])
nouns =  Pipeline([
                ('selector', NumberSelector(key='nouns')),
                ('standard', StandardScaler()),
            ])
verbs =  Pipeline([
                ('selector', NumberSelector(key='verbs')),
                ('standard', StandardScaler()),
            ])

In [17]:
#also basically copy pasted the previous stuff, just added the new features
feats = FeatureUnion([('text', text), 
                      ('length', length),
                      ('words', words),
                      ('words_not_stopword', words_not_stopword),
                      ('avg_word_length', avg_word_length),
                      ('commas', commas),
                      ('adjectives', adjectives),
                      ('nouns', nouns),
                      ('verbs', verbs)])

#using the premade feature + the new pos features
#process the new features
feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(X_train)

<13117x21524 sparse matrix of type '<class 'numpy.float64'>'
	with 252997 stored elements in Compressed Sparse Row format>

In [18]:
#basically copy pasted the same thing as before
#except named each variable with the suffix _pos to differentiate between the previous one
pipeline_pos = Pipeline([
    ('features',feats),
    ('classifier', LogisticRegression(random_state = 42, max_iter=500)),
])

pipeline_pos.fit(X_train, y_train)

preds_pos = pipeline_pos.predict(X_test)
print(np.mean(preds_pos == y_test))
print(classification_report(y_test, preds_pos))

0.7862890745899103
              precision    recall  f1-score   support

         EAP       0.74      0.85      0.79      2587
         HPL       0.81      0.74      0.78      1852
         MWS       0.84      0.74      0.79      2023

    accuracy                           0.79      6462
   macro avg       0.80      0.78      0.79      6462
weighted avg       0.79      0.79      0.79      6462



In [20]:
#After adding the 3 new features ('adjectives', 'nouns', and 'verbs'), the precision did increase, but only by about .006.
#I think it increased because of the addition of new features slightly helped the accuracy.
#more features = more accurate
#Though the increase is small. I think this is because the pos_tags I used for the adjectives, nouns, and verbs were just the basic ones (JJ, NN, VB).
#If I had added the more tags (e.g. if I added the tags JJR (adjective, comparative) and JJS (adjective, superlative)) to the adjective column, the count would probably increase.
#page I looked at for the pos tag abbreviations: https://www.guru99.com/pos-tagging-chunking-nltk.html