In [1]:
import pandas as pd

# Read the authors dataset from the CSV file in the working directory.
DATASET_CSV = 'Authors_dataset.csv'
authors_df = pd.read_csv(DATASET_CSV)

# Display the first ten rows as this cell's output.
authors_df.head(n=10)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL
5,id22965,"A youth passed in solitude, my best years spen...",MWS
6,id09674,"The astronomer, perhaps, at this point, took r...",EAP
7,id13515,The surcingle hung in ribands from my body.,EAP
8,id19322,I knew that you could not say to yourself 'ste...,EAP
9,id00912,I confess that neither the structure of langua...,MWS


In [2]:
from nltk.tokenize import word_tokenize

# Separate the target from the features: `pop` returns the 'author' column
# as a Series and removes it from authors_df in place.
# NOTE(review): because authors_df is mutated, re-running this cell without
# reloading the CSV first raises KeyError ('author' is already gone).
y = authors_df.pop('author')

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    authors_df,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Text-processing helpers, constructed once and shared by the code below.
stemmer = PorterStemmer()            # Porter stemming of individual tokens
lemmatizer = WordNetLemmatizer()     # NOTE(review): not referenced by the code visible in this notebook
tokenizer = RegexpTokenizer(r'\w+')  # tokens are the matches of \w+ (runs of word characters)

# Lazily-filled cache for the English stop-word set (see _english_stopwords).
_STOP_WORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # stopwords.words() rebuilds a list on every call; loading it once and
        # keeping a set makes each membership test O(1) instead of a list scan.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocesing(text):
    """Turn one raw document into a space-separated string of stemmed tokens.

    Steps, in order:
      1. split ``text`` with the module-level ``tokenizer``;
      2. lowercase every token;
      3. drop tokens found in NLTK's English stop-word list;
      4. stem the remaining tokens with the module-level ``stemmer``;
      5. join the stems with single spaces.

    Parameters
    ----------
    text : str
        The raw document. Any non-string value (for example ``None`` or a
        NaN from a missing CSV cell) is treated as an empty document.

    Returns
    -------
    str
        The cleaned document; ``''`` when no tokens survive or when ``text``
        is not a string.
    """
    # Missing values cannot be tokenized; map them to an empty document
    # rather than failing inside the regex tokenizer.
    if not isinstance(text, str):
        return ''

    stop_words = _english_stopwords()
    lowered_tokens = (token.lower() for token in tokenizer.tokenize(text))
    kept_stems = [
        stemmer.stem(token)
        for token in lowered_tokens
        if token not in stop_words
    ]
    return ' '.join(kept_stems)

# Add the cleaned-text column via `assign`, which returns a new DataFrame.
# x_train comes out of train_test_split as a slice of authors_df, so assigning
# a column onto it directly triggers pandas' SettingWithCopyWarning.
x_train = x_train.assign(final_text=x_train['text'].apply(preprocesing))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [5]:
# Display the cleaned training text (pandas truncates the long Series itself).
x_train.loc[:, 'final_text']

5107                             must spoken peculiar hair
5465     art life discoveri scienc augment ratio left c...
15443    idri name casual frown convuls gestur anger sh...
9737     time soon come grief famin alreadi sap foundat...
10192    great stone citi r lyeh monolith sepulchr sunk...
                               ...                        
11284    let go hold upon rod place feet secur wall spr...
11964    name john raymond legrass profess inspector polic
5390     manner wyatt receiv harmless pleasantri convin...
860      first assur boundless confid must consciou sin...
15795    thu abundantli clear gang quit barrièr du roul...
Name: final_text, Length: 15663, dtype: object

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Clean the held-out text with the same routine used for the training text.
# `assign` returns a new frame, which avoids the SettingWithCopyWarning that
# direct column assignment on the train_test_split output raises.
x_test = x_test.assign(final_text=x_test['text'].apply(preprocesing))

# TF-IDF features. The vocabulary and IDF weights are learned from the
# training text only; the test text is transformed with that fitted state.
# NOTE(review): stop_words='english' filters against scikit-learn's built-in
# list, but the text has already been stop-word-filtered and stemmed by
# preprocesing -- confirm this second filter is intended.
vectorizer = TfidfVectorizer(stop_words='english', use_idf=True)

# fit_transform learns the vocabulary and builds the training matrix in a
# single pass instead of tokenizing the training text twice.
x_train_Tfidf = vectorizer.fit_transform(x_train['final_text'])
x_test_Tfidf = vectorizer.transform(x_test['final_text'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [10]:
from sklearn.naive_bayes import MultinomialNB

# Fit and predict directly on the sparse TF-IDF matrices. MultinomialNB
# accepts scipy sparse input, so calling .toarray() first only allocates a
# dense (documents x vocabulary) array that is almost entirely zeros.
clf = MultinomialNB().fit(x_train_Tfidf, y_train)

# Predicted author label for every held-out document; shown as cell output.
y_pred = clf.predict(x_test_Tfidf)
y_pred

array(['EAP', 'MWS', 'EAP', ..., 'HPL', 'HPL', 'EAP'], dtype='<U3')

In [11]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the held-out set: rows are the true authors, columns
# the predicted authors.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[1360,   75,  135],
       [ 203,  781,   87],
       [ 198,   41, 1036]], dtype=int64)

In [13]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the held-out set. The report is a
# pre-formatted string, so it is printed rather than shown via its repr.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

         EAP       0.77      0.87      0.82      1570
         HPL       0.87      0.73      0.79      1071
         MWS       0.82      0.81      0.82      1275

    accuracy                           0.81      3916
   macro avg       0.82      0.80      0.81      3916
weighted avg       0.82      0.81      0.81      3916

