In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Read the labelled training passages from the Kaggle input directory.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/authors/train-authors.csv')

# Show the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,text,author
0,"She wanted clothes to keep her warm, and food...",dickens
1,"The question now was, who was the man,\nand w...",doyle
2,I therefore\n smoked a great number of t...,doyle
3,I am partial to the modern\nFrench school. \n...,doyle
4,"” She stood smiling, holding up a little slip ...",doyle


In [2]:
# Look at the last five rows of the training frame.
df.tail(n=5)

Unnamed: 0,text,author
29995,It ain't anything. There ain't no harm in it...,twain
29996,In my\nyouth the monarchs of England had cea...,twain
29997,"Bob Sawyer nodded. \n\n‘So are you, sir,’ sai...",dickens
29998,"He was out on the lawn, in through the window...",doyle
29999,"“Here he is,” said he, sitting down and flatt...",doyle


In [4]:
# Preview the first five rows of the training frame again.
df.head(n=5)

Unnamed: 0,text,author
0,"She wanted clothes to keep her warm, and food...",dickens
1,"The question now was, who was the man,\nand w...",doyle
2,I therefore\n smoked a great number of t...,doyle
3,I am partial to the modern\nFrench school. \n...,doyle
4,"” She stood smiling, holding up a little slip ...",doyle


In [5]:
# Count passages per author (most frequent first) to check class balance.
df['author'].value_counts(ascending=False)

author
defoe      7569
dickens    7493
twain      7478
doyle      7460
Name: count, dtype: int64

In [6]:
# Number of missing values in each column.
df.isnull().sum()

text      0
author    0
dtype: int64

**Multinomial Naive Bayes with TF-IDF features**

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the passages for validation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['author'],
    random_state=42,
    test_size=0.2,
)

In [8]:
# TF-IDF features with English stop words removed and the vocabulary
# capped at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

In [9]:
# Learn the vocabulary and IDF weights from the training split only, then
# apply the fitted vectorizer unchanged to the held-out split.
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=X_test)

In [10]:


# Fit multinomial naive Bayes on the TF-IDF training matrix
# (`fit` returns the estimator, so construction and fitting are chained).
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict author labels for the held-out split.
y_pred = model.predict(X_test_tfidf)

In [11]:
# Per-author precision / recall / F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

       defoe       0.88      0.94      0.91      1548
     dickens       0.89      0.83      0.86      1515
       doyle       0.85      0.86      0.86      1443
       twain       0.85      0.84      0.84      1494

    accuracy                           0.87      6000
   macro avg       0.87      0.87      0.87      6000
weighted avg       0.87      0.87      0.87      6000



**XGBoost with unigram and bigram counts**

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

In [13]:
# Encode author names as integer class ids for the XGBoost classifier
# fitted later in the notebook.
#
# `df['author']` is overwritten with the integer codes because later cells
# read that column. The original names are kept in `author_name` and the
# encoder is always fitted on that column, so re-running this cell gives the
# same name -> id mapping instead of re-encoding already-encoded integers
# (which would leave `lb` unable to transform or invert author names).
lb = LabelEncoder()
if 'author_name' not in df.columns:
    df['author_name'] = df['author']
df['author'] = lb.fit_transform(df['author_name'])


In [14]:

# Re-split now that `df['author']` holds the integer codes produced by the
# label encoder; same test fraction and seed as the first split.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['author'],
    random_state=42,
    test_size=0.2,
)

In [15]:


# Bag-of-words counts over unigrams and bigrams. The vocabulary is learned
# from the training split only and reused for the held-out split.
vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train_vectorized = vectorizer.fit_transform(raw_documents=X_train)
X_test_vectorized = vectorizer.transform(raw_documents=X_test)

In [16]:


# Gradient-boosted trees on the n-gram count matrix, evaluated with
# multi-class log loss.
# `use_label_encoder` is deprecated in XGBoost and no longer has any effect,
# so it is not passed; the labels are already integer-encoded upstream.
# `random_state` is set explicitly so the run does not depend on the
# library's default seed.
model = XGBClassifier(eval_metric='mlogloss', random_state=42)
model.fit(X_train_vectorized, y_train)

In [17]:

from sklearn.metrics import f1_score

# Score the classifier on the held-out split using support-weighted F1.
y_pred = model.predict(X_test_vectorized)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

# Report the validation score.
print(f'F1 Score: {f1}')


F1 Score: 0.8241606469074798


In [19]:
# Load the separate labelled test file and preview its first five rows.
df_test = pd.read_csv(filepath_or_buffer='/kaggle/input/authorstest/test-authors.csv')
df_test.head(n=5)

Unnamed: 0,text,author
0,"Carton,” said the man of business. “Good nig...",dickens
1,"_Is taken, and\nhow_, 154. _Tried, condemned...",defoe
2,Through a cousin who\n works with Gelder...,doyle
3,"\n\nIndeed, nothing was more strange than to s...",defoe
4,\n\nOn the rocks above the present city of Alt...,twain


In [21]:
from sklearn.metrics import f1_score

# Vectorize the external test passages with the vocabulary already fitted on
# the training split. A dedicated name is used so the raw-text `X_test` from
# the train/validation split is not overwritten with a sparse matrix, which
# would break the vectorizing cell if it were re-run afterwards.
X_holdout_vectorized = vectorizer.transform(df_test['text'])

predicted_authors_numeric = model.predict(X_holdout_vectorized)

# Map the integer class ids back to author names; these are what get
# written to the predictions CSV.
predicted_authors = lb.inverse_transform(predicted_authors_numeric)

# Encode the true author names with the same encoder so that the truth and
# the predictions share one label space.
y_test_numeric = lb.transform(df_test['author'])

f1 = f1_score(y_test_numeric, predicted_authors_numeric, average='weighted')

print(f'F1 Score on the test set: {f1}')


F1 Score on the test set: 0.8264101669285827


In [22]:
import pandas as pd

# One-column frame of predicted author names, written to CSV without the
# row index.
predictions_df = pd.DataFrame(data={'author': predicted_authors})
predictions_df.to_csv(path_or_buf='predicted_authors.csv', index=False)
