In [552]:
import os

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.offline as py
import seaborn as sns


In [553]:
# Location of the training CSV. The default is the path this notebook was
# originally run with; set the AUTHOR_ID_TRAIN_CSV environment variable to
# point at the file on any other machine instead of editing the notebook.
TRAIN_CSV_PATH = os.environ.get(
    "AUTHOR_ID_TRAIN_CSV",
    "C:\\Users\\Shaima\\Documents\\GitHub\\AuthorIdentification\\Dataset\\train.csv",
)

# Comma-separated file whose text fields are wrapped in double quotes.
df = pd.read_csv(TRAIN_CSV_PATH, sep=',', quotechar='"')

In [554]:
# Preview the first rows of the freshly loaded training data.
df.head()


Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [555]:
# (number of rows, number of columns) of the training frame.
df.shape

(19579, 3)

In [556]:
# Number of missing values in each column.
df.isna().sum()

id        0
text      0
author    0
dtype: int64

In [557]:
# Flag every row that exactly repeats an earlier row, then report whether
# at least one such row exists.
duplicate_mask = df.duplicated()
check_duplicates = duplicate_mask.any()
print(check_duplicates)

False


In [558]:
# Remove rows that are exact duplicates of another row. The check in the
# previous cell printed False, so this is a safeguard only.
df = df.drop_duplicates() # drop duplicated rows, if any

In [559]:
# Per-column summary (count / unique / top / freq for these string columns).
df.describe()

Unnamed: 0,id,text,author
count,19579,19579,19579
unique,19579,19579,3
top,id26305,"This process, however, afforded me no means of...",EAP
freq,1,1,7900


In [560]:
# Column dtypes, non-null counts and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19579 entries, 0 to 19578
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      19579 non-null  object
 1   text    19579 non-null  object
 2   author  19579 non-null  object
dtypes: object(3)
memory usage: 459.0+ KB


In [561]:
# Display names for the author codes stored in the `author` column.
z = {'EAP': 'Edgar Allan Poe', 'MWS': 'Mary Shelley', 'HPL': 'HP Lovecraft'}

# Count the sentences per author once and derive BOTH axes from that single
# Series. Previously the labels came from `map(z).unique()` (order of first
# appearance in the file) while the heights came from `value_counts()`
# (sorted by descending frequency), so a bar could be labelled with one
# author while showing another author's count.
author_counts = df.author.value_counts()
author_labels = author_counts.index.map(z).tolist()

data = [go.Bar(x = author_labels,
               y = author_counts.values,
               marker = dict(colorscale='Jet',
                             color = author_counts.values),
               text = 'Text entries attributed to Author')]

layout = go.Layout(title='Target variable distribution')

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='FrequenciesOfAuthors')

In [562]:
# Use the first sentence of the corpus as a worked example.
first_text = df['text'].iloc[0]
print(first_text)
print('=' * 90)
# Naive tokenization: cutting on single spaces keeps punctuation glued to words.
naive_tokens = first_text.split(' ')
print(naive_tokens)

This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.
['This', 'process,', 'however,', 'afforded', 'me', 'no', 'means', 'of', 'ascertaining', 'the', 'dimensions', 'of', 'my', 'dungeon;', 'as', 'I', 'might', 'make', 'its', 'circuit,', 'and', 'return', 'to', 'the', 'point', 'whence', 'I', 'set', 'out,', 'without', 'being', 'aware', 'of', 'the', 'fact;', 'so', 'perfectly', 'uniform', 'seemed', 'the', 'wall.']


In [563]:
# Splitting on spaces leaves punctuation attached to the neighbouring word (for example,
# "process," is treated as a single term), so we tokenize with NLTK's
# natural-language-processing functions instead.

In [564]:
import nltk

# nltk.word_tokenize (used in the next cell) needs the Punkt tokenizer data.
# Recent NLTK releases look it up under the 'punkt_tab' resource rather than
# the older 'punkt' one, so fetch both: with only 'punkt' installed,
# word_tokenize raises LookupError on a fresh environment with a current NLTK.
# nltk.download reports failure through its return value rather than raising,
# so requesting both resources is safe on older installations too.
nltk.download('punkt_tab')
nltk.download('punkt')



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shaima\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [565]:
# Tokenize the example sentence with NLTK so that punctuation marks become
# tokens of their own instead of staying attached to words.
first_text_list = nltk.tokenize.word_tokenize(first_text)
print(first_text_list)

['This', 'process', ',', 'however', ',', 'afforded', 'me', 'no', 'means', 'of', 'ascertaining', 'the', 'dimensions', 'of', 'my', 'dungeon', ';', 'as', 'I', 'might', 'make', 'its', 'circuit', ',', 'and', 'return', 'to', 'the', 'point', 'whence', 'I', 'set', 'out', ',', 'without', 'being', 'aware', 'of', 'the', 'fact', ';', 'so', 'perfectly', 'uniform', 'seemed', 'the', 'wall', '.']


In [566]:
nltk.download('stopwords')
# NLTK's English stop-word list; these words are filtered out of the example
# sentence in the next cell.
stopwords = nltk.corpus.stopwords.words('english')
# The bare `len(stopwords)` that used to sit here was a discarded expression
# (only a cell's last expression is displayed), so print the size explicitly.
print(len(stopwords))
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shaima\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [567]:
# Keep only the tokens whose lower-cased form is not a stop word.
first_text_list_cleaned = list(
    filter(lambda token: token.lower() not in stopwords, first_text_list)
)
print(first_text_list_cleaned)
print('=' * 90)

# Report how much shorter the token list became.
length_report = ('Length of the original list: {0} words\n'
                 'Length of the list after stopwords removal: {1} words')
print(length_report.format(len(first_text_list), len(first_text_list_cleaned)))

['process', ',', 'however', ',', 'afforded', 'means', 'ascertaining', 'dimensions', 'dungeon', ';', 'might', 'make', 'circuit', ',', 'return', 'point', 'whence', 'set', ',', 'without', 'aware', 'fact', ';', 'perfectly', 'uniform', 'seemed', 'wall', '.']
Length of the original list: 48 words
Length of the list after stopwords removal: 28 words


In [568]:

# Target labels: the author code of every sentence. The feature matrix X is
# assigned further down, once the text column has been cleaned.
Y = df.author

In [569]:
import re

# Matches every character that is neither an ASCII letter nor whitespace.
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')


def _normalize_sentence(sentence):
    """Lower-case `sentence` and delete every non-letter, non-whitespace character."""
    return _NON_LETTER_PATTERN.sub('', sentence.lower())


# Normalize the whole text column in a single pass.
df['text'] = df['text'].apply(_normalize_sentence)

In [570]:
# Words are reduced to their base form with a WordNet lemmatizer (the stemmer
# alternatives previously sketched here were never enabled).
nltk.download('wordnet')  # WordNet data used by the lemmatizer
lemma = nltk.stem.WordNetLemmatizer()



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Shaima\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [571]:
# Inspect the text column after lower-casing and removal of non-letter characters.
df.head()

Unnamed: 0,id,text,author
0,id26305,this process however afforded me no means of a...,EAP
1,id17569,it never once occurred to me that the fumbling...,HPL
2,id11008,in his left hand was a gold snuff box from whi...,EAP
3,id27763,how lovely is spring as we looked from windsor...,MWS
4,id12958,finding nothing else not even gold the superin...,HPL


In [572]:
def _lemmatize_as_verbs(tokens):
    """Return `tokens` with every word lemmatized as a verb (pos='v')."""
    return [lemma.lemmatize(token, pos='v') for token in tokens]


# Split each sentence on whitespace, then lemmatize its tokens; the column
# holds lists of strings afterwards.
df['text'] = df['text'].str.split().apply(_lemmatize_as_verbs)

In [573]:
# Inspect the text column now that each entry is a list of lemmatized tokens.
df.head()

Unnamed: 0,id,text,author
0,id26305,"[this, process, however, afford, me, no, mean,...",EAP
1,id17569,"[it, never, once, occur, to, me, that, the, fu...",HPL
2,id11008,"[in, his, leave, hand, be, a, gold, snuff, box...",EAP
3,id27763,"[how, lovely, be, spring, as, we, look, from, ...",MWS
4,id12958,"[find, nothing, else, not, even, gold, the, su...",HPL


In [574]:
# Glue every token list back into one space-separated string, because the
# vectorizer applied later works on raw strings.
df['text'] = df['text'].apply(lambda tokens: ' '.join(str(token) for token in tokens))
X = df['text']


In [575]:
# NOTE(review): this import rebinds the global name `stopwords`, which an
# earlier cell bound to the *list* of English stop words and which the
# stop-word filtering cell relies on. Re-running that cell after this one will
# fail. The import is kept because later cells may depend on it; consider
# renaming one of the two.
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts limited to the 200 most frequent terms of the corpus.
vectorizer = CountVectorizer(max_features=200,min_df=0.0)

# Vectorize straight from the cleaned text column. The previous
# `X = vectorizer.fit_transform(X)` replaced the text Series held in X with the
# dense count array, so executing this cell a second time tried to vectorize
# numbers and failed; reading from df['text'] keeps the cell re-runnable.
X = vectorizer.fit_transform(df['text']).toarray()
X
 

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [576]:
# Show the term-count vector of one arbitrary sample (row 19000 of X).
print(X[19000])

[0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 4 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [577]:
# Vocabulary terms, in the same order as the columns of X.
features = vectorizer.get_feature_names_out()

# Keep the COMPLETE count matrix as a labelled frame. The previous code applied
# `.head()` while building it, so `df_post_processing` silently contained only
# the first five samples (and the second `.head()` was redundant).
df_post_processing = pd.DataFrame(X, columns=features)

# Only the preview is truncated.
df_post_processing.head()

Unnamed: 0,about,after,again,all,among,an,and,any,appear,as,...,with,within,without,word,world,would,years,yet,you,your
0,0,0,0,0,0,0,1,0,0,1,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,1,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,2,0,0,2,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [578]:
# The text column now holds the cleaned, lemmatized sentences re-joined into single strings.
df.head()

Unnamed: 0,id,text,author
0,id26305,this process however afford me no mean of asce...,EAP
1,id17569,it never once occur to me that the fumble migh...,HPL
2,id11008,in his leave hand be a gold snuff box from whi...,EAP
3,id27763,how lovely be spring as we look from windsor t...,MWS
4,id12958,find nothing else not even gold the superinten...,HPL
