###Dataset

Name of the data set: **30Columnists**

Dataset consist of 30 author's 50 articles each in English.

Dataset: [dataset](https://www.kemik.yildiz.edu.tr/veri_kumelerimiz.html)

### Load Dataset

In [1]:
from google.colab import files
uploaded = files.upload()

Saving 30Columnists.zip to 30Columnists.zip


In [None]:
!unzip /content/30Columnists.zip -d /content

In [3]:
# Get authors
import os
PATH = "/content/30Columnists/raw_texts"

authors = []
for author in os.listdir(PATH):
  authors.append(author)

authors.sort(key = lambda a: int(a))
print(authors)

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30']


In [5]:
# Read every article of each author and append to dataset
import re
dataset = []  # [[author, article], [author, article], ...]
for author in authors:
  author_path = os.path.join(PATH, author)
  for article_file in os.listdir(author_path):
    if article_file.endswith(".txt"):
      article_path = os.path.join(author_path, article_file)
      with open(article_path, "rb") as f:
        article = f.read().decode("iso-8859-9")

        article = re.sub(r"[\n\r\f\x96]", "", article)
        dataset.append([author, article])

In [6]:
article_count = {}
for author in authors:
  article_count.update({author: 0})

for row in dataset:
  author = row[0]
  count = article_count[author]
  article_count.update({author: count+1})

for author in authors:
  print(f"{author:>2}\t{article_count[author]}")

 1	50
 2	50
 3	50
 4	50
 5	50
 6	50
 7	50
 8	50
 9	50
10	50
11	50
12	50
13	50
14	50
15	50
16	50
17	50
18	50
19	50
20	50
21	50
22	50
23	50
24	50
25	50
26	50
27	50
28	50
29	50
30	50


In [7]:
import pandas as pd

dataset = pd.DataFrame(dataset, columns=["author", "article"])
print(dataset)

     author                                            article
0         1  LOCAL shopkeepers can rest easy, everything is...
1         1  DOES Scottish Labour leader Wendy really know ...
2         1  IT was this time last year in the run up to th...
3         1  IT was all going so well. The man with the pla...
4         1  SCHADENFREUDE! No, it's not a swear word, but ...
...     ...                                                ...
1495     30  All you'd have to remember is plugging in the ...
1496     30  The income tax is the worst thing the U.S. gov...
1497     30  Americans are waking up this Christmas Day to ...
1498     30  Two cheers for the neighborhood agreement that...
1499     30  It was Thanksgiving Day, but no one at the tab...

[1500 rows x 2 columns]


###Tokenization, Punctuations, Stop Words, Case Folding

> Tokeinizing, removing punctuations and stopword, applying case folding.

In [8]:
# nltk library for tokenizing, punctuations and stopwords
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
from nltk.corpus import stopwords
nltk.download("stopwords")
stop_words = stopwords.words("english")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
print(len(stop_words), stop_words)

179 ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than

In [11]:
from nltk import word_tokenize
import re
# Tokenizing, removing punctuations and stopwords, applying case folding
for idx, row in dataset.iterrows():
  dataset.at[idx, "article"] = [token.lower() for token in word_tokenize(re.sub(r"\.", " . ", row["article"])) if token.isalpha() and (token.lower() not in stop_words)]

In [12]:
print(dataset)

     author                                            article
0         1  [local, shopkeepers, rest, easy, everything, g...
1         1  [scottish, labour, leader, wendy, really, know...
2         1  [time, last, year, run, labour, party, confere...
3         1  [going, well, man, plan, sit, back, wait, mayb...
4         1  [schadenfreude, swear, word, might, well, like...
...     ...                                                ...
1495     30  [remember, plugging, car, night, full, tank, m...
1496     30  [income, tax, worst, thing, u, government, cit...
1497     30  [americans, waking, christmas, day, big, packa...
1498     30  [two, cheers, neighborhood, agreement, bring, ...
1499     30  [thanksgiving, day, one, table, talking, thank...

[1500 rows x 2 columns]


###Train-Test Splits

> Splitting dataset into Train (80%) and Test (20%)

In [13]:
# Train Dataset
train_dataset = dataset.sample(frac=0.8, random_state=1)

In [17]:
train_dataset

Unnamed: 0,author,article
91,2,"[seemed, decided, accepted, almost, predestine..."
75,2,"[lagging, loft, became, hot, issue, holyrood, ..."
1264,26,"[reality, checks, come, different, shapes, pre..."
330,7,"[girls, better, boys, schools, new, report, cl..."
1349,27,"[study, published, ago, online, edition, journ..."
...,...,...
652,14,"[england, scared, slovakia, today, opponents, ..."
70,2,"[captured, many, headlines, election, liberal,..."
610,13,"[hope, springs, eternal, fans, baseball, unite..."
1174,24,"[malignant, lymphomas, two, types, lymphoma, d..."


In [15]:
len(train_dataset)

1200

In [16]:
train_dataset["author"].value_counts()

23    46
2     44
8     44
16    44
7     43
1     43
17    42
6     42
21    41
12    41
19    41
5     41
26    41
11    40
20    40
18    40
30    40
25    40
14    40
9     39
3     39
24    39
4     38
22    38
15    38
29    37
10    36
27    36
13    36
28    31
Name: author, dtype: int64

In [18]:
# Test Dataset
test_dataset = dataset.drop(train_dataset.index)

In [19]:
test_dataset

Unnamed: 0,author,article
15,1,"[second, week, ask, alex, salmond, put, first,..."
20,1,"[liberal, snp, councillors, wakening, idea, to..."
21,1,"[ronnie, reagan, tony, blair, seems, first, mi..."
24,1,"[joke, often, find, written, toilet, walls, pr..."
25,1,"[united, kingdom, need, natural, resources, cr..."
...,...,...
1478,30,"[volkswagen, returning, western, pennsylvania,..."
1485,30,"[let, hope, tribune, co, iceberg, tip, iceberg..."
1495,30,"[remember, plugging, car, night, full, tank, m..."
1498,30,"[two, cheers, neighborhood, agreement, bring, ..."


In [20]:
len(test_dataset)

300

In [21]:
test_dataset["author"].value_counts()

28    19
13    14
10    14
27    14
29    13
22    12
4     12
15    12
24    11
9     11
3     11
11    10
25    10
14    10
18    10
20    10
30    10
12     9
5      9
19     9
21     9
26     9
6      8
17     8
1      7
7      7
2      6
8      6
16     6
23     4
Name: author, dtype: int64

###Build Vocabulary

In [22]:
total_tokens = 0
token_set = set()
for _, tokens in train_dataset["article"].items():
  total_tokens += len(tokens)
  for token in tokens:
    token_set.add(token)

# Add <UNKNOWN> to token set for occurence of words that are not in corpus.
token_set.add("<UNKNOWN>")

In [23]:
print(f"Corpus size: {total_tokens}, Vocabulary size: {len(token_set)}\n")
print("First 10 tokens:")
i = 0
for token in token_set:
  if i >= 10:
    break
  print(token)
  i += 1

Corpus size: 450403, Vocabulary size: 32296

First 10 tokens:
disincentive
wolves
bounding
paterson
speechwriter
baptisms
moaner
powell
excreted
sending


###Vectorization


In [24]:
# term to index mapping
term2idx = {}
for idx, term in enumerate(token_set):
  term2idx.update({term:idx})

In [25]:
print(f"Size: {len(term2idx)}\n\nTerms and Indexes:")
i = 0
for key in term2idx:
  if i >= 10:
    break
  print(f"{key}: {term2idx[key]}")
  i += 1

Size: 32296

Terms and Indexes:
disincentive: 0
wolves: 1
bounding: 2
paterson: 3
speechwriter: 4
baptisms: 5
moaner: 6
powell: 7
excreted: 8
sending: 9


In [26]:
# Document vectors of Train Set
train_vector_df = None
vocab_size = len(token_set)
article_vectors = []
for _, row in train_dataset.iterrows():
  vector = [0] * vocab_size
  for token in row["article"]:
    vector[term2idx[token]] += 1
  article_vectors.append([row["author"], vector])

train_vector_df = pd.DataFrame(article_vectors, columns=["author", "vector"])

###Model Training

In [27]:
import numpy as np

author_vectors = {}
for author in authors:
  author_vectors.update({author:np.array([0] * vocab_size)})

for _, row in train_vector_df.iterrows():
  vector = np.array(row["vector"])
  author_vectors[row["author"]] += vector

In [28]:
# First 35 element of each author's vector
for author in authors:
  print(f"Author {author}: {author_vectors[author][:35]}\n")

Author 1: [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0]

Author 2: [0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0]

Author 3: [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0]

Author 4: [1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 5 0 0 0 0 0 0 0 0 0]

Author 5: [0 2 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 1 0 0 0 0]

Author 6: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

Author 7: [0 0 0 0 0 0 0 2 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 2 1 0 0 0 0 0 1 0 0 0]

Author 8: [0 0 1 0 0 1 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 2 0 0 0 0 1 0 0 0 0]

Author 9: [ 0  0  0  0  0  0  0  0  0  0  0  0  0 21  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0]

Author 10: [0 0 0 0 0 0 0 1 0 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0]

Author 11: [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

Author 12: [0 0 0 2 0 0 1 0 0 5 0 0 0 0 0 0 0 0 0

###Similarity Measure: Cosine

In [30]:
def cosine_sim(vector1, vector2):
  dot_product = np.dot(vector1, vector2)  # dot product of the vectors
  magnitude1 = np.linalg.norm(vector1)    # length of vector1
  magnitude2 = np.linalg.norm(vector2)    # length of vector2

  cosine_similarity = dot_product / (magnitude1 * magnitude2)
  return cosine_similarity

###Testing and Result

In [31]:
# Document vectors of Test Set
test_vector_df = None
vocab_size = len(token_set)
test_article_vectors = []
for _, row in test_dataset.iterrows():
  vector = [0] * vocab_size
  for token in row["article"]:
    idx = term2idx.get(token, term2idx.get("<UNKNOWN>"))
    vector[idx] += 1
  test_article_vectors.append([row["author"], vector])

test_vector_df = pd.DataFrame(test_article_vectors, columns=["author", "vector"])

In [33]:
# test_authors: true authors of articles
test_authors, test_vectors  = test_vector_df.iloc[:,0], test_vector_df.iloc[:,1]
test_authors.value_counts()

28    19
13    14
10    14
27    14
29    13
22    12
4     12
15    12
24    11
9     11
3     11
11    10
25    10
14    10
18    10
20    10
30    10
12     9
5      9
19     9
21     9
26     9
6      8
17     8
1      7
7      7
2      6
8      6
16     6
23     4
Name: author, dtype: int64

In [34]:
# make all predictions
predicted_authors = []  # list of predictions
for vector in test_vectors:
  similarities = []
  for author in author_vectors:
    similarities.append(cosine_sim(np.array(vector), author_vectors[author]))
  max_idx = 0
  max_sim = similarities[0]
  for i in range(1, len(similarities)):
    if similarities[i] > max_sim:
      max_sim = similarities[i]
      max_idx = i
  predicted_authors.append(str(max_idx+1))

In [35]:
correct_predictions = 0
for idx, predicted_author in enumerate(predicted_authors):
  if predicted_author == test_authors[idx]:
    correct_predictions += 1

success_rate = correct_predictions / len(predicted_authors)
print(f"Success rate: {success_rate:.4g}")

Success rate: 0.81


In [42]:
# Uncorrect predictions
print("Author\tPrediction")
for idx, predicted_author in enumerate(predicted_authors):
  if predicted_author != test_authors[idx]:
    print(f"{test_authors[idx]:^6}\t{predicted_author:^10}")

Author	Prediction
  1   	    4     
  1   	    20    
  2   	    3     
  2   	    7     
  3   	    1     
  3   	    2     
  3   	    4     
  4   	    30    
  4   	    3     
  4   	    10    
  4   	    10    
  4   	    1     
  6   	    21    
  6   	    21    
  6   	    21    
  6   	    10    
  6   	    20    
  7   	    20    
  7   	    8     
  8   	    6     
  9   	    11    
  9   	    11    
  9   	    11    
  9   	    11    
  9   	    11    
  10  	    1     
  10  	    5     
  12  	    8     
  13  	    14    
  14  	    12    
  14  	    28    
  14  	    8     
  14  	    26    
  14  	    12    
  15  	    13    
  15  	    12    
  15  	    13    
  16  	    24    
  19  	    18    
  19  	    11    
  24  	    25    
  25  	    24    
  26  	    28    
  26  	    12    
  27  	    24    
  27  	    23    
  27  	    11    
  28  	    22    
  28  	    26    
  28  	    8     
  28  	    22    
  28  	    26    
  28  	    12    
  28  	    8     
  29  	   