###Dataset

Name of the data set: **30Columnists**

Dataset consist of 30 author's 50 articles each in English.

Dataset: [dataset](http://www.kemik.yildiz.edu.tr/veri_kumelerimiz.html)

### Load Dataset

In [1]:
!curl http://www.kemik.yildiz.edu.tr/data/File/30Columnists.zip -o ./30Columnists.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 7208k  100 7208k    0     0   364k      0  0:00:19  0:00:19 --:--:--  424k


In [None]:
!unzip /content/30Columnists.zip -d /content

In [3]:
# Get authors
import os
PATH = "/content/30Columnists/raw_texts"

authors = []
for author in os.listdir(PATH):
  authors.append(author)

authors.sort(key = lambda a: int(a))
print(authors)

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30']


In [4]:
# Read every article of each author and append to dataset
import re
dataset = []  # [[author, article], [author, article], ...]
for author in authors:
  author_path = os.path.join(PATH, author)
  for article_file in os.listdir(author_path):
    if article_file.endswith(".txt"):
      article_path = os.path.join(author_path, article_file)
      with open(article_path, "rb") as f:
        article = f.read().decode("iso-8859-9")

        article = re.sub(r"[\n\r\f\x96]", "", article)
        dataset.append([author, article])

In [5]:
article_count = {}
for author in authors:
  article_count.update({author: 0})

for row in dataset:
  author = row[0]
  count = article_count[author]
  article_count.update({author: count+1})

for author in authors:
  print(f"{author:>2}\t{article_count[author]}")

 1	50
 2	50
 3	50
 4	50
 5	50
 6	50
 7	50
 8	50
 9	50
10	50
11	50
12	50
13	50
14	50
15	50
16	50
17	50
18	50
19	50
20	50
21	50
22	50
23	50
24	50
25	50
26	50
27	50
28	50
29	50
30	50


In [6]:
import pandas as pd

dataset = pd.DataFrame(dataset, columns=["author", "article"])
print(dataset)

     author                                            article
0         1  THE British Motor Show opened this week with t...
1         1  THE small assembly of rather embarrassed anony...
2         1  LOCAL shopkeepers can rest easy, everything is...
3         1   DON'T think it's ever happened to me before, ...
4         1  WHAT good news it was this week to see that af...
...     ...                                                ...
1495     30  There is something about $4 gasoline that look...
1496     30  Have you noticed? Gasoline is under $4 a gallo...
1497     30  Every year about this time a man gets a remind...
1498     30  It was mainly a social occasion. An elegant bl...
1499     30  Economic suicide bombing.What else can you cal...

[1500 rows x 2 columns]


###Tokenization, Punctuations, Stop Words, Case Folding

> Tokeinizing, removing punctuations and stopword, applying case folding.

In [9]:
# nltk library for tokenizing, punctuations and stopwords
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
from nltk.corpus import stopwords
nltk.download("stopwords")
stop_words = stopwords.words("english")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
print(len(stop_words), stop_words)

179 ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than

In [12]:
from nltk import word_tokenize
import re
# Tokenizing, removing punctuations and stopwords, applying case folding
for idx, row in dataset.iterrows():
  dataset.at[idx, "article"] = [token.lower() for token in word_tokenize(re.sub(r"\.", " . ", row["article"])) if token.isalpha() and (token.lower() not in stop_words)]

In [13]:
print(dataset)

     author                                            article
0         1  [british, motor, show, opened, week, prime, mi...
1         1  [small, assembly, rather, embarrassed, anonymo...
2         1  [local, shopkeepers, rest, easy, everything, g...
3         1  [think, ever, happened, little, ashamed, admit...
4         1  [good, news, week, see, deployment, police, of...
...     ...                                                ...
1495     30  [something, gasoline, looks, going, away, serv...
1496     30  [noticed, gasoline, gallon, supposed, happen, ...
1497     30  [every, year, time, man, gets, reminder, best,...
1498     30  [mainly, social, occasion, elegant, black, tie...
1499     30  [economic, suicide, bombing, else, call, launc...

[1500 rows x 2 columns]


###Train-Test Splits

> Splitting dataset into Train (80%) and Test (20%)

In [14]:
# Train Dataset
train_dataset = dataset.sample(frac=0.8, random_state=1)

In [15]:
train_dataset

Unnamed: 0,author,article
91,2,"[americans, queued, round, block, hours, cast,..."
75,2,"[politicians, poring, maps, checking, election..."
1264,26,"[maybe, glass, rioja, champions, league, tie, ..."
330,7,"[nothing, like, sex, scandal, political, class..."
1349,27,"[reasons, dry, man, really, grasp, many, peopl..."
...,...,...
652,14,"[oh, dear, quite, sure, came, full, two, days,..."
70,2,"[america, fire, chiefs, sheriffs, well, govern..."
610,13,"[baseball, regular, season, sundae, cherry, to..."
1174,24,"[digital, age, noticed, many, simple, items, p..."


In [16]:
len(train_dataset)

1200

In [17]:
train_dataset["author"].value_counts()

23    46
2     44
8     44
16    44
7     43
1     43
17    42
6     42
21    41
12    41
19    41
5     41
26    41
11    40
20    40
18    40
30    40
25    40
14    40
9     39
3     39
24    39
4     38
22    38
15    38
29    37
10    36
27    36
13    36
28    31
Name: author, dtype: int64

In [18]:
# Test Dataset
test_dataset = dataset.drop(train_dataset.index)

In [19]:
test_dataset

Unnamed: 0,author,article
15,1,"[slow, relentless, creep, towards, society, in..."
20,1,"[name, andrew, j, volstead, become, one, influ..."
21,1,"[ronald, reagan, said, nine, terrifying, words..."
24,1,"[week, scottish, charities, regulator, decided..."
25,1,"[meant, flagship, policy, designed, show, snp,..."
...,...,...
1478,30,"[tend, wish, john, f, ferguson, got, little, l..."
1485,30,"[uneasy, feeling, grows, hustled, recession, y..."
1495,30,"[something, gasoline, looks, going, away, serv..."
1498,30,"[mainly, social, occasion, elegant, black, tie..."


In [20]:
len(test_dataset)

300

In [21]:
test_dataset["author"].value_counts()

28    19
13    14
10    14
27    14
29    13
22    12
4     12
15    12
24    11
9     11
3     11
11    10
25    10
14    10
18    10
20    10
30    10
12     9
5      9
19     9
21     9
26     9
6      8
17     8
1      7
7      7
2      6
8      6
16     6
23     4
Name: author, dtype: int64

###Build Vocabulary

In [22]:
total_tokens = 0
token_set = set()
for _, tokens in train_dataset["article"].items():
  total_tokens += len(tokens)
  for token in tokens:
    token_set.add(token)

# Add <UNKNOWN> to token set for occurence of words that are not in corpus.
token_set.add("<UNKNOWN>")

In [23]:
print(f"Corpus size: {total_tokens}, Vocabulary size: {len(token_set)}\n")
print("First 10 tokens:")
i = 0
for token in token_set:
  if i >= 10:
    break
  print(token)
  i += 1

Corpus size: 453506, Vocabulary size: 32328

First 10 tokens:
beans
taller
tut
genetics
stoppard
crewe
harding
circumstantial
callidus
nieces


###Vectorization


In [24]:
# term to index mapping
term2idx = {}
for idx, term in enumerate(token_set):
  term2idx.update({term:idx})

In [25]:
print(f"Size: {len(term2idx)}\n\nTerms and Indexes:")
i = 0
for key in term2idx:
  if i >= 10:
    break
  print(f"{key}: {term2idx[key]}")
  i += 1

Size: 32328

Terms and Indexes:
beans: 0
taller: 1
tut: 2
genetics: 3
stoppard: 4
crewe: 5
harding: 6
circumstantial: 7
callidus: 8
nieces: 9


In [26]:
# Document vectors of Train Set
train_vector_df = None
vocab_size = len(token_set)
article_vectors = []
for _, row in train_dataset.iterrows():
  vector = [0] * vocab_size
  for token in row["article"]:
    vector[term2idx[token]] += 1
  article_vectors.append([row["author"], vector])

train_vector_df = pd.DataFrame(article_vectors, columns=["author", "vector"])

###Model Training

In [27]:
import numpy as np

author_vectors = {}
for author in authors:
  author_vectors.update({author:np.array([0] * vocab_size)})

for _, row in train_vector_df.iterrows():
  vector = np.array(row["vector"])
  author_vectors[row["author"]] += vector

In [28]:
# First 35 element of each author's vector
for author in authors:
  print(f"Author {author}: {author_vectors[author][:35]}\n")

Author 1: [0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0]

Author 2: [0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0]

Author 3: [1 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 4 0 0 0 1 0 0 0 0]

Author 4: [0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1]

Author 5: [0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 1]

Author 6: [0 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1]

Author 7: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]

Author 8: [0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0]

Author 9: [0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 2]

Author 10: [0 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0]

Author 11: [ 0  2  0  1  0  0  0  0  0  1  1  0  0  0  0  0  0  0  0  0  0  1  0  1
  0  0  0  0  0  0  0  0  0  0 13]

Author 12: [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0

###Similarity Measure: Cosine

In [29]:
def cosine_sim(vector1, vector2):
  dot_product = np.dot(vector1, vector2)  # dot product of the vectors
  magnitude1 = np.linalg.norm(vector1)    # length of vector1
  magnitude2 = np.linalg.norm(vector2)    # length of vector2

  cosine_similarity = dot_product / (magnitude1 * magnitude2)
  return cosine_similarity

###Testing and Result

In [30]:
# Document vectors of Test Set
test_vector_df = None
vocab_size = len(token_set)
test_article_vectors = []
for _, row in test_dataset.iterrows():
  vector = [0] * vocab_size
  for token in row["article"]:
    idx = term2idx.get(token, term2idx.get("<UNKNOWN>"))
    vector[idx] += 1
  test_article_vectors.append([row["author"], vector])

test_vector_df = pd.DataFrame(test_article_vectors, columns=["author", "vector"])

In [31]:
# test_authors: true authors of articles
test_authors, test_vectors  = test_vector_df.iloc[:,0], test_vector_df.iloc[:,1]
test_authors.value_counts()

28    19
13    14
10    14
27    14
29    13
22    12
4     12
15    12
24    11
9     11
3     11
11    10
25    10
14    10
18    10
20    10
30    10
12     9
5      9
19     9
21     9
26     9
6      8
17     8
1      7
7      7
2      6
8      6
16     6
23     4
Name: author, dtype: int64

In [32]:
# make all predictions
predicted_authors = []  # list of predictions
for vector in test_vectors:
  similarities = []
  for author in author_vectors:
    similarities.append(cosine_sim(np.array(vector), author_vectors[author]))
  max_idx = 0
  max_sim = similarities[0]
  for i in range(1, len(similarities)):
    if similarities[i] > max_sim:
      max_sim = similarities[i]
      max_idx = i
  predicted_authors.append(str(max_idx+1))

In [33]:
correct_predictions = 0
for idx, predicted_author in enumerate(predicted_authors):
  if predicted_author == test_authors[idx]:
    correct_predictions += 1

success_rate = correct_predictions / len(predicted_authors)
print(f"Success rate: {success_rate:.4g}")

Success rate: 0.79


In [34]:
# Uncorrect predictions
print("Author\tPrediction")
for idx, predicted_author in enumerate(predicted_authors):
  if predicted_author != test_authors[idx]:
    print(f"{test_authors[idx]:^6}\t{predicted_author:^10}")

Author	Prediction
  1   	    5     
  1   	    4     
  2   	    3     
  3   	    18    
  3   	    4     
  3   	    1     
  4   	    7     
  4   	    7     
  4   	    2     
  4   	    1     
  4   	    2     
  4   	    10    
  4   	    10    
  6   	    7     
  6   	    2     
  6   	    3     
  6   	    10    
  6   	    10    
  6   	    3     
  7   	    20    
  7   	    10    
  8   	    22    
  9   	    24    
  9   	    11    
  9   	    27    
  9   	    11    
  10  	    5     
  11  	    16    
  11  	    1     
  11  	    9     
  12  	    8     
  12  	    26    
  12  	    8     
  14  	    12    
  14  	    8     
  15  	    13    
  16  	    25    
  16  	    24    
  17  	    7     
  18  	    19    
  18  	    17    
  19  	    20    
  19  	    18    
  21  	    19    
  21  	    6     
  21  	    6     
  22  	    14    
  24  	    25    
  27  	    24    
  27  	    11    
  27  	    23    
  28  	    26    
  28  	    22    
  28  	    8     
  28  	   