<a href="https://colab.research.google.com/github/devil-of-silicon-valley/cmpe255-project/blob/shabab/Baseline/Program_Baseline_Shabab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Group 5:  Reuter_50_50 Data Set (https://archive.ics.uci.edu/ml/datasets/Reuter_50_50Links to an external site.)

Identify the author of an article based on attributes describing their writing style

**GOAL: Identify the author of an article based on attributes describing their writing style**

In [1]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [135]:
# import libaries
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.metrics import f1_score
from glob import glob
import json

In [3]:
# setup debug for prints troubleshooting
debug = True

In [4]:
def load_json(json_file):
  with open(json_file) as f:
    data = json.load(f)
  return data

def write_json(json_file, data):
  with open(json_file, "w") as outfile:
    json.dump(data, outfile)

## Initial Dataset Construction

The following code was used to parse the raw dataset to create training and testing JSONs that will be used for classification models.

These resulting JSONs will also make it faster to load the dataset.

In [5]:
# get the train and set directories
train_dir = '/content/drive/MyDrive/Assignments/03: FA24: CMPE-255 Sec 33 - Data Mining/Project/C50train'
test_dir = '/content/drive/MyDrive/Assignments/03: FA24: CMPE-255 Sec 33 - Data Mining/Project/C50test'

In [73]:
# create a function to load the data from the directories
def parse_raw_data(dir, json_save_dir, debug=False):

  print(f"Creating JSON from data in {dir}")

  # initialize empty dictionary to fill with author information
  data_dict = {}

  # get the list of author directories
  author_dirs = sorted(glob(f"{dir}/*"))
  print(f"Found {len(author_dirs)} authors...")

  # now parse the directories for the author names and their associated texts
  for author_dir in author_dirs:

    # get the author name from the directory
    author_name = author_dir.split('/')[-1]

    if debug: print(f"Loading files for {author_name}")

    total_files = len(glob(f"{author_dir}/*.txt"))
    if debug: print(f"Found {total_files} files for {author_name}")

    author_file_entry = {}
    for file_entry in sorted(glob(f"{author_dir}/*.txt")):
      # get te filename (ex: 2537newsML.txt)
      author_filename = file_entry.split('/')[-1]
      # Read the file and remove any new lines or raw strings
      author_file = open(file_entry, "r").read().replace('\n', '').replace('\r', '')
      # create the dictionary accordingly
      author_file_entry[author_filename] = author_file
      # example of an entry "Author1" : {"1234text.txt" : "Text for this entry"}

    if debug: print(f"Adding {author_name} to data dictionary")

    # add the dictionary entry for the current author and their files
    data_dict[author_name] = author_file_entry

  write_json(json_save_dir, data_dict)

  return data_dict

In [120]:
train_json_save_dir = '/content/drive/MyDrive/Assignments/03: FA24: CMPE-255 Sec 33 - Data Mining/Project/C50train.json'
test_json_save_dir = '/content/drive/MyDrive/Assignments/03: FA24: CMPE-255 Sec 33 - Data Mining/Project/C50test.json'
train_json = parse_raw_data(dir=train_dir, json_save_dir=train_json_save_dir)
test_json = parse_raw_data(dir=test_dir, json_save_dir=test_json_save_dir)

Creating JSON from data in /content/drive/MyDrive/Assignments/03: FA24: CMPE-255 Sec 33 - Data Mining/Project/C50train
Found 50 authors...
Creating JSON from data in /content/drive/MyDrive/Assignments/03: FA24: CMPE-255 Sec 33 - Data Mining/Project/C50test
Found 50 authors...


## Loading the JSON to make a Dataframe

In [121]:
# read the jsons
train_json_save_dir = '/content/drive/MyDrive/Assignments/03: FA24: CMPE-255 Sec 33 - Data Mining/Project/C50train.json'
test_json_save_dir = '/content/drive/MyDrive/Assignments/03: FA24: CMPE-255 Sec 33 - Data Mining/Project/C50test.json'
train_json = load_json(train_json_save_dir)
test_json = load_json(test_json_save_dir)

Let's try making a dataframe from the JSON we just loaded

In [122]:
train_data_df = pd.DataFrame(train_json, index=None).T
test_data_df = pd.DataFrame(test_json, index=None).T

In [123]:
train_data_df.head()

Unnamed: 0,106247newsML.txt,120600newsML.txt,120683newsML.txt,136958newsML.txt,137498newsML.txt,14014newsML.txt,156814newsML.txt,182596newsML.txt,186392newsML.txt,193495newsML.txt,...,258689newsML.txt,264132newsML.txt,268647newsML.txt,278687newsML.txt,281216newsML.txt,28223newsML.txt,282935newsML.txt,287736newsML.txt,289747newsML.txt,304402newsML.txt
AaronPressman,The Internet may be overflowing with new techn...,The U.S. Postal Service announced Wednesday a ...,Elementary school students with access to the ...,An influential Internet organisation has backe...,An influential Internet organisation has backe...,A group of leading trademark specialists plans...,When a company in California sells a book to a...,U.S. laws governing the trillion dollar future...,Supreme Court justices Wednesday sharply quest...,The Internet continued to grow in leaps and bo...,...,,,,,,,,,,
AlanCrosby,,,,,,,,,,,...,,,,,,,,,,
AlexanderSmith,,,,,,,,,,,...,,,,,,,,,,
BenjaminKangLim,,,,,,,,,,,...,,,,,,,,,,
BernardHickey,,,,,,,,,,,...,,,,,,,,,,


This isnt't very useful because of the NaNs.

Refactor the JSON so it's a cleaner more usable format

In [124]:
# this function will refactor the data dictionary to a format
# that is more friendly for dataframes
def refactor_data(data_dict):

  # create the dictionary that will be converted to a dataframe
  refactored_data_dict = {}

  # init list of unique authors
  authors = []

  # init raw labels (authors) and data (text)
  # that will be used for actual training
  labels = []
  data = []

  # parse the dictionary to get the files for each author
  for author_entry, file_entry in data_dict.items():

    # init the text list for each author
    text_list = []

    # add the current author to the list of total authors
    authors.append(author_entry)

    # parse the text list to add to the relevant lists
    for file_name, file_text in file_entry.items():
      labels.append(author_entry)
      data.append(file_text)

    refactored_data_dict[author_entry] = data

  data_df = pd.DataFrame(refactored_data_dict, index=None).T

  return data_df, authors, labels, data

In [125]:
train_data_df, train_authors, train_labels, train_data = refactor_data(train_json)
test_data_df, test_authors, test_labels, test_data = refactor_data(test_json)

In [126]:
train_data_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
AaronPressman,The Internet may be overflowing with new techn...,The U.S. Postal Service announced Wednesday a ...,Elementary school students with access to the ...,An influential Internet organisation has backe...,An influential Internet organisation has backe...,A group of leading trademark specialists plans...,When a company in California sells a book to a...,U.S. laws governing the trillion dollar future...,Supreme Court justices Wednesday sharply quest...,The Internet continued to grow in leaps and bo...,...,China has taken its cue from U.S. Federal Rese...,"The Stone Group, a Chinese high technology com...",China said on Thursday it strongly opposed a v...,A top Chinese defence official has stepped dow...,China warned on Monday against reinforcing mil...,China's central bank chief has said that infla...,"China ushered in 1997, a year it has hailed as...",China issued tough new rules on the handling o...,China will avoid bold moves in tackling its ai...,Communist Party chief Jiang Zemin has put his ...
AlanCrosby,The Internet may be overflowing with new techn...,The U.S. Postal Service announced Wednesday a ...,Elementary school students with access to the ...,An influential Internet organisation has backe...,An influential Internet organisation has backe...,A group of leading trademark specialists plans...,When a company in California sells a book to a...,U.S. laws governing the trillion dollar future...,Supreme Court justices Wednesday sharply quest...,The Internet continued to grow in leaps and bo...,...,China has taken its cue from U.S. Federal Rese...,"The Stone Group, a Chinese high technology com...",China said on Thursday it strongly opposed a v...,A top Chinese defence official has stepped dow...,China warned on Monday against reinforcing mil...,China's central bank chief has said that infla...,"China ushered in 1997, a year it has hailed as...",China issued tough new rules on the handling o...,China will avoid bold moves in tackling its ai...,Communist Party chief Jiang Zemin has put his ...
AlexanderSmith,The Internet may be overflowing with new techn...,The U.S. Postal Service announced Wednesday a ...,Elementary school students with access to the ...,An influential Internet organisation has backe...,An influential Internet organisation has backe...,A group of leading trademark specialists plans...,When a company in California sells a book to a...,U.S. laws governing the trillion dollar future...,Supreme Court justices Wednesday sharply quest...,The Internet continued to grow in leaps and bo...,...,China has taken its cue from U.S. Federal Rese...,"The Stone Group, a Chinese high technology com...",China said on Thursday it strongly opposed a v...,A top Chinese defence official has stepped dow...,China warned on Monday against reinforcing mil...,China's central bank chief has said that infla...,"China ushered in 1997, a year it has hailed as...",China issued tough new rules on the handling o...,China will avoid bold moves in tackling its ai...,Communist Party chief Jiang Zemin has put his ...
BenjaminKangLim,The Internet may be overflowing with new techn...,The U.S. Postal Service announced Wednesday a ...,Elementary school students with access to the ...,An influential Internet organisation has backe...,An influential Internet organisation has backe...,A group of leading trademark specialists plans...,When a company in California sells a book to a...,U.S. laws governing the trillion dollar future...,Supreme Court justices Wednesday sharply quest...,The Internet continued to grow in leaps and bo...,...,China has taken its cue from U.S. Federal Rese...,"The Stone Group, a Chinese high technology com...",China said on Thursday it strongly opposed a v...,A top Chinese defence official has stepped dow...,China warned on Monday against reinforcing mil...,China's central bank chief has said that infla...,"China ushered in 1997, a year it has hailed as...",China issued tough new rules on the handling o...,China will avoid bold moves in tackling its ai...,Communist Party chief Jiang Zemin has put his ...
BernardHickey,The Internet may be overflowing with new techn...,The U.S. Postal Service announced Wednesday a ...,Elementary school students with access to the ...,An influential Internet organisation has backe...,An influential Internet organisation has backe...,A group of leading trademark specialists plans...,When a company in California sells a book to a...,U.S. laws governing the trillion dollar future...,Supreme Court justices Wednesday sharply quest...,The Internet continued to grow in leaps and bo...,...,China has taken its cue from U.S. Federal Rese...,"The Stone Group, a Chinese high technology com...",China said on Thursday it strongly opposed a v...,A top Chinese defence official has stepped dow...,China warned on Monday against reinforcing mil...,China's central bank chief has said that infla...,"China ushered in 1997, a year it has hailed as...",China issued tough new rules on the handling o...,China will avoid bold moves in tackling its ai...,Communist Party chief Jiang Zemin has put his ...


This looks a lot better. Maybe this can be used to create a classification model.

## Experimenting with Classification Models

In [157]:
# read the jsons
train_json_save_dir = '/content/drive/MyDrive/Assignments/03: FA24: CMPE-255 Sec 33 - Data Mining/Project/C50train.json'
test_json_save_dir = '/content/drive/MyDrive/Assignments/03: FA24: CMPE-255 Sec 33 - Data Mining/Project/C50test.json'
train_json = load_json(train_json_save_dir)
test_json = load_json(test_json_save_dir)

# this function will refactor the data dictionary to a format
# that is more friendly for dataframes
def refactor_data(data_dict):
  # create the dictionary that will be converted to a dataframe
  refactored_data_dict = {}
  # init list of unique authors
  authors = []
  # init raw labels (authors) and data (text)
  # that will be used for actual training
  labels = []
  data = []
  # parse the dictionary to get the files for each author
  for author_entry, file_entry in data_dict.items():
    # init the text list for each author
    text_list = []
    # add the current author to the list of total authors
    authors.append(author_entry)
    # parse the text list to add to the relevant lists
    for file_name, file_text in file_entry.items():
      labels.append(author_entry)
      data.append(file_text)
    refactored_data_dict[author_entry] = data
  data_df = pd.DataFrame(refactored_data_dict, index=None).T
  return data_df, authors, labels, data

train_data_df, train_authors, train_labels, train_data = refactor_data(train_json)
test_data_df, test_authors, test_labels, test_data = refactor_data(test_json)

In [158]:
train_labels_np = np.array(train_labels)
train_data_np = np.array(train_data)
test_labels_np = np.array(test_labels)
test_data_np = np.array(test_data)

In [145]:
# split the test data
X_train, X_test, y_train, y_test = train_test_split(train_data_np, train_labels_np, test_size=0.3, random_state=1)

In [146]:
# vectorize x_train and x_test from text to matrix
vect = TfidfVectorizer(stop_words='english')

# vectorize the training data
x_train_vec = vect.fit_transform(X_train)
x_test_vec = vect.transform(X_test)

# vectorize the actual test set
test_data_vec = vect.transform(test_data)

In [147]:
model = RandomForestClassifier()
model.fit(x_train_vec,y_train)

In [148]:
predictions = model.predict(x_test_vec)
f1 = f1_score(y_test, predictions, average='weighted')
print(f1)

0.7682470645833175


In [153]:
predictions = model.predict(test_data_vec)
f1 = f1_score(test_labels_np, predictions, average='weighted')
print(f1)

0.6119569832688249


In [156]:
rep = classification_report(test_labels_np, predictions)
print(rep)

                   precision    recall  f1-score   support

    AaronPressman       0.70      0.94      0.80        50
       AlanCrosby       0.90      0.52      0.66        50
   AlexanderSmith       0.62      0.50      0.56        50
  BenjaminKangLim       0.28      0.42      0.34        50
    BernardHickey       0.50      0.20      0.29        50
      BradDorfman       0.66      0.66      0.66        50
 DarrenSchuettler       0.22      0.26      0.24        50
      DavidLawder       0.36      0.20      0.26        50
    EdnaFernandes       0.59      0.34      0.43        50
      EricAuchard       0.53      0.38      0.44        50
   FumikoFujisaki       0.85      1.00      0.92        50
   GrahamEarnshaw       0.70      0.88      0.78        50
 HeatherScoffield       0.32      0.42      0.36        50
       JanLopatka       0.55      0.56      0.55        50
    JaneMacartney       0.35      0.12      0.18        50
     JimGilchrist       0.85      1.00      0.92       

In [None]:

# vectorize x_train and x_test from text to matrix
vect = TfidfVectorizer(stop_words='english')
x_train_vec = vect.fit_transform(X_train)
x_test_vec = vect.transform(X_test)

# use logistical regression
model = LogisticRegression()
model.fit(x_train_vec, y_train)

# predict and check accuracy
pred = model.predict(x_test_vec)
rep = classification_report(y_test, pred)

In [None]:
# get name of directories, authors (these will be the labels)
train_sub = [name for name in os.listdir(train_dir) if os.path.isdir(os.path.join(train_dir, name))]
label_lst = np.copy(train_sub)

if debug:
    print(train_dir)
    print(label_lst)

# setup the initial empty variables
train = []
train_v = []
label = []

# load the input data from C50train directory and process it

# format will be something like this
# ===================================
# | Label         | Train           |
# ===================================
# | AaronPressman | 2537newsML.txt  |
# ===================================
# | AaronPressman | 14014newsML.txt |
# ===================================
# | ...           | ...             |
# ===================================
# | AlanCrosby    | 10306newsML.txt |
# ===================================
# | ...           | ...             |


auth_idx = 0

# go within the author directory to get list of the file names, this will be the training data
for i in train_sub:
    sub2_dir  = '../C50train/' + i
    train_sub2 = [name for name in os.listdir(sub2_dir) if os.path.isfile(os.path.join(sub2_dir, name))]

    #if debug:
    #    print(sub2_dir)
    #    print(train_sub2)

    # in each author file, save the author as the label and the text as its training data
    for j in train_sub2:
        sub3  = '../C50train/' + i + '/' + j

        with open(sub3, 'r') as file:
            data = file.read()
            data_no_nw = data.replace('\n', '').replace('\r', '')
            train.append(data_no_nw)

        # append author index as label
        label.append(auth_idx)

    # increment author index
    auth_idx = auth_idx + 1

        #if debug:
        #    print(sub3)

if debug:
    print(np.shape(train))
    print(np.shape(label))

    # bin count looking at label
    unused, idx = np.unique(label, return_counts=True)
    #print(unused)
    print(idx)

    print(train[0])
    print(label[0])
    #print(label)

../C50train
['AaronPressman' 'AlanCrosby' 'AlexanderSmith' 'BenjaminKangLim'
 'BernardHickey' 'BradDorfman' 'DarrenSchuettler' 'DavidLawder'
 'EdnaFernandes' 'EricAuchard' 'FumikoFujisaki' 'GrahamEarnshaw'
 'HeatherScoffield' 'JaneMacartney' 'JanLopatka' 'JimGilchrist' 'JoeOrtiz'
 'JohnMastrini' 'JonathanBirt' 'JoWinterbottom' 'KarlPenhaul' 'KeithWeir'
 'KevinDrawbaugh' 'KevinMorrison' 'KirstinRidley' 'KouroshKarimkhany'
 'LydiaZajc' "LynneO'Donnell" 'LynnleyBrowning' 'MarcelMichelson'
 'MarkBendeich' 'MartinWolk' 'MatthewBunce' 'MichaelConnor' 'MureDickie'
 'NickLouth' 'PatriciaCommins' 'PeterHumphrey' 'PierreTran' 'RobinSidel'
 'RogerFillion' 'SamuelPerry' 'SarahDavison' 'ScottHillis' 'SimonCowell'
 'TanEeLyn' 'TheresePoletti' 'TimFarrand' 'ToddNissen' 'WilliamKazer']
(2500,)
(2500,)
[50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50
 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50
 50 50]
The Internet may be overflowing with new technolo

In [None]:
# split the test data
X_train, X_test, y_train, y_test = train_test_split(train, label, test_size=0.1, random_state=1)

# vectorize x_train and x_test from text to matrix
vect = TfidfVectorizer(stop_words='english')
x_train_vec = vect.fit_transform(X_train)
x_test_vec = vect.transform(X_test)

# use logistical regression
model = LogisticRegression()
model.fit(x_train_vec, y_train)

# predict and check accuracy
pred = model.predict(x_test_vec)
rep = classification_report(y_test, pred)

if debug:
    print(rep)

              precision    recall  f1-score   support

           0       0.57      0.80      0.67         5
           1       1.00      1.00      1.00         5
           2       0.75      0.50      0.60         6
           3       0.40      0.67      0.50         3
           4       0.60      0.75      0.67         4
           5       1.00      0.71      0.83         7
           6       1.00      0.80      0.89         5
           7       0.50      0.50      0.50         2
           8       0.56      1.00      0.71         5
           9       1.00      0.40      0.57         5
          10       0.90      1.00      0.95         9
          11       0.88      0.88      0.88         8
          12       1.00      1.00      1.00         5
          13       0.70      1.00      0.82         7
          14       1.00      0.60      0.75         5
          15       1.00      1.00      1.00         6
          16       0.60      0.75      0.67         4
          17       0.60    

In [None]:
# Load in C50test located ../C50test/
test_dir = '../C50test'
# get name of directories, authors (these will be the labels)
test_sub = [name for name in os.listdir(test_dir) if os.path.isdir(os.path.join(test_dir, name))]
test_lst = np.copy(train_sub)

if debug:
    print(test_dir)
    print(test_lst)

# setup the initial empty variables
test       = []
test_label = []

# load the input data from C50test directory and process it

auth_idx = 0

# go within the author directory to get list of the file names, this will be the training data
for i in train_sub:
    sub2_dir  = '../C50test/' + i
    test_sub2 = [name for name in os.listdir(sub2_dir) if os.path.isfile(os.path.join(sub2_dir, name))]

    #if debug:
    #    print(sub2_dir)
    #    print(train_sub2)

    # in each author file, save the text as its test data
    for j in test_sub2:
        sub3  = '../C50test/' + i + '/' + j

        with open(sub3, 'r') as file:
            data = file.read()
            data_no_nw = data.replace('\n', '').replace('\r', '')
            test.append(data_no_nw)

        test_label.append(auth_idx)

    auth_idx = auth_idx + 1

if debug:
    print(np.shape(test))

../C50test
['AaronPressman' 'AlanCrosby' 'AlexanderSmith' 'BenjaminKangLim'
 'BernardHickey' 'BradDorfman' 'DarrenSchuettler' 'DavidLawder'
 'EdnaFernandes' 'EricAuchard' 'FumikoFujisaki' 'GrahamEarnshaw'
 'HeatherScoffield' 'JaneMacartney' 'JanLopatka' 'JimGilchrist' 'JoeOrtiz'
 'JohnMastrini' 'JonathanBirt' 'JoWinterbottom' 'KarlPenhaul' 'KeithWeir'
 'KevinDrawbaugh' 'KevinMorrison' 'KirstinRidley' 'KouroshKarimkhany'
 'LydiaZajc' "LynneO'Donnell" 'LynnleyBrowning' 'MarcelMichelson'
 'MarkBendeich' 'MartinWolk' 'MatthewBunce' 'MichaelConnor' 'MureDickie'
 'NickLouth' 'PatriciaCommins' 'PeterHumphrey' 'PierreTran' 'RobinSidel'
 'RogerFillion' 'SamuelPerry' 'SarahDavison' 'ScottHillis' 'SimonCowell'
 'TanEeLyn' 'TheresePoletti' 'TimFarrand' 'ToddNissen' 'WilliamKazer']
(2500,)


In [None]:
new_test_vec = vect.transform(test)

new_pred = model.predict(new_test_vec)

rep = classification_report(test_label, new_pred)

if debug:
    print(new_pred[0:999])
    print(rep)

[ 0  0 41  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0 44  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 41  0  0
  5  0  1  1  1  1 17  1 14 17  1 21 17 17  1 14 14  1 14 14  1  1  1 17
  1 17 17  1 17 17 17 17 17 17  1  1  1  1 14  1 14  1  1 17 17 17 17  1
  1  1  1  1  2 16  2 16 16 16 16 16 16 16  2 16  2 16 16 16 16 16 16 16
 16 16  2 16 16 16 16  2 44  2 44  2 16 16  0  2 16  2 16 16 16 16 16 16
 16 18  2  2 16  2 13 37  3 49 11 27 11 49  3 13  3  3 43  3 37 43 13 13
 13  3 49 13 13  3 37  3  3 49 49 49 43 43 13 13 34 34 34 34 34 34 34 34
 34 13 49 34  3 49 43 43  4  4  4  4  4  4  4  4  4 23  4  5  5 23 23 30
 23  4 30 23 30 23 23 23  4  4  4  4  4 23  4 23  4  4 23 23 23 23 23  4
  4  4  4 23 23 23 23 23 23 30  5  5  5  5  5  5 41  5  5  5  5  9  5  5
 39 39  5 22  5  5  5  5 22  5  5  5  5  9  5  5  5  5  5  5  5  5  5  5
  5  5  5  5  5  5 35  5  5 22  5  5 12  6  6  6  6 12  6  6  6 12  6 12
  6 12 12 12 12 12 12 12  6 12 12 12 12 12 12 12  6