Project: Authorship Identification
======

In [1]:
# Step 1: Import necessary libraries
import os
import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [2]:
# setup debug for prints troubleshooting
# debug = True
debug = False

In [3]:
# Load in C50train located ../C50train/
train_dir = 'C:/Users/esthe/Downloads/CMPE 255 Project/cmpe255-project/C50train/'
# get name of directories, authors (these will be the labels)
train_sub = [name for name in os.listdir(train_dir) if os.path.isdir(os.path.join(train_dir, name))]
label_lst = np.copy(train_sub)

if debug:
    print(train_dir)
    print(label_lst)

# setup the initial empty variables
train = []
train_v = []
label = []

# load the input data from C50train directory and process it
auth_idx = 0

# go within the author directory to get list of the file names, this will be the training data
for i in train_sub:
    sub2_dir  = 'C:/Users/esthe/Downloads/CMPE 255 Project/cmpe255-project/C50train/' + i 
#     '../C50train/'
    train_sub2 = [name for name in os.listdir(sub2_dir) if os.path.isfile(os.path.join(sub2_dir, name))]

    #if debug:
    #    print(sub2_dir)
    #    print(train_sub2)
        
    # in each author file, save the author as the label and the text as its training data
    for j in train_sub2:
        sub3  = 'C:/Users/esthe/Downloads/CMPE 255 Project/cmpe255-project/C50train/' + i + '/' + j

        with open(sub3, 'r') as file:
            data = file.read()
            data_no_nw = data.replace('\n', '').replace('\r', '')
            train.append(data_no_nw)

        # append author index as label
        label.append(auth_idx)

    # increment author index
    auth_idx = auth_idx + 1
        
        #if debug:
        #    print(sub3)

if debug:
    print(np.shape(train))
    print(np.shape(label))

    # bin count looking at label
    unused, idx = np.unique(label, return_counts=True)
    #print(unused)
    print(idx)

    print(train[0])
    print(label[0])
    #print(label)

## Step 2: Vectorize/Transform Data

In [4]:
# Step 2: Convert text to TF-IDF features
tfidf_vectorizer = TfidfVectorizer(
#     max_features=10000,  # Limit to the top 10,000 features for performance
    stop_words='english',  # Remove common English stopwords
    lowercase=True,  # Convert text to lowercase
    )

# Transform text data into TF-IDF feature vectors
X = tfidf_vectorizer.fit_transform(train)
y = np.array(label)

print(f"TF-IDF Feature Shape: {X.shape}")

TF-IDF Feature Shape: (2500, 29216)


In [None]:
from scipy.sparse import csr_matrix as sp_csr_matrix
def normalize_csr_matrix(csr_matrix):
    norm_data = []
    norm_rows = []
    norm_cols = []
    
    for i in range(csr_matrix.shape[0]):
        row = csr_matrix[i].toarray().flatten()  # Convert row to a dense array and flatten it
        norm = np.linalg.norm(row)  # Compute the L2 norm (Euclidean norm) of the row
        
        # If the norm is not zero, normalize the row
        if norm > 0:
            norm_row = row / norm
            # Collect non-zero entries
            for j in range(len(norm_row)):
                if norm_row[j] != 0:
                    norm_data.append(norm_row[j])
                    norm_rows.append(i)  # Row index
                    norm_cols.append(j)  # Column index

    # Create a new sparse matrix from the normalized data
    normalized_csr_matrix = sp_csr_matrix((norm_data, (norm_rows, norm_cols)), 
                                           shape=csr_matrix.shape)
    return normalized_csr_matrix

X = normalize_csr_matrix(X)
# y = normalize_csr_matrix(y)


## Step 3:

In [7]:
# Step 3: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Train Set Shape: {X_train.shape}, Test Set Shape: {X_test.shape}")

Train Set Shape: (2000, 29216), Test Set Shape: (500, 29216)


## Step 4-5: Dimensionality Reduction + Grid Search

In [5]:
# Step 4: Dimensionality Reduction with Truncated SVD
svd = TruncatedSVD(n_components=1750, random_state=42) 

# Step 4: Dimensionality Reduction with PCA
# pca = PCA(n_components=1500, random_state=42)  

# Normalize the data
normalizer = Normalizer(norm='l2',copy=False)

# # Scale the data 
# scaler = StandardScaler()

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Adding class weights to handle imbalance
logreg = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')

# Hyperparameter tuning
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}  # Regularization strength
grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='f1_weighted')
grid_search.fit(X_train, y_train)

# Evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.83      0.77         6
           1       0.91      1.00      0.95        10
           2       0.67      0.86      0.75         7
           3       0.58      0.64      0.61        11
           4       1.00      0.71      0.83        14
           5       0.67      0.91      0.77        11
           6       0.83      1.00      0.91         5
           7       0.86      1.00      0.92         6
           8       0.83      0.62      0.71        16
           9       0.92      0.86      0.89        14
          10       0.91      1.00      0.95        10
          11       1.00      1.00      1.00         8
          12       0.90      0.82      0.86        11
          13       0.58      0.70      0.64        10
          14       0.67      0.80      0.73         5
          15       0.89      1.00      0.94         8
          16       1.00      0.62      0.76        13
    

In [15]:
best_model

In [17]:
# Step 5: Create a Pipeline with Dimensionality Reduction and Classifier
pipeline = Pipeline([
    ('svd', svd),
#     ('pca', pca),
    ('normalize', normalizer),
    ('classifier', LogisticRegression(C=100, class_weight='balanced', max_iter=1000,random_state=42)),
#     ('classifier', LogisticRegression(max_iter=500, random_state=42)),
#     ('classifier', RandomForestClassifier(n_estimators=200, random_state=42)),  # Random Forest with class balancing
#     ('classifier', SVC(kernel='rbf', class_weight='balanced', random_state=42)),
    ])

# Train the Model
pipeline.fit(X_train, y_train)

# Evaluate the Model
y_pred = pipeline.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.83      0.77         6
           1       0.91      1.00      0.95        10
           2       0.67      0.86      0.75         7
           3       0.58      0.64      0.61        11
           4       1.00      0.71      0.83        14
           5       0.67      0.91      0.77        11
           6       1.00      1.00      1.00         5
           7       0.86      1.00      0.92         6
           8       0.83      0.62      0.71        16
           9       1.00      0.86      0.92        14
          10       0.91      1.00      0.95        10
          11       1.00      1.00      1.00         8
          12       0.90      0.82      0.86        11
          13       0.58      0.70      0.64        10
          14       0.67      0.80      0.73         5
          15       0.89      1.00      0.94         8
          16       1.00      0.54      0.70        13
    

## Step 6:

In [18]:
# Step 6: Calculate F1 Score
f1 = f1_score(y_test, y_pred, average='macro')
print(f"Weighted F1 Score: {f1:.4f}")

Weighted F1 Score: 0.8138


In [19]:
# Analyze the Dimensionality Reduction
explained_variance_svd = svd.explained_variance_.sum()
print(f"Total Explained Variance by SVD: {explained_variance_svd:.4f}")

# explained_variance_pca = pca.explained_variance_.sum()
# print(f"Total Explained Variance by PCA: {explained_variance_pca:.4f}")

# # Optional Visualization of Explained Variance
# plt.plot(np.cumsum(svd.explained_variance_))
# plt.title('Cumulative Explained Variance by SVD Components')
# plt.xlabel('Number of Components')
# plt.ylabel('Cumulative Explained Variance')
# plt.grid()
# plt.show()

Total Explained Variance by SVD: 0.9611


## Step 7: 

In [20]:
# Load in C50test located ../C50test/
test_dir = 'C:/Users/esthe/Downloads/CMPE 255 Project/cmpe255-project/C50test/'
# '../C50test'

# get name of directories, authors (these will be the labels)
test_sub = [name for name in os.listdir(test_dir) if os.path.isdir(os.path.join(test_dir, name))]
test_lst = np.copy(train_sub)

if debug:
    print(test_dir)
    print(test_lst)

# setup the initial empty variables
test       = []
test_label = []

# load the input data from C50test directory and process it

auth_idx = 0

# go within the author directory to get list of the file names, this will be the training data
for i in train_sub:
    sub2_dir  = 'C:/Users/esthe/Downloads/CMPE 255 Project/cmpe255-project/C50test/' + i 
    test_sub2 = [name for name in os.listdir(sub2_dir) if os.path.isfile(os.path.join(sub2_dir, name))]

    #if debug:
    #    print(sub2_dir)
    #    print(train_sub2)
        
    # in each author file, save the text as its test data
    for j in test_sub2:
        sub3  = 'C:/Users/esthe/Downloads/CMPE 255 Project/cmpe255-project/C50test/' + i + '/' + j

        with open(sub3, 'r') as file:
            data = file.read()
            data_no_nw = data.replace('\n', '').replace('\r', '')
            test.append(data_no_nw)
        
        test_label.append(auth_idx)

    auth_idx = auth_idx + 1

if debug:
    print(np.shape(test))

In [21]:
# Transform Test Data into TF-IDF Features
X_test_new = tfidf_vectorizer.transform(test)  # Use the trained vectorizer on test data
y_test_new = np.array(test_label)

print(f"Transformed Test Feature Shape: {X_test_new.shape}")
# X_test_new = normalize_csr_matrix(X_test_new)

Transformed Test Feature Shape: (2500, 29216)


In [22]:
# Step 2: Predict on Test Data
y_pred_new = pipeline.predict(X_test_new)

# Step 3: Evaluate Model on Test Data
print("Classification Report on Test Data:")
print(classification_report(y_test_new, y_pred_new))

# Optional: Calculate Weighted F1 Score
f1_new = f1_score(y_test_new, y_pred_new, average='macro')
print(f"Weighted F1 Score on Test Data: {f1_new:.4f}")

Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.87      0.96      0.91        50
           1       0.83      0.48      0.61        50
           2       0.52      0.34      0.41        50
           3       0.37      0.22      0.28        50
           4       0.80      0.48      0.60        50
           5       0.61      0.92      0.74        50
           6       0.38      0.28      0.32        50
           7       0.68      0.54      0.60        50
           8       0.85      0.44      0.58        50
           9       0.47      0.52      0.50        50
          10       0.98      1.00      0.99        50
          11       0.73      0.92      0.81        50
          12       0.32      0.38      0.35        50
          13       0.16      0.10      0.12        50
          14       0.62      0.40      0.49        50
          15       0.89      1.00      0.94        50
          16       0.63      0.66      0.65  

SVD: 

    accuracy                           0.62      2500

Weighted F1 Score on Test Data: 


PCA:

    accuracy                           0.61      2500

Weighted F1 Score on Test Data: 0.6059

## Plots

In [None]:
# Define the range of n_components to explore
components_range = [0,500,750,1000,1200,1500]
explained_variances = []

# Fit PCA for each number of components and record the cumulative explained variance
for n in components_range:
    pca = PCA(n_components=n)
    pca.fit(X_train.toarray())  # Convert sparse matrix to dense if necessary
    explained_variances.append(np.sum(pca.explained_variance_))  # Cumulative explained variance

# Plot the explained variance
plt.figure(figsize=(10, 6))
plt.plot(components_range, explained_variances, marker='o', color='b', linestyle='-', markersize=3)
plt.title('Explained Variance vs. Number of PCA Components')
plt.xlabel('Number of Components (n)')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# List of test sizes to evaluate
test_sizes = [0.10,0.15,0.20,0.25,0.30]  # Test sizes from 10% to 50%
f1_scores = []

# Evaluate the model for each test size
for test_size in test_sizes:
    # Split the data
    X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=42
    )
    
    # Train the model
    pipeline.fit(X_train_split, y_train_split)
    
    # Predict and evaluate
    y_pred_split = pipeline.predict(X_test_split)
    f1 = f1_score(y_test_split, y_pred_split, average='weighted')
    f1_scores.append(f1)

    if debug:
        print(f"Test Size: {test_size:.2f}, F1 Score: {f1:.4f}")

# Plot Test Size vs. F1-Score
plt.plot(test_sizes, f1_scores, marker='o', linestyle='-', color='b')
plt.title("Test Size vs. Weighted F1-Score")
plt.xlabel("Test Size")
plt.ylabel("Weighted F1-Score")
plt.grid()
plt.show()


In [None]:
from wordcloud import WordCloud
all_text = " ".join(train)
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title("Most Frequent Words")
plt.show()


In [None]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=42)
X_embedded = tsne.fit_transform(X[:1000].toarray())  # Use a subset for efficiency
plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=label[:1000], cmap='viridis', s=5)
plt.colorbar()
plt.title("t-SNE Visualization of Feature Space")
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix

# highlight which classes are misclassified
cm = confusion_matrix(y_test_new, y_pred_new)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=False,, fmt='d' cmap='Blues', xticklabels=label_lst, yticklabels=label_lst)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


In [None]:
from sklearn.metrics import classification_report

plt.figure(figsize=(10, 8))
report = classification_report(y_test_new, y_pred_new, output_dict=True)
f1_scores = [report[str(i)]['f1-score'] for i in range(len(label_lst))]
plt.bar(label_lst, f1_scores)
plt.title("F1 Score per Class")
plt.xlabel("Class")
plt.ylabel("F1 Score")
plt.xticks(rotation=90)
plt.show()
