<a href="https://colab.research.google.com/github/ShaifaliKhulbe/Masters-Thesis-Authorship-Attributon/blob/main/English_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import nltk
import collections
from nltk import word_tokenize
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from collections import Counter
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, RobustScaler, QuantileTransformer, PowerTransformer

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
nltk.download('stopwords')

In [None]:
# Load the training chunks (df) and the held-out test chunks (test_df).
# Both CSV paths are relative to the current working directory. Later cells
# read the 'chunks' and 'Author_name' columns from each frame.
df = pd.read_csv('200_chunks_english.csv')
test_df = pd.read_csv('Test_200_chunks_english.csv')

def find_top_function_words(data, num):
    """Return the most frequent English stopwords found in ``data['chunks']``.

    Every chunk is lower-cased and tokenized with NLTK, token frequencies are
    tallied over all chunks, and the tokens that appear in NLTK's English
    stopword list are returned from most to least frequent.

    Args:
        data: DataFrame (or mapping) whose ``'chunks'`` entry iterates over
            text strings.
        num: Maximum number of stopwords to return.

    Returns:
        A list of at most ``num`` stopwords ordered by descending frequency.
        Words with equal counts keep the order in which they were first seen.
    """
    # Tally token frequencies across every chunk.
    word_freq = Counter()
    for sentence in data['chunks']:
        word_freq.update(nltk.word_tokenize(sentence.lower()))

    # Load the stopword list once and keep it as a set: the lookup below runs
    # for every distinct token, so re-reading the list per token is wasteful.
    english_stopwords = set(nltk.corpus.stopwords.words('english'))

    # most_common() yields (word, count) pairs sorted by descending count.
    ranked_stop_words = [
        word for word, _ in word_freq.most_common() if word in english_stopwords
    ]

    # Honour the caller's limit instead of a hard-coded 100.
    return ranked_stop_words[:num]
       
# Function-word vocabulary built from the training frame only: up to 100 of its
# most frequent English stopwords. create_feature_vector reads this
# module-level list, so it must be defined before any features are computed.
top_function_words = find_top_function_words(df, 100)


def create_feature_vector(sentence):
    """Build the stylometric feature vector for one text chunk.

    The vector holds the standardized punctuation-mark counts followed by the
    standardized counts of every word in the module-level
    ``top_function_words`` list.

    Args:
        sentence: The text of a single chunk.

    Returns:
        A 1-D NumPy array with one entry per punctuation mark followed by one
        entry per function word.
    """
    # Function-word counts come from the lower-cased NLTK tokens.
    token_tally = Counter(nltk.word_tokenize(sentence.lower()))
    function_word_counts = [token_tally.get(word, 0) for word in top_function_words]

    # Punctuation counts are substring counts on the raw, un-tokenized text.
    marks = ['.', ',', ';', ':', '?', '!', '«', '»', '-', '–', '—', '(', ')', '[', ']', '{', '}', '/', '\\', '...', "'", '"']
    mark_counts = [sentence.count(mark) for mark in marks]

    # Each group becomes a single column so StandardScaler treats the group's
    # entries as samples. The scalers are fitted on this chunk alone, so every
    # group is standardized across its own entries rather than across chunks.
    # NOTE(review): confirm per-chunk standardization is intended.
    mark_column = np.array(mark_counts).reshape(-1, 1)
    word_column = np.array(function_word_counts).reshape(-1, 1)

    scaled_marks = StandardScaler().fit_transform(mark_column)
    scaled_words = StandardScaler().fit_transform(word_column)

    # Punctuation block first, function-word block second, flattened to 1-D.
    return np.concatenate((scaled_marks, scaled_words), axis=0).flatten()

# Attach a 'features' column to the training and test frames, holding one
# feature vector per chunk.
for frame in (df, test_df):
    frame['features'] = frame['chunks'].apply(create_feature_vector)

In [None]:
# Sanity check: length of the feature vector stored at index label 150 of the
# test frame. It should equal the 22 punctuation marks plus the number of
# function words in top_function_words.
len(test_df['features'][150])

122

In [None]:
# Stack the per-chunk feature vectors into 2-D arrays (one row per chunk).
X_train = np.array(df['features'].tolist())
X_test = np.array(test_df['features'].tolist())

y_train = df['Author_name']
y_test = test_df['Author_name']

# Fit the label encoder on the training authors only and reuse it for the
# test labels. Fitting a second encoder on the test set can assign different
# integer codes to the same author whenever the two sets do not contain exactly
# the same authors, which would silently corrupt every metric below.
# transform() raises ValueError if the test set contains an unseen author.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# create and train the SVM model
svm = SVC(kernel='linear')
svm.fit(X_train, y_train_encoded)

# Make predictions on the test set and calculate evaluation metrics.
# 'weighted' averages the per-class scores weighted by class support.
y_pred = svm.predict(X_test)
accuracy = accuracy_score(y_test_encoded, y_pred)
f1 = f1_score(y_test_encoded, y_pred, average='weighted')
precision = precision_score(y_test_encoded, y_pred, average='weighted')
recall = recall_score(y_test_encoded, y_pred, average='weighted')

# Print evaluation metrics
print("SVM accuracy:", accuracy)
print("SVM F1-score:", f1)
print("SVM Precision:", precision)
print("SVM Recall:", recall)

SVM accuracy: 0.6184668989547039
SVM F1-score: 0.6219457370685341
SVM Precision: 0.6584946470541537
SVM Recall: 0.6184668989547039


In [None]:
# Per-class precision / recall / F1 for the SVM predictions. Classes are shown
# as the integer codes produced by the label encoder.
print(classification_report(y_test_encoded, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.78      0.74      1638
           1       0.75      0.47      0.58      1876
           2       0.47      0.52      0.49      1382
           3       0.86      0.70      0.77       820
           4       0.40      0.74      0.52       598

    accuracy                           0.62      6314
   macro avg       0.64      0.64      0.62      6314
weighted avg       0.66      0.62      0.62      6314



In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [None]:
# Random Forest
# A fixed random_state makes the fitted forest, and therefore the reported
# metrics, reproducible across kernel restarts.
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train, y_train_encoded)

# Predict once and reuse the predictions for both the report and the metrics.
random_forest_predictions = random_forest_model.predict(X_test)
random_forest_report = classification_report(y_test_encoded, random_forest_predictions)
print("Random Forest report:")
print(random_forest_report)

# Calculate support-weighted evaluation metrics on the test set
accuracy = accuracy_score(y_test_encoded, random_forest_predictions)
f1 = f1_score(y_test_encoded, random_forest_predictions, average='weighted')
precision = precision_score(y_test_encoded, random_forest_predictions, average='weighted')
recall = recall_score(y_test_encoded, random_forest_predictions, average='weighted')

# Print evaluation metrics
print("random_forest accuracy:", accuracy)
print("random_forest F1-score:", f1)
print("random_forest Precision:", precision)
print("random_forest Recall:", recall)

Random Forest report:
              precision    recall  f1-score   support

           0       0.61      0.64      0.62      1638
           1       0.72      0.35      0.47      1876
           2       0.32      0.30      0.31      1382
           3       0.59      0.52      0.56       820
           4       0.26      0.73      0.38       598

    accuracy                           0.47      6314
   macro avg       0.50      0.51      0.47      6314
weighted avg       0.54      0.47      0.48      6314

random_forest accuracy: 0.47038327526132406
random_forest F1-score: 0.4759087038316926
random_forest Precision: 0.5424677976061084
random_forest Recall: 0.47038327526132406


In [None]:
# Gradient Boosting
gradient_boosting_model = GradientBoostingClassifier()
gradient_boosting_model.fit(X_train, y_train_encoded)
gradient_boosting_predictions = gradient_boosting_model.predict(X_test)
gradient_boosting_report = classification_report(y_test_encoded, gradient_boosting_predictions)
print("Gradient Boosting report:")
print(gradient_boosting_report)

# Make predictions on the test set and calculate evaluation metrics
accuracy = accuracy_score(y_test_encoded, gradient_boosting_predictions)
f1 = f1_score(y_test_encoded, gradient_boosting_predictions, average='weighted')
precision = precision_score(y_test_encoded, gradient_boosting_predictions, average='weighted')
recall = recall_score(y_test_encoded, gradient_boosting_predictions, average='weighted')

# Print evaluation metrics
print("random_forest accuracy:", accuracy)
print("random_forest F1-score:", f1)
print("random_forest Precision:", precision)
print("random_forest Recall:", recall)

Gradient Boosting report:
              precision    recall  f1-score   support

           0       0.65      0.72      0.68      1638
           1       0.74      0.33      0.46      1876
           2       0.39      0.40      0.40      1382
           3       0.72      0.65      0.68       820
           4       0.31      0.79      0.45       598

    accuracy                           0.53      6314
   macro avg       0.56      0.58      0.53      6314
weighted avg       0.60      0.53      0.53      6314

random_forest accuracy: 0.5315172632245803
random_forest F1-score: 0.5309830757268973
random_forest Precision: 0.5962204413630282
random_forest Recall: 0.5315172632245803


In [None]:
# k-Nearest Neighbors (k-NN)
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train_encoded)
knn_predictions = knn_model.predict(X_test)
knn_report = classification_report(y_test_encoded, knn_predictions)
print("k-Nearest Neighbors (k-NN) report:")
print(knn_report)


# Make predictions on the test set and calculate evaluation metrics
accuracy = accuracy_score(y_test_encoded, knn_predictions)
f1 = f1_score(y_test_encoded, knn_predictions, average='weighted')
precision = precision_score(y_test_encoded, knn_predictions, average='weighted')
recall = recall_score(y_test_encoded, knn_predictions, average='weighted')

# Print evaluation metrics
print("random_forest accuracy:", accuracy)
print("random_forest F1-score:", f1)
print("random_forest Precision:", precision)
print("random_forest Recall:", recall)

k-Nearest Neighbors (k-NN) report:
              precision    recall  f1-score   support

           0       0.68      0.59      0.63      1638
           1       0.52      0.31      0.39      1876
           2       0.39      0.35      0.37      1382
           3       0.45      0.76      0.57       820
           4       0.34      0.67      0.45       598

    accuracy                           0.48      6314
   macro avg       0.48      0.54      0.48      6314
weighted avg       0.51      0.48      0.48      6314

random_forest accuracy: 0.48400380107697183
random_forest F1-score: 0.47722683889269
random_forest Precision: 0.5095014501293667
random_forest Recall: 0.48400380107697183


In [None]:
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

# MLP (Multi-Layer Perceptron)
# A fixed random_state makes the stochastic training, and therefore the
# reported metrics, reproducible across kernel restarts.
mlp_model = MLPClassifier(random_state=42)
mlp_model.fit(X_train, y_train_encoded)
mlp_predictions = mlp_model.predict(X_test)
mlp_report = classification_report(y_test_encoded, mlp_predictions)
print("MLP (Multi-Layer Perceptron) report:")
print(mlp_report)

# Calculate support-weighted evaluation metrics on the test set
accuracy = accuracy_score(y_test_encoded, mlp_predictions)
f1 = f1_score(y_test_encoded, mlp_predictions, average='weighted')
precision = precision_score(y_test_encoded, mlp_predictions, average='weighted')
recall = recall_score(y_test_encoded, mlp_predictions, average='weighted')

# Print evaluation metrics
print("MLP accuracy:", accuracy)
print("MLP F1-score:", f1)
print("MLP Precision:", precision)
print("MLP Recall:", recall)

MLP (Multi-Layer Perceptron) report:
              precision    recall  f1-score   support

           0       0.70      0.79      0.74      1638
           1       0.72      0.46      0.56      1876
           2       0.48      0.45      0.46      1382
           3       0.77      0.72      0.74       820
           4       0.36      0.73      0.49       598

    accuracy                           0.60      6314
   macro avg       0.61      0.63      0.60      6314
weighted avg       0.63      0.60      0.60      6314

MLP accuracy: 0.6016788089958822
MLP F1-score: 0.6027073808559634
MLP Precision: 0.633569742690812
MLP Recall: 0.6016788089958822




In [None]:
Random Forest:

random_forest accuracy: 0.47038327526132406
random_forest F1-score: 0.4759087038316926
random_forest Precision: 0.5424677976061084
random_forest Recall: 0.47038327526132406


Gradient Boosting:


gradient_boosting accuracy: 0.5315172632245803
gradient_boosting F1-score: 0.5309830757268973
gradient_boosting Precision: 0.5962204413630282
gradient_boosting Recall: 0.5315172632245803



k-Nearest Neighbors (k-NN):


knn accuracy: 0.48400380107697183
knn F1-score: 0.47722683889269
knn Precision: 0.5095014501293667
knn Recall: 0.48400380107697183


MLP (Multi-Layer Perceptron):


MLP accuracy: 0.6016788089958822
MLP F1-score: 0.6027073808559634
MLP Precision: 0.633569742690812
MLP Recall: 0.6016788089958822

SVM: 

SVM accuracy: 0.6184668989547039
SVM F1-score: 0.6219457370685341
SVM Precision: 0.6584946470541537
SVM Recall: 0.6184668989547039


In [None]:
SVM (best results)

SVM accuracy: 0.6184668989547039
SVM F1-score: 0.6219457370685341
SVM Precision: 0.6584946470541537
SVM Recall: 0.6184668989547039


              precision    recall  f1-score   support

           0       0.70      0.78      0.74      1638
           1       0.75      0.47      0.58      1876
           2       0.47      0.52      0.49      1382
           3       0.86      0.70      0.77       820
           4       0.40      0.74      0.52       598

    accuracy                           0.62      6314
   macro avg       0.64      0.64      0.62      6314
weighted avg       0.66      0.62      0.62      6314



