<a href="https://colab.research.google.com/github/ShaifaliKhulbe/Masters-Thesis-Authorship-Attributon/blob/main/French_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import nltk
import collections
from nltk import word_tokenize
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from collections import Counter
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, RobustScaler
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')

In [None]:
# Read the training chunks and the separately stored test chunks.
# Both paths are relative, so the files must sit in the working directory.
TRAIN_CSV_PATH = '200_chunks_French.csv'
TEST_CSV_PATH = 'Test_200_chunks_French.csv'

df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)

def find_top_function_words(data, num):
    """Return the most frequent French stopwords found in ``data['chunks']``.

    Every chunk is lower-cased and tokenized with NLTK's French tokenizer,
    token frequencies are accumulated over all chunks, and the tokens that
    belong to NLTK's French stopword list are returned, most frequent first.

    Parameters
    ----------
    data : mapping or DataFrame
        Object whose ``'chunks'`` entry iterates over the text chunks (str).
    num : int
        Maximum number of stopwords to return (expected to be non-negative).

    Returns
    -------
    list of str
        At most ``num`` stopwords in descending order of corpus frequency.
        The list is shorter when fewer distinct stopwords occur in the data.
    """
    # Accumulate token frequencies chunk by chunk instead of first
    # materializing one list holding every token of the corpus.
    word_freq = Counter()
    for sentence in data['chunks']:
        word_freq.update(nltk.word_tokenize(sentence.lower(), language='french'))

    # Load the stopword list once and keep it as a set: membership tests are
    # then O(1) and the corpus reader is not re-queried for every candidate.
    french_stopwords = set(nltk.corpus.stopwords.words('french'))

    # most_common() sorts by descending count; ties keep first-seen order.
    ranked_stopwords = [
        word for word, _ in word_freq.most_common() if word in french_stopwords
    ]

    # Honour the caller's limit rather than a hard-coded 100.
    return ranked_stopwords[:num]
       
# Build the function-word vocabulary from the training frame `df` (100 words
# requested). create_feature_vector reads this module-level list, so this
# statement must run before any feature vectors are computed.
top_function_words = find_top_function_words(df, 100)


def create_feature_vector(sentence):
    """Turn one text chunk into a standardized stylometric feature vector.

    The vector has two parts, in this order:

    1. one count per punctuation mark, obtained with ``str.count`` on the raw
       chunk (so ``'...'`` is counted as a substring, and each of its dots is
       also counted under ``'.'``);
    2. one count per entry of the module-level ``top_function_words`` list,
       taken from the lower-cased, French-tokenized chunk.

    Each part is standardized separately with a fresh ``StandardScaler``
    fitted on that part alone. Because the counts are reshaped into a single
    column, the scaling is computed across the counts of this one chunk, not
    across different chunks.

    Parameters
    ----------
    sentence : str
        The text chunk to featurize.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``22 + len(top_function_words)``.
    """
    tracked_marks = ['.', ',', ';', ':', '?', '!', '«', '»', '-', '–', '—', '(', ')', '[', ']', '{', '}', '/', '\\', '...', "'", '"']

    def standardize(counts):
        # Each count becomes its own row of a one-column matrix, so the
        # scaler standardizes the counts of this chunk against each other.
        column = np.array(counts).reshape(-1, 1)
        return StandardScaler().fit_transform(column)

    # Frequency of every lower-cased token in the chunk.
    token_freq = Counter(nltk.word_tokenize(sentence.lower(), language='french'))

    function_word_counts = [token_freq.get(word, 0) for word in top_function_words]
    mark_counts = [sentence.count(mark) for mark in tracked_marks]

    scaled_marks = standardize(mark_counts)
    scaled_function_words = standardize(function_word_counts)

    # Punctuation features first, function-word features second, flattened to 1-D.
    stacked = np.concatenate((scaled_marks, scaled_function_words), axis=0)
    return stacked.flatten()

# Featurize every chunk of both frames and store the result in a new
# 'features' column (one 1-D array per row).
for frame in (df, test_df):
    frame['features'] = frame['chunks'].apply(create_feature_vector)

In [None]:
# Sanity check: length of the feature vector stored under index label 150 of
# the test frame. It should equal the 22 punctuation counts plus one count
# per selected function word.
len(test_df['features'][150])

122

In [None]:
# Train on the chunks of the training file and evaluate on the separate test
# file; no random train/test split is performed here.

# Stack the per-row feature arrays into 2-D matrices of shape (n_chunks, n_features).
X_train = np.array(df['features'].tolist())
X_test = np.array(test_df['features'].tolist())

y_train = df['Author_name']
y_test = test_df['Author_name']

# Fit the encoder on the training authors only and reuse the same mapping for
# the test labels. Fitting a second encoder on the test set would assign ids
# independently, so they would only match the training ids when both sets
# contain exactly the same authors. With transform(), an author that never
# appeared in training raises a ValueError instead of silently skewing metrics.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# create and train the SVM model
svm = SVC(kernel='linear')
svm.fit(X_train, y_train_encoded)

# Make predictions on the test set and calculate evaluation metrics.
# 'weighted' averages the per-class scores by class support.
y_pred = svm.predict(X_test)
accuracy = accuracy_score(y_test_encoded, y_pred)
f1 = f1_score(y_test_encoded, y_pred, average='weighted')
precision = precision_score(y_test_encoded, y_pred, average='weighted')
recall = recall_score(y_test_encoded, y_pred, average='weighted')

# Print evaluation metrics
print("SVM accuracy:", accuracy)
print("SVM F1-score:", f1)
print("SVM Precision:", precision)
print("SVM Recall:", recall)

SVM accuracy: 0.8948756593820648
SVM F1-score: 0.8922813293495424
SVM Precision: 0.8933991646106326
SVM Recall: 0.8948756593820648


In [None]:
# Per-class precision, recall, F1 and support; the row labels are the integer
# ids produced by the label encoder, not the author names.
per_class_report = classification_report(y_test_encoded, y_pred)
print(per_class_report)

              precision    recall  f1-score   support

           0       0.89      0.88      0.89       567
           1       0.62      0.79      0.70       237
           2       0.94      0.88      0.91       736
           3       0.32      0.19      0.24        91
           4       0.97      1.00      0.98      1023

    accuracy                           0.89      2654
   macro avg       0.75      0.75      0.74      2654
weighted avg       0.89      0.89      0.89      2654



In [None]:
SVM accuracy: 0.8948756593820648
SVM F1-score: 0.8922813293495424
SVM Precision: 0.8933991646106326
SVM Recall: 0.8948756593820648


              precision    recall  f1-score   support

           0       0.89      0.88      0.89       567
           1       0.62      0.79      0.70       237
           2       0.94      0.88      0.91       736
           3       0.32      0.19      0.24        91
           4       0.97      1.00      0.98      1023

    accuracy                           0.89      2654
   macro avg       0.75      0.75      0.74      2654
weighted avg       0.89      0.89      0.89      2654