# Final Assignment Seden Canpolat 20070001044

In [1]:
import os
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
import jpype
from jpype import JClass, getDefaultJVMPath, startJVM

In [2]:
# Initialization of Zemberek library for Turkish text processing
zemberek_path = r'C:\Users\Seden\Downloads\zemberek-full.jar'

# A JVM can be started only once per Python process. Guarding the call keeps
# this cell re-runnable in a live kernel instead of raising on the second run.
if not jpype.isJVMStarted():
    # Fail early with a readable message; without the jar on the class path the
    # JClass lookups below fail with a much less obvious error.
    if not os.path.isfile(zemberek_path):
        raise FileNotFoundError(f'Zemberek jar not found: {zemberek_path}')
    startJVM(getDefaultJVMPath(), '-ea', f'-Djava.class.path={zemberek_path}')

# Java classes exposed through JPype, and the shared instances used by preprocessing.
TurkishMorphology = JClass('zemberek.morphology.TurkishMorphology')
TurkishTokenizer = JClass('zemberek.tokenization.TurkishTokenizer')
morphology = TurkishMorphology.createWithDefaults()
tokenizer = TurkishTokenizer.DEFAULT

In [3]:
# Paths (absolute, machine-specific locations; change project_root to relocate both)
project_root = r'C:\Users\Seden\Desktop\SE4475_Final_Project_Seden_Canpolat'
# Dataset root: each sub-folder is read as one class by the loading code
data_dir = project_root + r'\finalDataset\makaleler-yazarlar'
# Destination CSV for the per-class performance report
performance_results_report_path = project_root + r'\performance_results_report.csv'

In [4]:
# Preprocessing: remove punctuation, lowercase with Turkish casing rules, tokenize, and lemmatize

# str.lower() is locale-independent: it maps 'I' -> 'i' and 'İ' -> 'i' + U+0307
# (a combining dot that would stay inside the token). Turkish requires
# 'I' -> 'ı' and 'İ' -> 'i', so these two letters are mapped before lower().
_TURKISH_LOWER_MAP = str.maketrans({'I': 'ı', 'İ': 'i'})


def preprocess_text(content):
    """Normalize one document into a space-separated string of lemmas.

    Steps:
      1. Drop every character that is neither a word character nor whitespace.
      2. Lowercase using the Turkish dotted/dotless-i rules.
      3. Tokenize with the module-level Zemberek ``tokenizer``.
      4. Replace each token with the first lemma of its best morphological
         analysis; tokens with no analysis, or an unknown one, are kept as-is.

    Args:
        content: Raw document text (str).

    Returns:
        str: The processed tokens joined by single spaces ("" when there are none).
    """
    content = re.sub(r'[^\w\s]', '', content)
    content = content.translate(_TURKISH_LOWER_MAP).lower()

    tokens = list(tokenizer.tokenizeToStrings(content))
    stemmed_tokens = []
    for token in tokens:
        # Each token is analyzed on its own, so index 0 is that token's analysis.
        analysis = morphology.analyzeAndDisambiguate(token).bestAnalysis()
        if analysis and not analysis[0].isUnknown():
            stemmed_token = str(analysis[0].getLemmas()[0])
        else:
            # No usable analysis: fall back to the surface form.
            stemmed_token = str(token)
        stemmed_tokens.append(stemmed_token)
    return " ".join(stemmed_tokens)

# Load data
def _read_document(path):
    """Return the stripped text of one file, decoded with an explicit encoding.

    UTF-8 (with or without BOM) is tried first so the result does not depend on
    the machine's locale default. Files that are not valid UTF-8 are re-read as
    Windows-1254.
    NOTE(review): assumes the corpus is either UTF-8 or Windows-1254 (Turkish) -
    confirm against the dataset.
    """
    try:
        with open(path, 'r', encoding='utf-8-sig') as file:
            return file.read().strip()
    except UnicodeDecodeError:
        with open(path, 'r', encoding='cp1254') as file:
            return file.read().strip()


data = []    # preprocessed document texts
labels = []  # class label per document (name of the folder it was read from)

# os.listdir returns entries in arbitrary order; sorting makes the row order,
# and therefore the seeded cross-validation folds, reproducible across machines.
for folder_name in sorted(os.listdir(data_dir)):
    class_dir = os.path.join(data_dir, folder_name)
    if not os.path.isdir(class_dir):
        continue
    for file_name in sorted(os.listdir(class_dir)):
        file_path = os.path.join(class_dir, file_name)
        # Skip nested directories and other non-file entries instead of crashing in open().
        if not os.path.isfile(file_path):
            continue
        data.append(preprocess_text(_read_document(file_path)))
        labels.append(folder_name)

# Corpus table: one row per document, with its class label
df = pd.DataFrame(list(zip(data, labels)), columns=['text', 'label'])

# Integer class ids for the string labels
label_encoder = LabelEncoder().fit(df['label'])
df['encoded_label'] = label_encoder.transform(df['label'])

# TF-IDF over unigrams and bigrams, capped at 10000 features, sublinear term frequency
tfidf_settings = {
    'max_features': 10000,
    'ngram_range': (1, 2),
    'sublinear_tf': True,
}
tfidf_vectorizer = TfidfVectorizer(**tfidf_settings)

# NOTE(review): the vectorizer is fitted on every document before the
# cross-validation split, so vocabulary and IDF weights also see the test folds.
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])
X = tfidf_matrix.toarray()  # dense feature matrix, one row per document
y = df['encoded_label'].to_numpy()

# 5-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# All encoded class ids. Passing them as `labels=` makes the per-class metric
# arrays always have one entry per class, even when a class is absent from both
# y_test and y_pred in a fold; otherwise the arrays would be shorter than
# label_encoder.classes_ and building the DataFrame below would fail.
class_ids = np.arange(len(label_encoder.classes_))

all_metrics = []  # one per-class metrics DataFrame per fold

best_model = None
best_f1 = -np.inf

for train_idx, test_idx in skf.split(X, y):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Train Logistic Regression
    model = LogisticRegression(max_iter=5000, random_state=42)
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)

    # Per-class metrics for this fold (undefined ratios are reported as 0)
    precision = precision_score(y_test, y_pred, labels=class_ids, average=None, zero_division=0)
    recall = recall_score(y_test, y_pred, labels=class_ids, average=None, zero_division=0)
    f1 = f1_score(y_test, y_pred, labels=class_ids, average=None, zero_division=0)

    # Collect this fold's per-class table
    fold_metrics = pd.DataFrame({
        'Class': label_encoder.classes_,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1
    })
    all_metrics.append(fold_metrics)

    # Track the model from the fold with the highest weighted F1-Score.
    # zero_division=0 matches the calls above and yields the same value as the
    # default, only without the warning.
    weighted_f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    if weighted_f1 > best_f1:
        best_f1 = weighted_f1
        best_model = model

# Average each class's metrics over the folds
stacked_folds = pd.concat(all_metrics)
final_metrics = stacked_folds.groupby('Class').mean()

# Append an 'Average' row: the unweighted mean of the per-class rows
class_average = final_metrics.mean()
final_metrics.loc['Average'] = class_average

# Write the report, keeping the class names (index) as the first CSV column
final_metrics.to_csv(performance_results_report_path, index=True)
