# Assignment Seden Canpolat 20070001044

In [1]:
import os
import re
import pandas as pd
import numpy as np
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold 
from sklearn.preprocessing import LabelEncoder
import jpype
from jpype import JClass, getDefaultJVMPath, startJVM

In [2]:
# Initialization of Zemberek library for Turkish text processing

zemberek_path = r'C:\Users\Seden\Downloads\zemberek-full.jar'

startJVM(getDefaultJVMPath(), '-ea', f'-Djava.class.path={zemberek_path}')
TurkishMorphology = JClass('zemberek.morphology.TurkishMorphology')
TurkishTokenizer = JClass('zemberek.tokenization.TurkishTokenizer')
morphology = TurkishMorphology.createWithDefaults()
tokenizer = TurkishTokenizer.DEFAULT

In [3]:
data_dir = r'C:\Users\Seden\Desktop\SE4475_Assignment\Assignment-data\raw_texts'

class_labels = {"1": "Positive", "2": "Negative", "3": "Neutral"}

# Preprocessing tweets by removing punctuation and converting to lowercase, tokenizing, and stemming
data = []

for folder_name, label in class_labels.items():
    class_dir = os.path.join(data_dir, folder_name)
    for file_name in os.listdir(class_dir):
        file_path = os.path.join(class_dir, file_name)
        with open(file_path, "r") as file:
            content = file.read().strip()
        
        if content:
            text = re.sub(r'[^\w\s]', '', content).lower()
            
            tokens = list(tokenizer.tokenizeToStrings(text))
            stemmed_tokens = []

            for token in tokens:
                analysis = morphology.analyzeAndDisambiguate(token).bestAnalysis()
                if analysis and not analysis[0].isUnknown():
                    stemmed_token = analysis[0].getLemmas()[0]
                else:
                    stemmed_token = token
                stemmed_tokens.append(str(stemmed_token))
            
            data.append((stemmed_tokens, label))

# Creating the DataFrame for processed tokens and saving them to CSV
df = pd.DataFrame(data, columns=["processed_tokens", "class_label"])

output_path_tokenized = r"C:\Users\Seden\Desktop\SE4475_Assignment\all_tweets_tokenized.csv"
df.to_csv(output_path_tokenized, index=False, encoding="utf-8")


In [4]:
# Preparing the data with encoding labels
label_encoder = LabelEncoder()
df['numeric_class'] = label_encoder.fit_transform(df['class_label'])

# Computing the TF-IDF matrix
tfidf_vectorizer = TfidfVectorizer(
    lowercase=False,       # Already lowercased 
    tokenizer=lambda x:x,  # Using tokens directly 
    preprocessor=None,
    token_pattern=None
)

tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_tokens'])

feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Adding class labels and saving TF-IDF results
tfidf_df['Class'] = df['numeric_class']

output_path_tfidf = r"C:\Users\Seden\Desktop\SE4475_Assignment\tf_idf_values.csv"
tfidf_df.to_csv(output_path_tfidf, index=False, encoding="utf-8")

In [5]:
# Initialization of Stratified 10-Fold Cross-Validation 
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

X = tfidf_df.iloc[:, :-1].values  
y = tfidf_df['Class'].values  

k_values = range(1, 30)  

num_classes = len(np.unique(y)) 

In [6]:
# Evaluating k-NN performance with different k values
for k in k_values:
    fold_tp = np.zeros(num_classes)  # True Positives 
    fold_fp = np.zeros(num_classes)  # False Positives 
    fold_fn = np.zeros(num_classes)  # False Negatives 

    # Cross-validation
    for train_idx, test_idx in skf.split(X, y):
        train_X, test_X = X[train_idx], X[test_idx]
        train_y, test_y = y[train_idx], y[test_idx]

        # Computing cosine similarities for k-NN classification
        dot_products = np.dot(test_X, train_X.T)  
        test_norms = np.linalg.norm(test_X, axis=1, keepdims=True) 
        train_norms = np.linalg.norm(train_X, axis=1, keepdims=True)
        train_norms = train_norms.T
        cosine_similarities = dot_products / (test_norms * train_norms + 1e-8) 

        # Predicting labels using k-NN with majority voting
        predictions = []
        for i, similarity_vector in enumerate(cosine_similarities):
            sorted_indices = np.argsort(similarity_vector)[::-1][:k]  
            nearest_labels = train_y[sorted_indices]
            predicted_class = np.bincount(nearest_labels).argmax()
            predictions.append(predicted_class)

        # Calculating performance metrics for each class
        for clas in range(num_classes):
            fold_tp[clas] += sum((test_y == clas) & (np.array(predictions) == clas))  # True Positives
            fold_fp[clas] += sum((test_y != clas) & (np.array(predictions) == clas))  # False Positives
            fold_fn[clas] += sum((test_y == clas) & (np.array(predictions) != clas))  # False Negatives


In [7]:
best_macro_f1 = -np.inf

# Calculating precision, recall, and F1-score for each class
precision = fold_tp / (fold_tp + fold_fp + 1e-8)
recall = fold_tp / (fold_tp + fold_fn + 1e-8)
f1_score = 2 * (precision * recall) / (precision + recall + 1e-8)

# Calculating macro
macro_precision = np.mean(precision)
macro_recall = np.mean(recall)
macro_f1 = np.mean(f1_score)

# Calculating micro
total_tp = np.sum(fold_tp)
total_fp = np.sum(fold_fp)
total_fn = np.sum(fold_fn)

micro_precision = total_tp / (total_tp + total_fp + 1e-8)
micro_recall = total_tp / (total_tp + total_fn + 1e-8)
micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall + 1e-8)

# Tracking and storing the best k based on macro F1-score 
if macro_f1 > best_macro_f1:
    best_k = k
    best_macro_f1 = macro_f1
    best_report = {
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
        "macro_precision": macro_precision,
        "macro_recall": macro_recall,
        "macro_f1": macro_f1,
        "micro_precision": micro_precision,
        "micro_recall": micro_recall,
        "micro_f1": micro_f1,
        "tp": fold_tp,
        "fp": fold_fp,
        "fn": fold_fn,
    }

In [8]:
csv_output_path = r"C:\Users\Seden\Desktop\SE4475_Assignment\knn_results_report.csv"

print(f"\nBest results of k-NN obtained by: k = {best_k}, similarity metric: Cosine Similarity\n")

header = ["Metric", "Class 1", "Class 2", "Class 3", "MACRO Average", "MICRO Average"]
print("{:<25} {:<10} {:<10} {:<10} {:<15} {:<15}".format(*header))

# Opening the CSV file for writing
with open(csv_output_path, mode='w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Best results of k-NN obtained by:", f"k = {best_k}", "Similarity metric: Cosine Similarity"])
    writer.writerow(header)

    metrics = ["precision", "recall", "f1_score"]
    
    def get_average_key(metric, average_type):
        key = f"{average_type}_{metric}"
        return best_report.get(key, None)
    
    # Printing and saving rows for Precision, Recall, and F1-Score
    for metric in metrics:
        metric_key = metric
        macro_avg = get_average_key(metric_key, "macro")
        micro_avg = get_average_key(metric_key, "micro")

        # Calculating macro average and micro average
        if macro_avg is None and metric_key in best_report:
            macro_avg = np.mean([best_report[metric_key][0], best_report[metric_key][1], best_report[metric_key][2]])

        if micro_avg is None and metric_key in best_report:
            total_tp = sum(best_report['tp'])
            total_fn = sum(best_report['fn'])
            micro_avg = total_tp / (total_tp + total_fn)

        # Writing results to console and CSV
        row = [
            metric.capitalize(),
            f"{best_report[metric_key][0]:.4f}",  
            f"{best_report[metric_key][1]:.4f}",  
            f"{best_report[metric_key][2]:.4f}",  
            f"{macro_avg:.4f}" if macro_avg is not None else "N/A",  
            f"{micro_avg:.4f}" if micro_avg is not None else "N/A",  
        ]
        print("{:<25} {:<10} {:<10} {:<10} {:<15} {:<15}".format(*row))
        writer.writerow(row)
      

    # Writing TP, FP, FN counts
    counts = ["True Positives", "False Positives", "False Negatives"]
    count_keys = ["tp", "fp", "fn"]
    for i, count in enumerate(counts):
        row = [
            count,
            int(best_report[count_keys[i]][0]),  
            int(best_report[count_keys[i]][1]),  
            int(best_report[count_keys[i]][2]),  
            "-", 
            "-",  
        ]
        print("{:<25} {:<10} {:<10} {:<10} {:<15} {:<15}".format(*row))
        writer.writerow(row)


Best results of k-NN obtained by: k = 29, similarity metric: Cosine Similarity

Metric                    Class 1    Class 2    Class 3    MACRO Average   MICRO Average  
Precision                 0.5780     0.5007     0.5030     0.5272          0.5435         
Recall                    0.7366     0.3588     0.4484     0.5146          0.5435         
F1_score                  0.6478     0.4180     0.4741     0.5133          0.5435         
True Positives            948        343        339        -               -              
False Positives           692        342        335        -               -              
False Negatives           339        613        417        -               -              
